shithub: libvpx

--- a/examples.mk

+++ b/examples.mk

@@ -9,6 +9,7 @@

##

 LIBYUV_SRCS +=  third_party/libyuv/include/libyuv/basic_types.h  \

+                third_party/libyuv/include/libyuv/compare_row.h \

                 third_party/libyuv/include/libyuv/convert.h \

                 third_party/libyuv/include/libyuv/convert_argb.h \

                 third_party/libyuv/include/libyuv/convert_from.h \

@@ -18,6 +19,7 @@

                 third_party/libyuv/include/libyuv/row.h  \

                 third_party/libyuv/include/libyuv/scale.h  \

                 third_party/libyuv/include/libyuv/scale_row.h  \

+                third_party/libyuv/include/libyuv.h \

                 third_party/libyuv/source/cpu_id.cc \

                 third_party/libyuv/source/planar_functions.cc \

                 third_party/libyuv/source/row_any.cc \

--- a/third_party/libyuv/README.libvpx

+++ b/third_party/libyuv/README.libvpx

@@ -1,6 +1,6 @@

 Name: libyuv

 URL: http://code.google.com/p/libyuv/

-Version: 1456

+Version: 1b3e4aee47

 License: BSD

 License File: LICENSE

@@ -13,3 +13,6 @@

 in order to encode multiple resolution bit streams.

 Local Modifications:

+Removed all unnecessary files, including all gyp files, make files, and the

+docs, unit_tests and tools directories.

--- /dev/null

+++ b/third_party/libyuv/include/libyuv.h

@@ -1,0 +1,32 @@

+/*

+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS. All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#ifndef INCLUDE_LIBYUV_H_  // NOLINT

+#define INCLUDE_LIBYUV_H_

+#include "libyuv/basic_types.h"

+#include "libyuv/compare.h"

+#include "libyuv/convert.h"

+#include "libyuv/convert_argb.h"

+#include "libyuv/convert_from.h"

+#include "libyuv/convert_from_argb.h"

+#include "libyuv/cpu_id.h"

+#include "libyuv/mjpeg_decoder.h"

+#include "libyuv/planar_functions.h"

+#include "libyuv/rotate.h"

+#include "libyuv/rotate_argb.h"

+#include "libyuv/row.h"

+#include "libyuv/scale.h"

+#include "libyuv/scale_argb.h"

+#include "libyuv/scale_row.h"

+#include "libyuv/version.h"

+#include "libyuv/video_common.h"

+#endif  // INCLUDE_LIBYUV_H_  NOLINT

--- /dev/null

+++ b/third_party/libyuv/include/libyuv/compare_row.h

@@ -1,0 +1,84 @@

+/*

+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS. All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#ifndef INCLUDE_LIBYUV_COMPARE_ROW_H_  // NOLINT

+#define INCLUDE_LIBYUV_COMPARE_ROW_H_

+#include "libyuv/basic_types.h"

+#ifdef __cplusplus

+namespace libyuv {

+extern "C" {

+#endif

+#if defined(__pnacl__) || defined(__CLR_VER) || \

+    (defined(__i386__) && !defined(__SSE2__))

+#define LIBYUV_DISABLE_X86

+#endif

+// MemorySanitizer does not support assembly code yet. http://crbug.com/344505

+#if defined(__has_feature)

+#if __has_feature(memory_sanitizer)

+#define LIBYUV_DISABLE_X86

+#endif

+#endif

+// Visual C 2012 required for AVX2.

+#if defined(_M_IX86) && !defined(__clang__) && \

+    defined(_MSC_VER) && _MSC_VER >= 1700

+#define VISUALC_HAS_AVX2 1

+#endif  // VisualStudio >= 2012

+// clang >= 3.4.0 required for AVX2.

+#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))

+#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4))

+#define CLANG_HAS_AVX2 1

+#endif  // clang >= 3.4

+#endif  // __clang__

+#if !defined(LIBYUV_DISABLE_X86) && \

+    defined(_M_IX86) && (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))

+#define HAS_HASHDJB2_AVX2

+#endif

+// The following are available for Visual C and GCC:

+#if !defined(LIBYUV_DISABLE_X86) && \

+    (defined(__x86_64__) || (defined(__i386__) || defined(_M_IX86)))

+#define HAS_HASHDJB2_SSE41

+#define HAS_SUMSQUAREERROR_SSE2

+#endif

+// The following are available for Visual C and clangcl 32 bit:

+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \

+    (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))

+#define HAS_HASHDJB2_AVX2

+#define HAS_SUMSQUAREERROR_AVX2

+#endif

+// The following are available for Neon:

+#if !defined(LIBYUV_DISABLE_NEON) && \

+    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))

+#define HAS_SUMSQUAREERROR_NEON

+#endif

+uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);

+uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);

+uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count);

+uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);

+uint32 HashDjb2_C(const uint8* src, int count, uint32 seed);

+uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed);

+uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed);

+#ifdef __cplusplus

+}  // extern "C"

+}  // namespace libyuv

+#endif

+#endif  // INCLUDE_LIBYUV_COMPARE_ROW_H_  NOLINT

--- a/third_party/libyuv/include/libyuv/convert.h

+++ b/third_party/libyuv/include/libyuv/convert.h

@@ -12,10 +12,8 @@

 #define INCLUDE_LIBYUV_CONVERT_H_

 #include "libyuv/basic_types.h"

-// TODO(fbarchard): Remove the following headers includes.

-#include "libyuv/convert_from.h"

-#include "libyuv/planar_functions.h"

-#include "libyuv/rotate.h"

+#include "libyuv/rotate.h"  // For enum RotationMode.

 #ifdef __cplusplus

 namespace libyuv {

--- a/third_party/libyuv/include/libyuv/convert_argb.h

+++ b/third_party/libyuv/include/libyuv/convert_argb.h

@@ -12,11 +12,9 @@

 #define INCLUDE_LIBYUV_CONVERT_ARGB_H_

 #include "libyuv/basic_types.h"

-// TODO(fbarchard): Remove the following headers includes

-#include "libyuv/convert_from.h"

-#include "libyuv/planar_functions.h"

-#include "libyuv/rotate.h"

+#include "libyuv/rotate.h"  // For enum RotationMode.

 // TODO(fbarchard): This set of functions should exactly match convert.h

 // TODO(fbarchard): Add tests. Create random content of right size and convert

 // with C vs Opt and or to I420 and compare.

@@ -60,6 +58,22 @@

                uint8* dst_argb, int dst_stride_argb,

                int width, int height);

+// Convert J444 to ARGB.

+LIBYUV_API

+int J444ToARGB(const uint8* src_y, int src_stride_y,

+               const uint8* src_u, int src_stride_u,

+               const uint8* src_v, int src_stride_v,

+               uint8* dst_argb, int dst_stride_argb,

+               int width, int height);

+// Convert I444 to ABGR.

+LIBYUV_API

+int I444ToABGR(const uint8* src_y, int src_stride_y,

+               const uint8* src_u, int src_stride_u,

+               const uint8* src_v, int src_stride_v,

+               uint8* dst_abgr, int dst_stride_abgr,

+               int width, int height);

 // Convert I411 to ARGB.

 LIBYUV_API

 int I411ToARGB(const uint8* src_y, int src_stride_y,

@@ -68,6 +82,24 @@

                uint8* dst_argb, int dst_stride_argb,

                int width, int height);

+// Convert I420 with Alpha to preattenuated ARGB.

+LIBYUV_API

+int I420AlphaToARGB(const uint8* src_y, int src_stride_y,

+                    const uint8* src_u, int src_stride_u,

+                    const uint8* src_v, int src_stride_v,

+                    const uint8* src_a, int src_stride_a,

+                    uint8* dst_argb, int dst_stride_argb,

+                    int width, int height, int attenuate);

+// Convert I420 with Alpha to preattenuated ABGR.

+LIBYUV_API

+int I420AlphaToABGR(const uint8* src_y, int src_stride_y,

+                    const uint8* src_u, int src_stride_u,

+                    const uint8* src_v, int src_stride_v,

+                    const uint8* src_a, int src_stride_a,

+                    uint8* dst_abgr, int dst_stride_abgr,

+                    int width, int height, int attenuate);

 // Convert I400 (grey) to ARGB.  Reverse of ARGBToI400.

 LIBYUV_API

 int I400ToARGB(const uint8* src_y, int src_stride_y,

@@ -129,6 +161,54 @@

                const uint8* src_u, int src_stride_u,

                const uint8* src_v, int src_stride_v,

                uint8* dst_argb, int dst_stride_argb,

+               int width, int height);

+// Convert J420 to ABGR.

+LIBYUV_API

+int J420ToABGR(const uint8* src_y, int src_stride_y,

+               const uint8* src_u, int src_stride_u,

+               const uint8* src_v, int src_stride_v,

+               uint8* dst_abgr, int dst_stride_abgr,

+               int width, int height);

+// Convert J422 to ABGR.

+LIBYUV_API

+int J422ToABGR(const uint8* src_y, int src_stride_y,

+               const uint8* src_u, int src_stride_u,

+               const uint8* src_v, int src_stride_v,

+               uint8* dst_abgr, int dst_stride_abgr,

+               int width, int height);

+// Convert H420 to ARGB.

+LIBYUV_API

+int H420ToARGB(const uint8* src_y, int src_stride_y,

+               const uint8* src_u, int src_stride_u,

+               const uint8* src_v, int src_stride_v,

+               uint8* dst_argb, int dst_stride_argb,

+               int width, int height);

+// Convert H422 to ARGB.

+LIBYUV_API

+int H422ToARGB(const uint8* src_y, int src_stride_y,

+               const uint8* src_u, int src_stride_u,

+               const uint8* src_v, int src_stride_v,

+               uint8* dst_argb, int dst_stride_argb,

+               int width, int height);

+// Convert H420 to ABGR.

+LIBYUV_API

+int H420ToABGR(const uint8* src_y, int src_stride_y,

+               const uint8* src_u, int src_stride_u,

+               const uint8* src_v, int src_stride_v,

+               uint8* dst_abgr, int dst_stride_abgr,

+               int width, int height);

+// Convert H422 to ABGR.

+LIBYUV_API

+int H422ToABGR(const uint8* src_y, int src_stride_y,

+               const uint8* src_u, int src_stride_u,

+               const uint8* src_v, int src_stride_v,

+               uint8* dst_abgr, int dst_stride_abgr,

                int width, int height);

 // BGRA little endian (argb in memory) to ARGB.

--- a/third_party/libyuv/include/libyuv/convert_from.h

+++ b/third_party/libyuv/include/libyuv/convert_from.h

@@ -56,8 +56,6 @@

              uint8* dst_y, int dst_stride_y,

              int width, int height);

-// TODO(fbarchard): I420ToM420

 LIBYUV_API

 int I420ToNV12(const uint8* src_y, int src_stride_y,

                const uint8* src_u, int src_stride_u,

--- a/third_party/libyuv/include/libyuv/cpu_id.h

+++ b/third_party/libyuv/include/libyuv/cpu_id.h

@@ -18,9 +18,8 @@

 extern "C" {

 #endif

-// TODO(fbarchard): Consider overlapping bits for different architectures.

 // Internal flag to indicate cpuid requires initialization.

-#define kCpuInit 0x1

+static const int kCpuInitialized = 0x1;

 // These flags are only valid on ARM processors.

 static const int kCpuHasARM = 0x2;

@@ -37,12 +36,12 @@

 static const int kCpuHasAVX2 = 0x400;

 static const int kCpuHasERMS = 0x800;

 static const int kCpuHasFMA3 = 0x1000;

+static const int kCpuHasAVX3 = 0x2000;

 // 0x2000, 0x4000, 0x8000 reserved for future X86 flags.

 // These flags are only valid on MIPS processors.

 static const int kCpuHasMIPS = 0x10000;

-static const int kCpuHasMIPS_DSP = 0x20000;

-static const int kCpuHasMIPS_DSPR2 = 0x40000;

+static const int kCpuHasDSPR2 = 0x20000;

 // Internal function used to auto-init.

 LIBYUV_API

@@ -57,13 +56,13 @@

 // returns non-zero if instruction set is detected

 static __inline int TestCpuFlag(int test_flag) {

   LIBYUV_API extern int cpu_info_;

-  return (cpu_info_ == kCpuInit ? InitCpuFlags() : cpu_info_) & test_flag;

+  return (!cpu_info_ ? InitCpuFlags() : cpu_info_) & test_flag;

 // For testing, allow CPU flags to be disabled.

 // ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.

 // MaskCpuFlags(-1) to enable all cpu specific optimizations.

-// MaskCpuFlags(0) to disable all cpu specific optimizations.

+// MaskCpuFlags(1) to disable all cpu specific optimizations.

 LIBYUV_API

 void MaskCpuFlags(int enable_flags);

--- a/third_party/libyuv/include/libyuv/planar_functions.h

+++ b/third_party/libyuv/include/libyuv/planar_functions.h

@@ -145,13 +145,6 @@

                  uint8* dst_rgb565, int dst_stride_rgb565,

                  int width, int height);

-// Convert NV21 to RGB565.

-LIBYUV_API

-int NV21ToRGB565(const uint8* src_y, int src_stride_y,

-                 const uint8* src_uv, int src_stride_uv,

-                 uint8* dst_rgb565, int dst_stride_rgb565,

-                 int width, int height);

 // I422ToARGB is in convert_argb.h

 // Convert I422 to BGRA.

 LIBYUV_API

@@ -177,6 +170,14 @@

                uint8* dst_rgba, int dst_stride_rgba,

                int width, int height);

+// Alias

+#define RGB24ToRAW RAWToRGB24

+LIBYUV_API

+int RAWToRGB24(const uint8* src_raw, int src_stride_raw,

+               uint8* dst_rgb24, int dst_stride_rgb24,

+               int width, int height);

 // Draw a rectangle into I420.

 LIBYUV_API

 int I420Rect(uint8* dst_y, int dst_stride_y,

@@ -281,14 +282,20 @@

              uint8* dst_argb, int dst_stride_argb,

              int width, int height);

-// Copy ARGB to ARGB.

+// Copy Alpha channel of ARGB to alpha of ARGB.

 LIBYUV_API

 int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,

                   uint8* dst_argb, int dst_stride_argb,

                   int width, int height);

-// Copy ARGB to ARGB.

+// Extract the alpha channel from ARGB.

 LIBYUV_API

+int ARGBExtractAlpha(const uint8* src_argb, int src_stride_argb,

+                     uint8* dst_a, int dst_stride_a,

+                     int width, int height);

+// Copy Y channel to Alpha of ARGB.

+LIBYUV_API

 int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,

                      uint8* dst_argb, int dst_stride_argb,

                      int width, int height);

@@ -301,6 +308,7 @@

 ARGBBlendRow GetARGBBlend();

 // Alpha Blend ARGB images and store to destination.

+// Source is pre-multiplied by alpha using ARGBAttenuate.

 // Alpha of destination is set to 255.

 LIBYUV_API

 int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,

@@ -308,6 +316,31 @@

               uint8* dst_argb, int dst_stride_argb,

               int width, int height);

+// Alpha Blend plane and store to destination.

+// Source is not pre-multiplied by alpha.

+LIBYUV_API

+int BlendPlane(const uint8* src_y0, int src_stride_y0,

+               const uint8* src_y1, int src_stride_y1,

+               const uint8* alpha, int alpha_stride,

+               uint8* dst_y, int dst_stride_y,

+               int width, int height);

+// Alpha Blend YUV images and store to destination.

+// Source is not pre-multiplied by alpha.

+// Alpha is full width x height and subsampled to half size to apply to UV.

+LIBYUV_API

+int I420Blend(const uint8* src_y0, int src_stride_y0,

+              const uint8* src_u0, int src_stride_u0,

+              const uint8* src_v0, int src_stride_v0,

+              const uint8* src_y1, int src_stride_y1,

+              const uint8* src_u1, int src_stride_u1,

+              const uint8* src_v1, int src_stride_v1,

+              const uint8* alpha, int alpha_stride,

+              uint8* dst_y, int dst_stride_y,

+              uint8* dst_u, int dst_stride_u,

+              uint8* dst_v, int dst_stride_v,

+              int width, int height);

 // Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255.

 LIBYUV_API

 int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,

@@ -357,12 +390,6 @@

                     uint8* dst_argb, int dst_stride_argb,

                     int width, int height);

-// Convert MJPG to ARGB.

-LIBYUV_API

-int MJPGToARGB(const uint8* sample, size_t sample_size,

-               uint8* argb, int argb_stride,

-               int w, int h, int dw, int dh);

 // Internal function - do not call directly.

 // Computes table of cumulative sum for image where the value is the sum

 // of all values above and to the left of the entry. Used by ARGBBlur.

@@ -389,21 +416,48 @@

               uint8* dst_argb, int dst_stride_argb,

               int width, int height, uint32 value);

-// Interpolate between two ARGB images using specified amount of interpolation

+// Interpolate between two images using specified amount of interpolation

 // (0 to 255) and store to destination.

-// 'interpolation' is specified as 8 bit fraction where 0 means 100% src_argb0

-// and 255 means 1% src_argb0 and 99% src_argb1.

-// Internally uses ARGBScale bilinear filtering.

-// Caveat: This function will write up to 16 bytes beyond the end of dst_argb.

+// 'interpolation' is specified as 8 bit fraction where 0 means 100% src0

+// and 255 means 1% src0 and 99% src1.

 LIBYUV_API

+int InterpolatePlane(const uint8* src0, int src_stride0,

+                     const uint8* src1, int src_stride1,

+                     uint8* dst, int dst_stride,

+                     int width, int height, int interpolation);

+// Interpolate between two ARGB images using specified amount of interpolation

+// Internally calls InterpolatePlane with width * 4 (bpp).

+LIBYUV_API

 int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,

                     const uint8* src_argb1, int src_stride_argb1,

                     uint8* dst_argb, int dst_stride_argb,

                     int width, int height, int interpolation);

+// Interpolate between two YUV images using specified amount of interpolation

+// Internally calls InterpolatePlane on each plane where the U and V planes

+// are half width and half height.

+LIBYUV_API

+int I420Interpolate(const uint8* src0_y, int src0_stride_y,

+                    const uint8* src0_u, int src0_stride_u,

+                    const uint8* src0_v, int src0_stride_v,

+                    const uint8* src1_y, int src1_stride_y,

+                    const uint8* src1_u, int src1_stride_u,

+                    const uint8* src1_v, int src1_stride_v,

+                    uint8* dst_y, int dst_stride_y,

+                    uint8* dst_u, int dst_stride_u,

+                    uint8* dst_v, int dst_stride_v,

+                    int width, int height, int interpolation);

 #if defined(__pnacl__) || defined(__CLR_VER) || \

     (defined(__i386__) && !defined(__SSE2__))

 #define LIBYUV_DISABLE_X86

+#endif

+// MemorySanitizer does not support assembly code yet. http://crbug.com/344505

+#if defined(__has_feature)

+#if __has_feature(memory_sanitizer)

+#define LIBYUV_DISABLE_X86

+#endif

 #endif

 // The following are available on all x86 platforms:

 #if !defined(LIBYUV_DISABLE_X86) && \

--- a/third_party/libyuv/include/libyuv/rotate_row.h

+++ b/third_party/libyuv/include/libyuv/rotate_row.h

@@ -22,53 +22,24 @@

     (defined(__i386__) && !defined(__SSE2__))

 #define LIBYUV_DISABLE_X86

 #endif

-// Visual C 2012 required for AVX2.

-#if defined(_M_IX86) && !defined(__clang__) && \

-    defined(_MSC_VER) && _MSC_VER >= 1700

-#define VISUALC_HAS_AVX2 1

-#endif  // VisualStudio >= 2012

-// TODO(fbarchard): switch to standard form of inline; fails on clangcl.

-#if !defined(LIBYUV_DISABLE_X86) && \

-    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))

-#if defined(__APPLE__) && defined(__i386__)

-#define DECLARE_FUNCTION(name)                                                 \

-    ".text                                     \n"                             \

-    ".private_extern _" #name "                \n"                             \

-    ".align 4,0x90                             \n"                             \

-"_" #name ":                                   \n"

-#elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__)

-#define DECLARE_FUNCTION(name)                                                 \

-    ".text                                     \n"                             \

-    ".align 4,0x90                             \n"                             \

-"_" #name ":                                   \n"

-#else

-#define DECLARE_FUNCTION(name)                                                 \

-    ".text                                     \n"                             \

-    ".align 4,0x90                             \n"                             \

-#name ":                                       \n"

+// MemorySanitizer does not support assembly code yet. http://crbug.com/344505

+#if defined(__has_feature)

+#if __has_feature(memory_sanitizer)

+#define LIBYUV_DISABLE_X86

 #endif

 #endif

-// The following are available for Visual C:

-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \

-    defined(_MSC_VER) && !defined(__clang__)

+// The following are available for Visual C and clangcl 32 bit:

+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

 #define HAS_TRANSPOSEWX8_SSSE3

 #define HAS_TRANSPOSEUVWX8_SSE2

 #endif

-// The following are available for GCC but not NaCL:

+// The following are available for GCC 32 or 64 bit but not NaCL for 64 bit:

 #if !defined(LIBYUV_DISABLE_X86) && \

     (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))

 #define HAS_TRANSPOSEWX8_SSSE3

 #endif

-// The following are available for 32 bit GCC:

-#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__)  && !defined(__clang__)

-#define HAS_TRANSPOSEUVWX8_SSE2

-#endif

 // The following are available for 64 bit GCC but not NaCL:

 #if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \

     defined(__x86_64__)

@@ -85,8 +56,8 @@

 #if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \

     defined(__mips__) && \

     defined(__mips_dsp) && (__mips_dsp_rev >= 2)

-#define HAS_TRANSPOSEWX8_MIPS_DSPR2

-#define HAS_TRANSPOSEUVWx8_MIPS_DSPR2

+#define HAS_TRANSPOSEWX8_DSPR2

+#define HAS_TRANSPOSEUVWX8_DSPR2

 #endif  // defined(__mips__)

 void TransposeWxH_C(const uint8* src, int src_stride,

@@ -100,7 +71,9 @@

                         uint8* dst, int dst_stride, int width);

 void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,

                              uint8* dst, int dst_stride, int width);

-void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,

+void TransposeWx8_DSPR2(const uint8* src, int src_stride,

+                        uint8* dst, int dst_stride, int width);

+void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,

                              uint8* dst, int dst_stride, int width);

 void TransposeWx8_Any_NEON(const uint8* src, int src_stride,

@@ -109,8 +82,8 @@

                             uint8* dst, int dst_stride, int width);

 void TransposeWx8_Fast_Any_SSSE3(const uint8* src, int src_stride,

                                  uint8* dst, int dst_stride, int width);

-void TransposeWx8_Any_MIPS_DSPR2(const uint8* src, int src_stride,

-                                 uint8* dst, int dst_stride, int width);

+void TransposeWx8_Any_DSPR2(const uint8* src, int src_stride,

+                            uint8* dst, int dst_stride, int width);

 void TransposeUVWxH_C(const uint8* src, int src_stride,

                       uint8* dst_a, int dst_stride_a,

@@ -126,9 +99,19 @@

 void TransposeUVWx8_NEON(const uint8* src, int src_stride,

                          uint8* dst_a, int dst_stride_a,

                          uint8* dst_b, int dst_stride_b, int width);

-void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,

-                               uint8* dst_a, int dst_stride_a,

-                               uint8* dst_b, int dst_stride_b, int width);

+void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,

+                          uint8* dst_a, int dst_stride_a,

+                          uint8* dst_b, int dst_stride_b, int width);

+void TransposeUVWx8_Any_SSE2(const uint8* src, int src_stride,

+                             uint8* dst_a, int dst_stride_a,

+                             uint8* dst_b, int dst_stride_b, int width);

+void TransposeUVWx8_Any_NEON(const uint8* src, int src_stride,

+                             uint8* dst_a, int dst_stride_a,

+                             uint8* dst_b, int dst_stride_b, int width);

+void TransposeUVWx8_Any_DSPR2(const uint8* src, int src_stride,

+                              uint8* dst_a, int dst_stride_a,

+                              uint8* dst_b, int dst_stride_b, int width);

 #ifdef __cplusplus

 }  // extern "C"

--- a/third_party/libyuv/include/libyuv/row.h

+++ b/third_party/libyuv/include/libyuv/row.h

@@ -41,6 +41,12 @@

     (defined(__i386__) && !defined(__SSE2__))

 #define LIBYUV_DISABLE_X86

 #endif

+// MemorySanitizer does not support assembly code yet. http://crbug.com/344505

+#if defined(__has_feature)

+#if __has_feature(memory_sanitizer)

+#define LIBYUV_DISABLE_X86

+#endif

+#endif

 // True if compiling for SSSE3 as a requirement.

 #if defined(__SSSE3__) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 3))

 #define LIBYUV_SSSE3_ONLY

@@ -56,6 +62,26 @@

 #endif  // clang >= 3.5

 #endif  // __clang__

+// GCC >= 4.7.0 required for AVX2.

+#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))

+#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))

+#define GCC_HAS_AVX2 1

+#endif  // GNUC >= 4.7

+#endif  // __GNUC__

+// clang >= 3.4.0 required for AVX2.

+#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))

+#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4))

+#define CLANG_HAS_AVX2 1

+#endif  // clang >= 3.4

+#endif  // __clang__

+// Visual C 2012 required for AVX2.

+#if defined(_M_IX86) && !defined(__clang__) && \

+    defined(_MSC_VER) && _MSC_VER >= 1700

+#define VISUALC_HAS_AVX2 1

+#endif  // VisualStudio >= 2012

 // The following are available on all x86 platforms:

 #if !defined(LIBYUV_DISABLE_X86) && \

     (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))

@@ -71,25 +97,23 @@

 #define HAS_ARGBTOARGB4444ROW_SSE2

 #define HAS_ARGBTORAWROW_SSSE3

 #define HAS_ARGBTORGB24ROW_SSSE3

+#define HAS_ARGBTORGB565DITHERROW_SSE2

 #define HAS_ARGBTORGB565ROW_SSE2

-#define HAS_ARGBTOUV422ROW_SSSE3

 #define HAS_ARGBTOUV444ROW_SSSE3

 #define HAS_ARGBTOUVJROW_SSSE3

 #define HAS_ARGBTOUVROW_SSSE3

 #define HAS_ARGBTOYJROW_SSSE3

 #define HAS_ARGBTOYROW_SSSE3

+#define HAS_ARGBEXTRACTALPHAROW_SSE2

 #define HAS_BGRATOUVROW_SSSE3

 #define HAS_BGRATOYROW_SSSE3

 #define HAS_COPYROW_ERMS

 #define HAS_COPYROW_SSE2

+#define HAS_H422TOARGBROW_SSSE3

 #define HAS_I400TOARGBROW_SSE2

-#define HAS_I411TOARGBROW_SSSE3

-#define HAS_I422TOABGRROW_SSSE3

 #define HAS_I422TOARGB1555ROW_SSSE3

 #define HAS_I422TOARGB4444ROW_SSSE3

 #define HAS_I422TOARGBROW_SSSE3

-#define HAS_I422TOBGRAROW_SSSE3

-#define HAS_I422TORAWROW_SSSE3

 #define HAS_I422TORGB24ROW_SSSE3

 #define HAS_I422TORGB565ROW_SSSE3

 #define HAS_I422TORGBAROW_SSSE3

@@ -99,15 +123,13 @@

 #define HAS_J400TOARGBROW_SSE2

 #define HAS_J422TOARGBROW_SSSE3

 #define HAS_MERGEUVROW_SSE2

-#define HAS_MIRRORROW_SSE2

 #define HAS_MIRRORROW_SSSE3

-#define HAS_MIRRORROW_UV_SSSE3

 #define HAS_MIRRORUVROW_SSSE3

 #define HAS_NV12TOARGBROW_SSSE3

 #define HAS_NV12TORGB565ROW_SSSE3

 #define HAS_NV21TOARGBROW_SSSE3

-#define HAS_NV21TORGB565ROW_SSSE3

 #define HAS_RAWTOARGBROW_SSSE3

+#define HAS_RAWTORGB24ROW_SSSE3

 #define HAS_RAWTOYROW_SSSE3

 #define HAS_RGB24TOARGBROW_SSSE3

 #define HAS_RGB24TOYROW_SSSE3

@@ -145,9 +167,9 @@

 #define HAS_ARGBSHADEROW_SSE2

 #define HAS_ARGBSUBTRACTROW_SSE2

 #define HAS_ARGBUNATTENUATEROW_SSE2

+#define HAS_BLENDPLANEROW_SSSE3

 #define HAS_COMPUTECUMULATIVESUMROW_SSE2

 #define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2

-#define HAS_INTERPOLATEROW_SSE2

 #define HAS_INTERPOLATEROW_SSSE3

 #define HAS_RGBCOLORTABLEROW_X86

 #define HAS_SOBELROW_SSE2

@@ -155,54 +177,18 @@

 #define HAS_SOBELXROW_SSE2

 #define HAS_SOBELXYROW_SSE2

 #define HAS_SOBELYROW_SSE2

-#endif

-// The following are available on x64 Visual C and clangcl.

-#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64) && \

-    (!defined(__clang__) || defined(__SSSE3__))

-#define HAS_I422TOARGBROW_SSSE3

+// The following functions fail on gcc/clang 32 bit with fpic and framepointer.

+// caveat: clangcl uses row_win.cc which works.

+#if defined(NDEBUG) || !(defined(_DEBUG) && defined(__i386__)) || \

+    !defined(__i386__) || defined(_MSC_VER)

+// TODO(fbarchard): fix build error on x86 debug

+// https://code.google.com/p/libyuv/issues/detail?id=524

+#define HAS_I411TOARGBROW_SSSE3

+// TODO(fbarchard): fix build error on android_full_debug=1

+// https://code.google.com/p/libyuv/issues/detail?id=517

+#define HAS_I422ALPHATOARGBROW_SSSE3

 #endif

-// GCC >= 4.7.0 required for AVX2.

-#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))

-#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))

-#define GCC_HAS_AVX2 1

-#endif  // GNUC >= 4.7

-#endif  // __GNUC__

-// clang >= 3.4.0 required for AVX2.

-#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))

-#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4))

-#define CLANG_HAS_AVX2 1

-#endif  // clang >= 3.4

-#endif  // __clang__

-// Visual C 2012 required for AVX2.

-#if defined(_M_IX86) && !defined(__clang__) && \

-    defined(_MSC_VER) && _MSC_VER >= 1700

-#define VISUALC_HAS_AVX2 1

-#endif  // VisualStudio >= 2012

-// The following are available require VS2012.  Port to GCC.

-#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)

-#define HAS_ARGB1555TOARGBROW_AVX2

-#define HAS_ARGB4444TOARGBROW_AVX2

-#define HAS_ARGBTOARGB1555ROW_AVX2

-#define HAS_ARGBTOARGB4444ROW_AVX2

-#define HAS_ARGBTORGB565DITHERROW_AVX2

-#define HAS_ARGBTORGB565DITHERROW_SSE2

-#define HAS_ARGBTORGB565ROW_AVX2

-#define HAS_I411TOARGBROW_AVX2

-#define HAS_I422TOARGB1555ROW_AVX2

-#define HAS_I422TOARGB4444ROW_AVX2

-#define HAS_I422TORGB565ROW_AVX2

-#define HAS_I444TOARGBROW_AVX2

-#define HAS_J400TOARGBROW_AVX2

-#define HAS_NV12TOARGBROW_AVX2

-#define HAS_NV12TORGB565ROW_AVX2

-#define HAS_NV21TOARGBROW_AVX2

-#define HAS_NV21TORGB565ROW_AVX2

-#define HAS_RGB565TOARGBROW_AVX2

 #endif

 // The following are available on all x86 platforms, but

@@ -215,21 +201,34 @@

 #define HAS_ARGBMIRRORROW_AVX2

 #define HAS_ARGBPOLYNOMIALROW_AVX2

 #define HAS_ARGBSHUFFLEROW_AVX2

+#define HAS_ARGBTORGB565DITHERROW_AVX2

+#define HAS_ARGBTOUVJROW_AVX2

 #define HAS_ARGBTOUVROW_AVX2

 #define HAS_ARGBTOYJROW_AVX2

 #define HAS_ARGBTOYROW_AVX2

 #define HAS_COPYROW_AVX

+#define HAS_H422TOARGBROW_AVX2

 #define HAS_I400TOARGBROW_AVX2

-#define HAS_I422TOABGRROW_AVX2

+#if !(defined(_DEBUG) && defined(__i386__))

+// TODO(fbarchard): fix build error on android_full_debug=1

+// https://code.google.com/p/libyuv/issues/detail?id=517

+#define HAS_I422ALPHATOARGBROW_AVX2

+#endif

+#define HAS_I411TOARGBROW_AVX2

+#define HAS_I422TOARGB1555ROW_AVX2

+#define HAS_I422TOARGB4444ROW_AVX2

 #define HAS_I422TOARGBROW_AVX2

-#define HAS_I422TOBGRAROW_AVX2

-#define HAS_I422TORAWROW_AVX2

 #define HAS_I422TORGB24ROW_AVX2

+#define HAS_I422TORGB565ROW_AVX2

 #define HAS_I422TORGBAROW_AVX2

+#define HAS_I444TOARGBROW_AVX2

 #define HAS_INTERPOLATEROW_AVX2

 #define HAS_J422TOARGBROW_AVX2

 #define HAS_MERGEUVROW_AVX2

 #define HAS_MIRRORROW_AVX2

+#define HAS_NV12TOARGBROW_AVX2

+#define HAS_NV12TORGB565ROW_AVX2

+#define HAS_NV21TOARGBROW_AVX2

 #define HAS_SPLITUVROW_AVX2

 #define HAS_UYVYTOARGBROW_AVX2

 #define HAS_UYVYTOUV422ROW_AVX2

@@ -246,17 +245,29 @@

 #define HAS_ARGBMULTIPLYROW_AVX2

 #define HAS_ARGBSUBTRACTROW_AVX2

 #define HAS_ARGBUNATTENUATEROW_AVX2

+#define HAS_BLENDPLANEROW_AVX2

 #endif

-// The following are disabled when SSSE3 is available:

-#if !defined(LIBYUV_DISABLE_X86) && \

-    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \

-    !defined(LIBYUV_SSSE3_ONLY)

-#define HAS_ARGBATTENUATEROW_SSE2

-#define HAS_ARGBBLENDROW_SSE2

-#define HAS_MIRRORROW_SSE2

+// The following are available for AVX2 Visual C and clangcl 32 bit:

+// TODO(fbarchard): Port to gcc.

+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \

+    (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))

+#define HAS_ARGB1555TOARGBROW_AVX2

+#define HAS_ARGB4444TOARGBROW_AVX2

+#define HAS_ARGBTOARGB1555ROW_AVX2

+#define HAS_ARGBTOARGB4444ROW_AVX2

+#define HAS_ARGBTORGB565ROW_AVX2

+#define HAS_J400TOARGBROW_AVX2

+#define HAS_RGB565TOARGBROW_AVX2

 #endif

+// The following are also available on x64 Visual C.

+#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64) && \

+    (!defined(__clang__) || defined(__SSSE3__))

+#define HAS_I422ALPHATOARGBROW_SSSE3

+#define HAS_I422TOARGBROW_SSSE3

+#endif

 // The following are available on Neon platforms:

 #if !defined(LIBYUV_DISABLE_NEON) && \

     (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))

@@ -268,29 +279,29 @@

 #define HAS_ARGB4444TOARGBROW_NEON

 #define HAS_ARGB4444TOUVROW_NEON

 #define HAS_ARGB4444TOYROW_NEON

+#define HAS_ARGBSETROW_NEON

 #define HAS_ARGBTOARGB1555ROW_NEON

 #define HAS_ARGBTOARGB4444ROW_NEON

 #define HAS_ARGBTORAWROW_NEON

 #define HAS_ARGBTORGB24ROW_NEON

+#define HAS_ARGBTORGB565DITHERROW_NEON

 #define HAS_ARGBTORGB565ROW_NEON

 #define HAS_ARGBTOUV411ROW_NEON

-#define HAS_ARGBTOUV422ROW_NEON

 #define HAS_ARGBTOUV444ROW_NEON

 #define HAS_ARGBTOUVJROW_NEON

 #define HAS_ARGBTOUVROW_NEON

 #define HAS_ARGBTOYJROW_NEON

 #define HAS_ARGBTOYROW_NEON

+#define HAS_ARGBEXTRACTALPHAROW_NEON

 #define HAS_BGRATOUVROW_NEON

 #define HAS_BGRATOYROW_NEON

 #define HAS_COPYROW_NEON

-#define HAS_J400TOARGBROW_NEON

+#define HAS_I400TOARGBROW_NEON

 #define HAS_I411TOARGBROW_NEON

-#define HAS_I422TOABGRROW_NEON

+#define HAS_I422ALPHATOARGBROW_NEON

 #define HAS_I422TOARGB1555ROW_NEON

 #define HAS_I422TOARGB4444ROW_NEON

 #define HAS_I422TOARGBROW_NEON

-#define HAS_I422TOBGRAROW_NEON

-#define HAS_I422TORAWROW_NEON

 #define HAS_I422TORGB24ROW_NEON

 #define HAS_I422TORGB565ROW_NEON

 #define HAS_I422TORGBAROW_NEON

@@ -297,6 +308,7 @@

 #define HAS_I422TOUYVYROW_NEON

 #define HAS_I422TOYUY2ROW_NEON

 #define HAS_I444TOARGBROW_NEON

+#define HAS_J400TOARGBROW_NEON

 #define HAS_MERGEUVROW_NEON

 #define HAS_MIRRORROW_NEON

 #define HAS_MIRRORUVROW_NEON

@@ -303,8 +315,8 @@

 #define HAS_NV12TOARGBROW_NEON

 #define HAS_NV12TORGB565ROW_NEON

 #define HAS_NV21TOARGBROW_NEON

-#define HAS_NV21TORGB565ROW_NEON

 #define HAS_RAWTOARGBROW_NEON

+#define HAS_RAWTORGB24ROW_NEON

 #define HAS_RAWTOUVROW_NEON

 #define HAS_RAWTOYROW_NEON

 #define HAS_RGB24TOARGBROW_NEON

@@ -316,23 +328,21 @@

 #define HAS_RGBATOUVROW_NEON

 #define HAS_RGBATOYROW_NEON

 #define HAS_SETROW_NEON

-#define HAS_ARGBSETROW_NEON

 #define HAS_SPLITUVROW_NEON

 #define HAS_UYVYTOARGBROW_NEON

 #define HAS_UYVYTOUV422ROW_NEON

 #define HAS_UYVYTOUVROW_NEON

 #define HAS_UYVYTOYROW_NEON

-#define HAS_I400TOARGBROW_NEON

 #define HAS_YUY2TOARGBROW_NEON

 #define HAS_YUY2TOUV422ROW_NEON

 #define HAS_YUY2TOUVROW_NEON

 #define HAS_YUY2TOYROW_NEON

-#define HAS_ARGBTORGB565DITHERROW_NEON

 // Effects:

 #define HAS_ARGBADDROW_NEON

 #define HAS_ARGBATTENUATEROW_NEON

 #define HAS_ARGBBLENDROW_NEON

+#define HAS_ARGBCOLORMATRIXROW_NEON

 #define HAS_ARGBGRAYROW_NEON

 #define HAS_ARGBMIRRORROW_NEON

 #define HAS_ARGBMULTIPLYROW_NEON

@@ -339,6 +349,7 @@

 #define HAS_ARGBQUANTIZEROW_NEON

 #define HAS_ARGBSEPIAROW_NEON

 #define HAS_ARGBSHADEROW_NEON

+#define HAS_ARGBSHUFFLEROW_NEON

 #define HAS_ARGBSUBTRACTROW_NEON

 #define HAS_INTERPOLATEROW_NEON

 #define HAS_SOBELROW_NEON

@@ -346,8 +357,6 @@

 #define HAS_SOBELXROW_NEON

 #define HAS_SOBELXYROW_NEON

 #define HAS_SOBELYROW_NEON

-#define HAS_ARGBCOLORMATRIXROW_NEON

-#define HAS_ARGBSHUFFLEROW_NEON

 #endif

 // The following are available on Mips platforms:

@@ -355,17 +364,15 @@

     (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)

 #define HAS_COPYROW_MIPS

 #if defined(__mips_dsp) && (__mips_dsp_rev >= 2)

-#define HAS_I422TOABGRROW_MIPS_DSPR2

-#define HAS_I422TOARGBROW_MIPS_DSPR2

-#define HAS_I422TOBGRAROW_MIPS_DSPR2

-#define HAS_INTERPOLATEROW_MIPS_DSPR2

-#define HAS_MIRRORROW_MIPS_DSPR2

-#define HAS_MIRRORUVROW_MIPS_DSPR2

-#define HAS_SPLITUVROW_MIPS_DSPR2

+#define HAS_I422TOARGBROW_DSPR2

+#define HAS_INTERPOLATEROW_DSPR2

+#define HAS_MIRRORROW_DSPR2

+#define HAS_MIRRORUVROW_DSPR2

+#define HAS_SPLITUVROW_DSPR2

 #endif

 #endif

-#if defined(_MSC_VER) && !defined(__CLR_VER)

+#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)

 #define SIMD_ALIGNED(var) __declspec(align(16)) var

 #define SIMD_ALIGNED32(var) __declspec(align(64)) var

 typedef __declspec(align(16)) int16 vec16[8];

@@ -380,7 +387,7 @@

 typedef __declspec(align(32)) uint16 ulvec16[16];

 typedef __declspec(align(32)) uint32 ulvec32[8];

 typedef __declspec(align(32)) uint8 ulvec8[32];

-#elif defined(__GNUC__)

+#elif !defined(__pnacl__) && (defined(__GNUC__) || defined(__clang__))

 // Caveat GCC 4.2 to 4.7 have a known issue using vectors with const.

 #define SIMD_ALIGNED(var) var __attribute__((aligned(16)))

 #define SIMD_ALIGNED32(var) var __attribute__((aligned(64)))

@@ -413,6 +420,56 @@

 typedef uint8 ulvec8[32];

 #endif

+#if defined(__aarch64__)

+// This struct is for Arm64 color conversion.

+struct YuvConstants {

+  uvec16 kUVToRB;

+  uvec16 kUVToRB2;

+  uvec16 kUVToG;

+  uvec16 kUVToG2;

+  vec16 kUVBiasBGR;

+  vec32 kYToRgb;

+};

+#elif defined(__arm__)

+// This struct is for ArmV7 color conversion.

+struct YuvConstants {

+  uvec8 kUVToRB;

+  uvec8 kUVToG;

+  vec16 kUVBiasBGR;

+  vec32 kYToRgb;

+};

+#else

+// This struct is for Intel color conversion.

+struct YuvConstants {

+  lvec8 kUVToB;

+  lvec8 kUVToG;

+  lvec8 kUVToR;

+  lvec16 kUVBiasB;

+  lvec16 kUVBiasG;

+  lvec16 kUVBiasR;

+  lvec16 kYToRgb;

+};

+// Offsets into YuvConstants structure

+#define KUVTOB   0

+#define KUVTOG   32

+#define KUVTOR   64

+#define KUVBIASB 96

+#define KUVBIASG 128

+#define KUVBIASR 160

+#define KYTORGB  192

+#endif

+// Conversion matrix for YUV to RGB

+extern const struct YuvConstants kYuvI601Constants;  // BT.601

+extern const struct YuvConstants kYuvJPEGConstants;  // JPeg color space

+extern const struct YuvConstants kYuvH709Constants;  // BT.709

+// Conversion matrix for YVU to BGR

+extern const struct YuvConstants kYvuI601Constants;  // BT.601

+extern const struct YuvConstants kYvuJPEGConstants;  // JPeg color space

+extern const struct YuvConstants kYvuH709Constants;  // BT.709

 #if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__)

 #define OMITFP

 #else

@@ -502,159 +559,166 @@

                         const uint8* src_u,

                         const uint8* src_v,

                         uint8* dst_argb,

+                        const struct YuvConstants* yuvconstants,

                         int width);

 void I422ToARGBRow_NEON(const uint8* src_y,

                         const uint8* src_u,

                         const uint8* src_v,

                         uint8* dst_argb,

+                        const struct YuvConstants* yuvconstants,

                         int width);

-void I411ToARGBRow_NEON(const uint8* src_y,

+void I422AlphaToARGBRow_NEON(const uint8* y_buf,

+                             const uint8* u_buf,

+                             const uint8* v_buf,

+                             const uint8* a_buf,

+                             uint8* dst_argb,

+                             const struct YuvConstants* yuvconstants,

+                             int width);

+void I422ToARGBRow_NEON(const uint8* src_y,

                         const uint8* src_u,

                         const uint8* src_v,

                         uint8* dst_argb,

+                        const struct YuvConstants* yuvconstants,

                         int width);

-void I422ToBGRARow_NEON(const uint8* src_y,

+void I411ToARGBRow_NEON(const uint8* src_y,

                         const uint8* src_u,

                         const uint8* src_v,

-                        uint8* dst_bgra,

+                        uint8* dst_argb,

+                        const struct YuvConstants* yuvconstants,

                         int width);

-void I422ToABGRRow_NEON(const uint8* src_y,

-                        const uint8* src_u,

-                        const uint8* src_v,

-                        uint8* dst_abgr,

-                        int width);

 void I422ToRGBARow_NEON(const uint8* src_y,

                         const uint8* src_u,

                         const uint8* src_v,

                         uint8* dst_rgba,

+                        const struct YuvConstants* yuvconstants,

                         int width);

 void I422ToRGB24Row_NEON(const uint8* src_y,

                          const uint8* src_u,

                          const uint8* src_v,

                          uint8* dst_rgb24,

+                         const struct YuvConstants* yuvconstants,

                          int width);

-void I422ToRAWRow_NEON(const uint8* src_y,

-                       const uint8* src_u,

-                       const uint8* src_v,

-                       uint8* dst_raw,

-                       int width);

 void I422ToRGB565Row_NEON(const uint8* src_y,

                           const uint8* src_u,

                           const uint8* src_v,

                           uint8* dst_rgb565,

+                          const struct YuvConstants* yuvconstants,

                           int width);

 void I422ToARGB1555Row_NEON(const uint8* src_y,

                             const uint8* src_u,

                             const uint8* src_v,

                             uint8* dst_argb1555,

+                            const struct YuvConstants* yuvconstants,

                             int width);

 void I422ToARGB4444Row_NEON(const uint8* src_y,

                             const uint8* src_u,

                             const uint8* src_v,

                             uint8* dst_argb4444,

+                            const struct YuvConstants* yuvconstants,

                             int width);

 void NV12ToARGBRow_NEON(const uint8* src_y,

                         const uint8* src_uv,

                         uint8* dst_argb,

+                        const struct YuvConstants* yuvconstants,

                         int width);

-void NV21ToARGBRow_NEON(const uint8* src_y,

-                        const uint8* src_vu,

-                        uint8* dst_argb,

-                        int width);

 void NV12ToRGB565Row_NEON(const uint8* src_y,

                           const uint8* src_uv,

                           uint8* dst_rgb565,

+                          const struct YuvConstants* yuvconstants,

                           int width);

-void NV21ToRGB565Row_NEON(const uint8* src_y,

-                          const uint8* src_vu,

-                          uint8* dst_rgb565,

-                          int width);

+void NV21ToARGBRow_NEON(const uint8* src_y,

+                        const uint8* src_vu,

+                        uint8* dst_argb,

+                        const struct YuvConstants* yuvconstants,

+                        int width);

 void YUY2ToARGBRow_NEON(const uint8* src_yuy2,

                         uint8* dst_argb,

+                        const struct YuvConstants* yuvconstants,

                         int width);

 void UYVYToARGBRow_NEON(const uint8* src_uyvy,

                         uint8* dst_argb,

+                        const struct YuvConstants* yuvconstants,

                         int width);

-void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix);

-void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int pix);

-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);

-void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix);

-void ARGBToYJRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int pix);

-void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);

-void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix);

-void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix);

-void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix);

-void RGB24ToYRow_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix);

-void RAWToYRow_SSSE3(const uint8* src_raw, uint8* dst_y, int pix);

-void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix);

-void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix);

+void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width);

+void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int width);

+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width);

+void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width);

+void ARGBToYJRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int width);

+void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width);

+void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width);

+void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width);

+void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width);

+void RGB24ToYRow_SSSE3(const uint8* src_rgb24, uint8* dst_y, int width);

+void RAWToYRow_SSSE3(const uint8* src_raw, uint8* dst_y, int width);

+void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width);

+void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width);

 void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,

-                         int pix);

-void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,

-                         int pix);

+                         int width);

 void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,

-                         int pix);

+                         int width);

 void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,

-                      uint8* dst_u, uint8* dst_v, int pix);

+                      uint8* dst_u, uint8* dst_v, int width);

 void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,

-                       uint8* dst_u, uint8* dst_v, int pix);

+                       uint8* dst_u, uint8* dst_v, int width);

 void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,

-                      uint8* dst_u, uint8* dst_v, int pix);

+                      uint8* dst_u, uint8* dst_v, int width);

 void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,

-                      uint8* dst_u, uint8* dst_v, int pix);

+                      uint8* dst_u, uint8* dst_v, int width);

 void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,

-                      uint8* dst_u, uint8* dst_v, int pix);

+                      uint8* dst_u, uint8* dst_v, int width);

 void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,

-                       uint8* dst_u, uint8* dst_v, int pix);

+                       uint8* dst_u, uint8* dst_v, int width);

 void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,

-                     uint8* dst_u, uint8* dst_v, int pix);

+                     uint8* dst_u, uint8* dst_v, int width);

 void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,

-                        uint8* dst_u, uint8* dst_v, int pix);

+                        uint8* dst_u, uint8* dst_v, int width);

 void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,

-                          uint8* dst_u, uint8* dst_v, int pix);

+                          uint8* dst_u, uint8* dst_v, int width);

 void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,

-                          uint8* dst_u, uint8* dst_v, int pix);

-void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix);

-void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix);

-void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix);

-void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix);

-void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix);

-void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix);

-void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix);

-void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix);

-void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);

-void ARGBToYJRow_C(const uint8* src_argb, uint8* dst_y, int pix);

-void BGRAToYRow_C(const uint8* src_bgra, uint8* dst_y, int pix);

-void ABGRToYRow_C(const uint8* src_abgr, uint8* dst_y, int pix);

-void RGBAToYRow_C(const uint8* src_rgba, uint8* dst_y, int pix);

-void RGB24ToYRow_C(const uint8* src_rgb24, uint8* dst_y, int pix);

-void RAWToYRow_C(const uint8* src_raw, uint8* dst_y, int pix);

-void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int pix);

-void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int pix);

-void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int pix);

-void ARGBToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);

-void ARGBToYJRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);

-void BGRAToYRow_Any_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix);

-void ABGRToYRow_Any_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix);

-void RGBAToYRow_Any_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix);

-void RGB24ToYRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix);

-void RAWToYRow_Any_SSSE3(const uint8* src_raw, uint8* dst_y, int pix);

-void ARGBToYRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int pix);

-void ARGBToYJRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int pix);

-void BGRAToYRow_Any_NEON(const uint8* src_bgra, uint8* dst_y, int pix);

-void ABGRToYRow_Any_NEON(const uint8* src_abgr, uint8* dst_y, int pix);

-void RGBAToYRow_Any_NEON(const uint8* src_rgba, uint8* dst_y, int pix);

-void RGB24ToYRow_Any_NEON(const uint8* src_rgb24, uint8* dst_y, int pix);

-void RAWToYRow_Any_NEON(const uint8* src_raw, uint8* dst_y, int pix);

-void RGB565ToYRow_Any_NEON(const uint8* src_rgb565, uint8* dst_y, int pix);

-void ARGB1555ToYRow_Any_NEON(const uint8* src_argb1555, uint8* dst_y, int pix);

-void ARGB4444ToYRow_Any_NEON(const uint8* src_argb4444, uint8* dst_y, int pix);

+                          uint8* dst_u, uint8* dst_v, int width);

+void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width);

+void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width);

+void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width);

+void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width);

+void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width);

+void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width);

+void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width);

+void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width);

+void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int width);

+void ARGBToYJRow_C(const uint8* src_argb, uint8* dst_y, int width);

+void BGRAToYRow_C(const uint8* src_bgra, uint8* dst_y, int width);

+void ABGRToYRow_C(const uint8* src_abgr, uint8* dst_y, int width);

+void RGBAToYRow_C(const uint8* src_rgba, uint8* dst_y, int width);

+void RGB24ToYRow_C(const uint8* src_rgb24, uint8* dst_y, int width);

+void RAWToYRow_C(const uint8* src_raw, uint8* dst_y, int width);

+void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width);

+void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width);

+void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width);

+void ARGBToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int width);

+void ARGBToYJRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int width);

+void BGRAToYRow_Any_SSSE3(const uint8* src_bgra, uint8* dst_y, int width);

+void ABGRToYRow_Any_SSSE3(const uint8* src_abgr, uint8* dst_y, int width);

+void RGBAToYRow_Any_SSSE3(const uint8* src_rgba, uint8* dst_y, int width);

+void RGB24ToYRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_y, int width);

+void RAWToYRow_Any_SSSE3(const uint8* src_raw, uint8* dst_y, int width);

+void ARGBToYRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int width);

+void ARGBToYJRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int width);

+void BGRAToYRow_Any_NEON(const uint8* src_bgra, uint8* dst_y, int width);

+void ABGRToYRow_Any_NEON(const uint8* src_abgr, uint8* dst_y, int width);

+void RGBAToYRow_Any_NEON(const uint8* src_rgba, uint8* dst_y, int width);

+void RGB24ToYRow_Any_NEON(const uint8* src_rgb24, uint8* dst_y, int width);

+void RAWToYRow_Any_NEON(const uint8* src_raw, uint8* dst_y, int width);

+void RGB565ToYRow_Any_NEON(const uint8* src_rgb565, uint8* dst_y, int width);

+void ARGB1555ToYRow_Any_NEON(const uint8* src_argb1555, uint8* dst_y,

+                             int width);

+void ARGB4444ToYRow_Any_NEON(const uint8* src_argb4444, uint8* dst_y,

+                             int width);

 void ARGBToUVRow_AVX2(const uint8* src_argb, int src_stride_argb,

                       uint8* dst_u, uint8* dst_v, int width);

-void ARGBToUVRow_Any_AVX2(const uint8* src_argb, int src_stride_argb,

-                          uint8* dst_u, uint8* dst_v, int width);

+void ARGBToUVJRow_AVX2(const uint8* src_argb, int src_stride_argb,

+                       uint8* dst_u, uint8* dst_v, int width);

 void ARGBToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,

                        uint8* dst_u, uint8* dst_v, int width);

 void ARGBToUVJRow_SSSE3(const uint8* src_argb, int src_stride_argb,

@@ -665,6 +729,10 @@

                        uint8* dst_u, uint8* dst_v, int width);

 void RGBAToUVRow_SSSE3(const uint8* src_rgba, int src_stride_rgba,

                        uint8* dst_u, uint8* dst_v, int width);

+void ARGBToUVRow_Any_AVX2(const uint8* src_argb, int src_stride_argb,

+                          uint8* dst_u, uint8* dst_v, int width);

+void ARGBToUVJRow_Any_AVX2(const uint8* src_argb, int src_stride_argb,

+                           uint8* dst_u, uint8* dst_v, int width);

 void ARGBToUVRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,

                            uint8* dst_u, uint8* dst_v, int width);

 void ARGBToUVJRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,

@@ -676,33 +744,31 @@

 void RGBAToUVRow_Any_SSSE3(const uint8* src_rgba, int src_stride_rgba,

                            uint8* dst_u, uint8* dst_v, int width);

 void ARGBToUV444Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,

-                             int pix);

-void ARGBToUV422Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,

-                             int pix);

+                             int width);

 void ARGBToUV411Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,

-                             int pix);

+                             int width);

 void ARGBToUVRow_Any_NEON(const uint8* src_argb, int src_stride_argb,

-                          uint8* dst_u, uint8* dst_v, int pix);

+                          uint8* dst_u, uint8* dst_v, int width);

 void ARGBToUVJRow_Any_NEON(const uint8* src_argb, int src_stride_argb,

-                           uint8* dst_u, uint8* dst_v, int pix);

+                           uint8* dst_u, uint8* dst_v, int width);

 void BGRAToUVRow_Any_NEON(const uint8* src_bgra, int src_stride_bgra,

-                          uint8* dst_u, uint8* dst_v, int pix);

+                          uint8* dst_u, uint8* dst_v, int width);

 void ABGRToUVRow_Any_NEON(const uint8* src_abgr, int src_stride_abgr,

-                          uint8* dst_u, uint8* dst_v, int pix);

+                          uint8* dst_u, uint8* dst_v, int width);

 void RGBAToUVRow_Any_NEON(const uint8* src_rgba, int src_stride_rgba,

-                          uint8* dst_u, uint8* dst_v, int pix);

+                          uint8* dst_u, uint8* dst_v, int width);

 void RGB24ToUVRow_Any_NEON(const uint8* src_rgb24, int src_stride_rgb24,

-                           uint8* dst_u, uint8* dst_v, int pix);

+                           uint8* dst_u, uint8* dst_v, int width);

 void RAWToUVRow_Any_NEON(const uint8* src_raw, int src_stride_raw,

-                         uint8* dst_u, uint8* dst_v, int pix);

+                         uint8* dst_u, uint8* dst_v, int width);

 void RGB565ToUVRow_Any_NEON(const uint8* src_rgb565, int src_stride_rgb565,

-                            uint8* dst_u, uint8* dst_v, int pix);

+                            uint8* dst_u, uint8* dst_v, int width);

 void ARGB1555ToUVRow_Any_NEON(const uint8* src_argb1555,

                               int src_stride_argb1555,

-                              uint8* dst_u, uint8* dst_v, int pix);

+                              uint8* dst_u, uint8* dst_v, int width);

 void ARGB4444ToUVRow_Any_NEON(const uint8* src_argb4444,

                               int src_stride_argb4444,

-                              uint8* dst_u, uint8* dst_v, int pix);

+                              uint8* dst_u, uint8* dst_v, int width);

 void ARGBToUVRow_C(const uint8* src_argb, int src_stride_argb,

                    uint8* dst_u, uint8* dst_v, int width);

 void ARGBToUVJRow_C(const uint8* src_argb, int src_stride_argb,

@@ -729,25 +795,15 @@

 void ARGBToUV444Row_Any_SSSE3(const uint8* src_argb,

                               uint8* dst_u, uint8* dst_v, int width);

-void ARGBToUV422Row_SSSE3(const uint8* src_argb,

-                          uint8* dst_u, uint8* dst_v, int width);

-void ARGBToUV422Row_Any_SSSE3(const uint8* src_argb,

-                              uint8* dst_u, uint8* dst_v, int width);

 void ARGBToUV444Row_C(const uint8* src_argb,

                       uint8* dst_u, uint8* dst_v, int width);

-void ARGBToUV422Row_C(const uint8* src_argb,

-                      uint8* dst_u, uint8* dst_v, int width);

 void ARGBToUV411Row_C(const uint8* src_argb,

                       uint8* dst_u, uint8* dst_v, int width);

-void ARGBToUVJ422Row_C(const uint8* src_argb,

-                       uint8* dst_u, uint8* dst_v, int width);

 void MirrorRow_AVX2(const uint8* src, uint8* dst, int width);

 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width);

-void MirrorRow_SSE2(const uint8* src, uint8* dst, int width);

 void MirrorRow_NEON(const uint8* src, uint8* dst, int width);

-void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width);

+void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width);

 void MirrorRow_C(const uint8* src, uint8* dst, int width);

 void MirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width);

 void MirrorRow_Any_SSSE3(const uint8* src, uint8* dst, int width);

@@ -758,10 +814,9 @@

                        int width);

 void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,

                       int width);

-void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,

-                            int width);

-void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v,

-                   int width);

+void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,

+                       int width);

+void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width);

 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width);

 void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width);

@@ -771,20 +826,23 @@

 void ARGBMirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width);

 void ARGBMirrorRow_Any_NEON(const uint8* src, uint8* dst, int width);

-void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);

-void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);

-void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);

-void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);

-void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,

-                           int pix);

+void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width);

+void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,

+                     int width);

+void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,

+                     int width);

+void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,

+                     int width);

+void SplitUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,

+                      int width);

 void SplitUVRow_Any_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,

-                         int pix);

+                         int width);

 void SplitUVRow_Any_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,

-                         int pix);

+                         int width);

 void SplitUVRow_Any_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,

-                         int pix);

-void SplitUVRow_Any_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,

-                               int pix);

+                         int width);

+void SplitUVRow_Any_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,

+                          int width);

 void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,

                   int width);

@@ -816,10 +874,26 @@

 void ARGBCopyAlphaRow_C(const uint8* src_argb, uint8* dst_argb, int width);

 void ARGBCopyAlphaRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);

 void ARGBCopyAlphaRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);

+void ARGBCopyAlphaRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,

+                               int width);

+void ARGBCopyAlphaRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,

+                               int width);

+void ARGBExtractAlphaRow_C(const uint8* src_argb, uint8* dst_a, int width);

+void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width);

+void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width);

+void ARGBExtractAlphaRow_Any_SSE2(const uint8* src_argb, uint8* dst_a,

+                                  int width);

+void ARGBExtractAlphaRow_Any_NEON(const uint8* src_argb, uint8* dst_a,

+                                  int width);

 void ARGBCopyYToAlphaRow_C(const uint8* src_y, uint8* dst_argb, int width);

 void ARGBCopyYToAlphaRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);

 void ARGBCopyYToAlphaRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);

+void ARGBCopyYToAlphaRow_Any_SSE2(const uint8* src_y, uint8* dst_argb,

+                                  int width);

+void ARGBCopyYToAlphaRow_Any_AVX2(const uint8* src_y, uint8* dst_argb,

+                                  int width);

 void SetRow_C(uint8* dst, uint8 v8, int count);

 void SetRow_X86(uint8* dst, uint8 v8, int count);

@@ -835,524 +909,541 @@

 // ARGBShufflers for BGRAToARGB etc.

 void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,

-                      const uint8* shuffler, int pix);

+                      const uint8* shuffler, int width);

 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,

-                         const uint8* shuffler, int pix);

+                         const uint8* shuffler, int width);

 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,

-                          const uint8* shuffler, int pix);

+                          const uint8* shuffler, int width);

 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,

-                         const uint8* shuffler, int pix);

+                         const uint8* shuffler, int width);

 void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,

-                         const uint8* shuffler, int pix);

+                         const uint8* shuffler, int width);

 void ARGBShuffleRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,

-                             const uint8* shuffler, int pix);

+                             const uint8* shuffler, int width);

 void ARGBShuffleRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb,

-                              const uint8* shuffler, int pix);

+                              const uint8* shuffler, int width);

 void ARGBShuffleRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,

-                             const uint8* shuffler, int pix);

+                             const uint8* shuffler, int width);

 void ARGBShuffleRow_Any_NEON(const uint8* src_argb, uint8* dst_argb,

-                             const uint8* shuffler, int pix);

+                             const uint8* shuffler, int width);

-void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix);

-void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix);

-void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, int pix);

+void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width);

+void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width);

+void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width);

+void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, int width);

 void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,

-                            int pix);

+                            int width);

 void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,

-                            int pix);

-void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, int pix);

+                            int width);

+void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, int width);

 void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,

-                            int pix);

+                            int width);

 void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,

-                            int pix);

+                            int width);

-void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix);

-void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix);

-void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix);

+void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width);

+void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width);

+void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width);

+void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width);

 void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,

-                            int pix);

+                            int width);

 void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,

-                            int pix);

-void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int pix);

-void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix);

-void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix);

-void ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix);

-void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix);

-void RGB24ToARGBRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix);

-void RAWToARGBRow_Any_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix);

+                            int width);

+void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width);

+void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width);

+void RAWToRGB24Row_C(const uint8* src_raw, uint8* dst_rgb24, int width);

+void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width);

+void ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int width);

+void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int width);

+void RGB24ToARGBRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_argb,

+                              int width);

+void RAWToARGBRow_Any_SSSE3(const uint8* src_raw, uint8* dst_argb, int width);

+void RAWToRGB24Row_Any_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width);

 void RGB565ToARGBRow_Any_SSE2(const uint8* src_rgb565, uint8* dst_argb,

-                              int pix);

+                              int width);

 void ARGB1555ToARGBRow_Any_SSE2(const uint8* src_argb1555, uint8* dst_argb,

-                                int pix);

+                                int width);

 void ARGB4444ToARGBRow_Any_SSE2(const uint8* src_argb4444, uint8* dst_argb,

-                                int pix);

+                                int width);

 void RGB565ToARGBRow_Any_AVX2(const uint8* src_rgb565, uint8* dst_argb,

-                              int pix);

+                              int width);

 void ARGB1555ToARGBRow_Any_AVX2(const uint8* src_argb1555, uint8* dst_argb,

-                                int pix);

+                                int width);

 void ARGB4444ToARGBRow_Any_AVX2(const uint8* src_argb4444, uint8* dst_argb,

-                                int pix);

+                                int width);

-void RGB24ToARGBRow_Any_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix);

-void RAWToARGBRow_Any_NEON(const uint8* src_raw, uint8* dst_argb, int pix);

+void RGB24ToARGBRow_Any_NEON(const uint8* src_rgb24, uint8* dst_argb,

+                             int width);

+void RAWToARGBRow_Any_NEON(const uint8* src_raw, uint8* dst_argb, int width);

+void RAWToRGB24Row_Any_NEON(const uint8* src_raw, uint8* dst_rgb24, int width);

 void RGB565ToARGBRow_Any_NEON(const uint8* src_rgb565, uint8* dst_argb,

-                              int pix);

+                              int width);

 void ARGB1555ToARGBRow_Any_NEON(const uint8* src_argb1555, uint8* dst_argb,

-                                int pix);

+                                int width);

 void ARGB4444ToARGBRow_Any_NEON(const uint8* src_argb4444, uint8* dst_argb,

-                                int pix);

+                                int width);

-void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);

-void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);

-void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);

-void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);

-void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);

+void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);

+void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);

+void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);

+void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);

+void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);

 void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,

-                             const uint32 dither4, int pix);

+                             const uint32 dither4, int width);

 void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,

-                                const uint32 dither4, int pix);

+                                const uint32 dither4, int width);

 void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,

-                                const uint32 dither4, int pix);

+                                const uint32 dither4, int width);

-void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);

-void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);

-void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);

+void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);

+void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);

+void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);

-void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);

-void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);

-void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);

-void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);

-void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);

+void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width);

+void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int width);

+void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width);

+void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width);

+void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width);

 void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,

                                 const uint32 dither4, int width);

-void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int pix);

-void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);

-void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int pix);

-void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);

-void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);

-void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);

+void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int width);

+void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width);

+void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width);

+void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width);

+void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width);

+void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width);

-void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);

-void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix);

-void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int pix);

-void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);

-void J400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int pix);

-void J400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int pix);

-void J400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int pix);

+void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);

+void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);

+void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width);

+void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width);

+void J400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width);

+void J400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width);

+void J400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width);

 void I444ToARGBRow_C(const uint8* src_y,

                      const uint8* src_u,

                      const uint8* src_v,

                      uint8* dst_argb,

+                     const struct YuvConstants* yuvconstants,

                      int width);

 void I422ToARGBRow_C(const uint8* src_y,

                      const uint8* src_u,

                      const uint8* src_v,

                      uint8* dst_argb,

+                     const struct YuvConstants* yuvconstants,

                      int width);

+void I422ToARGBRow_C(const uint8* src_y,

+                     const uint8* src_u,

+                     const uint8* src_v,

+                     uint8* dst_argb,

+                     const struct YuvConstants* yuvconstants,

+                     int width);

+void I422AlphaToARGBRow_C(const uint8* y_buf,

+                          const uint8* u_buf,

+                          const uint8* v_buf,

+                          const uint8* a_buf,

+                          uint8* dst_argb,

+                          const struct YuvConstants* yuvconstants,

+                          int width);

 void I411ToARGBRow_C(const uint8* src_y,

                      const uint8* src_u,

                      const uint8* src_v,

                      uint8* dst_argb,

+                     const struct YuvConstants* yuvconstants,

                      int width);

 void NV12ToARGBRow_C(const uint8* src_y,

                      const uint8* src_uv,

                      uint8* dst_argb,

+                     const struct YuvConstants* yuvconstants,

                      int width);

-void NV21ToRGB565Row_C(const uint8* src_y,

-                       const uint8* src_vu,

-                       uint8* dst_argb,

-                       int width);

 void NV12ToRGB565Row_C(const uint8* src_y,

                        const uint8* src_uv,

                        uint8* dst_argb,

+                       const struct YuvConstants* yuvconstants,

                        int width);

 void NV21ToARGBRow_C(const uint8* src_y,

-                     const uint8* src_vu,

+                     const uint8* src_uv,

                      uint8* dst_argb,

+                     const struct YuvConstants* yuvconstants,

                      int width);

 void YUY2ToARGBRow_C(const uint8* src_yuy2,

                      uint8* dst_argb,

+                     const struct YuvConstants* yuvconstants,

                      int width);

 void UYVYToARGBRow_C(const uint8* src_uyvy,

                      uint8* dst_argb,

+                     const struct YuvConstants* yuvconstants,

                      int width);

-void J422ToARGBRow_C(const uint8* src_y,

-                     const uint8* src_u,

-                     const uint8* src_v,

-                     uint8* dst_argb,

-                     int width);

-void I422ToBGRARow_C(const uint8* src_y,

-                     const uint8* src_u,

-                     const uint8* src_v,

-                     uint8* dst_bgra,

-                     int width);

-void I422ToABGRRow_C(const uint8* src_y,

-                     const uint8* src_u,

-                     const uint8* src_v,

-                     uint8* dst_abgr,

-                     int width);

 void I422ToRGBARow_C(const uint8* src_y,

                      const uint8* src_u,

                      const uint8* src_v,

                      uint8* dst_rgba,

+                     const struct YuvConstants* yuvconstants,

                      int width);

 void I422ToRGB24Row_C(const uint8* src_y,

                       const uint8* src_u,

                       const uint8* src_v,

                       uint8* dst_rgb24,

+                      const struct YuvConstants* yuvconstants,

                       int width);

-void I422ToRAWRow_C(const uint8* src_y,

-                    const uint8* src_u,

-                    const uint8* src_v,

-                    uint8* dst_raw,

-                    int width);

 void I422ToARGB4444Row_C(const uint8* src_y,

                          const uint8* src_u,

                          const uint8* src_v,

                          uint8* dst_argb4444,

+                         const struct YuvConstants* yuvconstants,

                          int width);

 void I422ToARGB1555Row_C(const uint8* src_y,

                          const uint8* src_u,

                          const uint8* src_v,

                          uint8* dst_argb4444,

+                         const struct YuvConstants* yuvconstants,

                          int width);

 void I422ToRGB565Row_C(const uint8* src_y,

                        const uint8* src_u,

                        const uint8* src_v,

                        uint8* dst_rgb565,

+                       const struct YuvConstants* yuvconstants,

                        int width);

 void I422ToARGBRow_AVX2(const uint8* src_y,

                         const uint8* src_u,

                         const uint8* src_v,

                         uint8* dst_argb,

+                        const struct YuvConstants* yuvconstants,

                         int width);

-void I422ToBGRARow_AVX2(const uint8* src_y,

+void I422ToARGBRow_AVX2(const uint8* src_y,

                         const uint8* src_u,

                         const uint8* src_v,

                         uint8* dst_argb,

+                        const struct YuvConstants* yuvconstants,

                         int width);

 void I422ToRGBARow_AVX2(const uint8* src_y,

                         const uint8* src_u,

                         const uint8* src_v,

                         uint8* dst_argb,

+                        const struct YuvConstants* yuvconstants,

                         int width);

-void I422ToABGRRow_AVX2(const uint8* src_y,

+void I444ToARGBRow_SSSE3(const uint8* src_y,

+                         const uint8* src_u,

+                         const uint8* src_v,

+                         uint8* dst_argb,

+                         const struct YuvConstants* yuvconstants,

+                         int width);

+void I444ToARGBRow_AVX2(const uint8* src_y,

                         const uint8* src_u,

                         const uint8* src_v,

                         uint8* dst_argb,

+                        const struct YuvConstants* yuvconstants,

                         int width);

 void I444ToARGBRow_SSSE3(const uint8* src_y,

                          const uint8* src_u,

                          const uint8* src_v,

                          uint8* dst_argb,

+                         const struct YuvConstants* yuvconstants,

                          int width);

 void I444ToARGBRow_AVX2(const uint8* src_y,

                         const uint8* src_u,

                         const uint8* src_v,

                         uint8* dst_argb,

+                        const struct YuvConstants* yuvconstants,

                         int width);

 void I422ToARGBRow_SSSE3(const uint8* src_y,

                          const uint8* src_u,

                          const uint8* src_v,

                          uint8* dst_argb,

+                         const struct YuvConstants* yuvconstants,

                          int width);

+void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,

+                              const uint8* u_buf,

+                              const uint8* v_buf,

+                              const uint8* a_buf,

+                              uint8* dst_argb,

+                              const struct YuvConstants* yuvconstants,

+                              int width);

+void I422AlphaToARGBRow_AVX2(const uint8* y_buf,

+                             const uint8* u_buf,

+                             const uint8* v_buf,

+                             const uint8* a_buf,

+                             uint8* dst_argb,

+                             const struct YuvConstants* yuvconstants,

+                             int width);

+void I422ToARGBRow_SSSE3(const uint8* src_y,

+                         const uint8* src_u,

+                         const uint8* src_v,

+                         uint8* dst_argb,

+                         const struct YuvConstants* yuvconstants,

+                         int width);

 void I411ToARGBRow_SSSE3(const uint8* src_y,

                          const uint8* src_u,

                          const uint8* src_v,

                          uint8* dst_argb,

+                         const struct YuvConstants* yuvconstants,

                          int width);

 void I411ToARGBRow_AVX2(const uint8* src_y,

                         const uint8* src_u,

                         const uint8* src_v,

                         uint8* dst_argb,

+                        const struct YuvConstants* yuvconstants,

                         int width);

 void NV12ToARGBRow_SSSE3(const uint8* src_y,

                          const uint8* src_uv,

                          uint8* dst_argb,

+                         const struct YuvConstants* yuvconstants,

                          int width);

-void NV21ToARGBRow_SSSE3(const uint8* src_y,

-                         const uint8* src_vu,

-                         uint8* dst_argb,

-                         int width);

 void NV12ToARGBRow_AVX2(const uint8* src_y,

                         const uint8* src_uv,

                         uint8* dst_argb,

+                        const struct YuvConstants* yuvconstants,

                         int width);

-void NV21ToARGBRow_AVX2(const uint8* src_y,

-                        const uint8* src_vu,

-                        uint8* dst_argb,

-                        int width);

 void NV12ToRGB565Row_SSSE3(const uint8* src_y,

                            const uint8* src_uv,

                            uint8* dst_argb,

+                           const struct YuvConstants* yuvconstants,

                            int width);

-void NV21ToRGB565Row_SSSE3(const uint8* src_y,

-                           const uint8* src_vu,

-                           uint8* dst_argb,

-                           int width);

 void NV12ToRGB565Row_AVX2(const uint8* src_y,

                           const uint8* src_uv,

                           uint8* dst_argb,

+                          const struct YuvConstants* yuvconstants,

                           int width);

-void NV21ToRGB565Row_AVX2(const uint8* src_y,

-                          const uint8* src_vu,

-                          uint8* dst_argb,

-                          int width);

+void NV21ToARGBRow_SSSE3(const uint8* src_y,

+                         const uint8* src_uv,

+                         uint8* dst_argb,

+                         const struct YuvConstants* yuvconstants,

+                         int width);

+void NV21ToARGBRow_AVX2(const uint8* src_y,

+                        const uint8* src_uv,

+                        uint8* dst_argb,

+                        const struct YuvConstants* yuvconstants,

+                        int width);

 void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,

                          uint8* dst_argb,

+                         const struct YuvConstants* yuvconstants,

                          int width);

 void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,

                          uint8* dst_argb,

+                         const struct YuvConstants* yuvconstants,

                          int width);

 void YUY2ToARGBRow_AVX2(const uint8* src_yuy2,

                         uint8* dst_argb,

+                        const struct YuvConstants* yuvconstants,

                         int width);

 void UYVYToARGBRow_AVX2(const uint8* src_uyvy,

                         uint8* dst_argb,

+                        const struct YuvConstants* yuvconstants,

                         int width);

-void J422ToARGBRow_SSSE3(const uint8* src_y,

-                         const uint8* src_u,

-                         const uint8* src_v,

-                         uint8* dst_argb,

-                         int width);

-void J422ToARGBRow_AVX2(const uint8* src_y,

-                        const uint8* src_u,

-                        const uint8* src_v,

-                        uint8* dst_argb,

-                        int width);

-void I422ToBGRARow_SSSE3(const uint8* src_y,

-                         const uint8* src_u,

-                         const uint8* src_v,

-                         uint8* dst_bgra,

-                         int width);

-void I422ToABGRRow_SSSE3(const uint8* src_y,

-                         const uint8* src_u,

-                         const uint8* src_v,

-                         uint8* dst_abgr,

-                         int width);

 void I422ToRGBARow_SSSE3(const uint8* src_y,

                          const uint8* src_u,

                          const uint8* src_v,

                          uint8* dst_rgba,

+                         const struct YuvConstants* yuvconstants,

                          int width);

 void I422ToARGB4444Row_SSSE3(const uint8* src_y,

                              const uint8* src_u,

                              const uint8* src_v,

                              uint8* dst_argb,

+                             const struct YuvConstants* yuvconstants,

                              int width);

 void I422ToARGB4444Row_AVX2(const uint8* src_y,

                             const uint8* src_u,

                             const uint8* src_v,

                             uint8* dst_argb,

+                            const struct YuvConstants* yuvconstants,

                             int width);

 void I422ToARGB1555Row_SSSE3(const uint8* src_y,

                              const uint8* src_u,

                              const uint8* src_v,

                              uint8* dst_argb,

+                             const struct YuvConstants* yuvconstants,

                              int width);

 void I422ToARGB1555Row_AVX2(const uint8* src_y,

                             const uint8* src_u,

                             const uint8* src_v,

                             uint8* dst_argb,

+                            const struct YuvConstants* yuvconstants,

                             int width);

 void I422ToRGB565Row_SSSE3(const uint8* src_y,

                            const uint8* src_u,

                            const uint8* src_v,

                            uint8* dst_argb,

+                           const struct YuvConstants* yuvconstants,

                            int width);

 void I422ToRGB565Row_AVX2(const uint8* src_y,

                           const uint8* src_u,

                           const uint8* src_v,

                           uint8* dst_argb,

+                          const struct YuvConstants* yuvconstants,

                           int width);

 void I422ToRGB24Row_SSSE3(const uint8* src_y,

                           const uint8* src_u,

                           const uint8* src_v,

                           uint8* dst_rgb24,

+                          const struct YuvConstants* yuvconstants,

                           int width);

 void I422ToRGB24Row_AVX2(const uint8* src_y,

                          const uint8* src_u,

                          const uint8* src_v,

                          uint8* dst_rgb24,

+                         const struct YuvConstants* yuvconstants,

                          int width);

-void I422ToRAWRow_SSSE3(const uint8* src_y,

-                        const uint8* src_u,

-                        const uint8* src_v,

-                        uint8* dst_raw,

-                        int width);

-void I422ToRAWRow_AVX2(const uint8* src_y,

-                       const uint8* src_u,

-                       const uint8* src_v,

-                       uint8* dst_raw,

-                       int width);

 void I422ToARGBRow_Any_AVX2(const uint8* src_y,

                             const uint8* src_u,

                             const uint8* src_v,

                             uint8* dst_argb,

+                            const struct YuvConstants* yuvconstants,

                             int width);

-void I422ToBGRARow_Any_AVX2(const uint8* src_y,

-                            const uint8* src_u,

-                            const uint8* src_v,

-                            uint8* dst_argb,

-                            int width);

 void I422ToRGBARow_Any_AVX2(const uint8* src_y,

                             const uint8* src_u,

                             const uint8* src_v,

                             uint8* dst_argb,

+                            const struct YuvConstants* yuvconstants,

                             int width);

-void I422ToABGRRow_Any_AVX2(const uint8* src_y,

-                            const uint8* src_u,

-                            const uint8* src_v,

-                            uint8* dst_argb,

-                            int width);

 void I444ToARGBRow_Any_SSSE3(const uint8* src_y,

                              const uint8* src_u,

                              const uint8* src_v,

                              uint8* dst_argb,

+                             const struct YuvConstants* yuvconstants,

                              int width);

 void I444ToARGBRow_Any_AVX2(const uint8* src_y,

                             const uint8* src_u,

                             const uint8* src_v,

                             uint8* dst_argb,

+                            const struct YuvConstants* yuvconstants,

                             int width);

 void I422ToARGBRow_Any_SSSE3(const uint8* src_y,

                              const uint8* src_u,

                              const uint8* src_v,

                              uint8* dst_argb,

+                             const struct YuvConstants* yuvconstants,

                              int width);

+void I422AlphaToARGBRow_Any_SSSE3(const uint8* y_buf,

+                                  const uint8* u_buf,

+                                  const uint8* v_buf,

+                                  const uint8* a_buf,

+                                  uint8* dst_argb,

+                                  const struct YuvConstants* yuvconstants,

+                                  int width);

+void I422AlphaToARGBRow_Any_AVX2(const uint8* y_buf,

+                                 const uint8* u_buf,

+                                 const uint8* v_buf,

+                                 const uint8* a_buf,

+                                 uint8* dst_argb,

+                                 const struct YuvConstants* yuvconstants,

+                                 int width);

 void I411ToARGBRow_Any_SSSE3(const uint8* src_y,

                              const uint8* src_u,

                              const uint8* src_v,

                              uint8* dst_argb,

+                             const struct YuvConstants* yuvconstants,

                              int width);

 void I411ToARGBRow_Any_AVX2(const uint8* src_y,

                             const uint8* src_u,

                             const uint8* src_v,

                             uint8* dst_argb,

+                            const struct YuvConstants* yuvconstants,

                             int width);

 void NV12ToARGBRow_Any_SSSE3(const uint8* src_y,

                              const uint8* src_uv,

                              uint8* dst_argb,

+                             const struct YuvConstants* yuvconstants,

                              int width);

-void NV21ToARGBRow_Any_SSSE3(const uint8* src_y,

-                             const uint8* src_vu,

-                             uint8* dst_argb,

-                             int width);

 void NV12ToARGBRow_Any_AVX2(const uint8* src_y,

                             const uint8* src_uv,

                             uint8* dst_argb,

+                            const struct YuvConstants* yuvconstants,

                             int width);

+void NV21ToARGBRow_Any_SSSE3(const uint8* src_y,

+                             const uint8* src_vu,

+                             uint8* dst_argb,

+                             const struct YuvConstants* yuvconstants,

+                             int width);

 void NV21ToARGBRow_Any_AVX2(const uint8* src_y,

                             const uint8* src_vu,

                             uint8* dst_argb,

+                            const struct YuvConstants* yuvconstants,

                             int width);

 void NV12ToRGB565Row_Any_SSSE3(const uint8* src_y,

                                const uint8* src_uv,

                                uint8* dst_argb,

+                               const struct YuvConstants* yuvconstants,

                                int width);

-void NV21ToRGB565Row_Any_SSSE3(const uint8* src_y,

-                               const uint8* src_vu,

-                               uint8* dst_argb,

-                               int width);

 void NV12ToRGB565Row_Any_AVX2(const uint8* src_y,

                               const uint8* src_uv,

                               uint8* dst_argb,

+                              const struct YuvConstants* yuvconstants,

                               int width);

-void NV21ToRGB565Row_Any_AVX2(const uint8* src_y,

-                              const uint8* src_vu,

-                              uint8* dst_argb,

-                              int width);

 void YUY2ToARGBRow_Any_SSSE3(const uint8* src_yuy2,

                              uint8* dst_argb,

+                             const struct YuvConstants* yuvconstants,

                              int width);

 void UYVYToARGBRow_Any_SSSE3(const uint8* src_uyvy,

                              uint8* dst_argb,

+                             const struct YuvConstants* yuvconstants,

                              int width);

 void YUY2ToARGBRow_Any_AVX2(const uint8* src_yuy2,

                             uint8* dst_argb,

+                            const struct YuvConstants* yuvconstants,

                             int width);

 void UYVYToARGBRow_Any_AVX2(const uint8* src_uyvy,

                             uint8* dst_argb,

+                            const struct YuvConstants* yuvconstants,

                             int width);

-void J422ToARGBRow_Any_SSSE3(const uint8* src_y,

-                             const uint8* src_u,

-                             const uint8* src_v,

-                             uint8* dst_argb,

-                             int width);

-void J422ToARGBRow_Any_AVX2(const uint8* src_y,

-                            const uint8* src_u,

-                            const uint8* src_v,

-                            uint8* dst_argb,

-                            int width);

-void I422ToBGRARow_Any_SSSE3(const uint8* src_y,

-                             const uint8* src_u,

-                             const uint8* src_v,

-                             uint8* dst_bgra,

-                             int width);

-void I422ToABGRRow_Any_SSSE3(const uint8* src_y,

-                             const uint8* src_u,

-                             const uint8* src_v,

-                             uint8* dst_abgr,

-                             int width);

 void I422ToRGBARow_Any_SSSE3(const uint8* src_y,

                              const uint8* src_u,

                              const uint8* src_v,

                              uint8* dst_rgba,

+                             const struct YuvConstants* yuvconstants,

                              int width);

 void I422ToARGB4444Row_Any_SSSE3(const uint8* src_y,

                                  const uint8* src_u,

                                  const uint8* src_v,

                                  uint8* dst_rgba,

+                                 const struct YuvConstants* yuvconstants,

                                  int width);

 void I422ToARGB4444Row_Any_AVX2(const uint8* src_y,

                                 const uint8* src_u,

                                 const uint8* src_v,

                                 uint8* dst_rgba,

+                                const struct YuvConstants* yuvconstants,

                                 int width);

 void I422ToARGB1555Row_Any_SSSE3(const uint8* src_y,

                                  const uint8* src_u,

                                  const uint8* src_v,

                                  uint8* dst_rgba,

+                                 const struct YuvConstants* yuvconstants,

                                  int width);

 void I422ToARGB1555Row_Any_AVX2(const uint8* src_y,

                                 const uint8* src_u,

                                 const uint8* src_v,

                                 uint8* dst_rgba,

+                                const struct YuvConstants* yuvconstants,

                                 int width);

 void I422ToRGB565Row_Any_SSSE3(const uint8* src_y,

                                const uint8* src_u,

                                const uint8* src_v,

                                uint8* dst_rgba,

+                               const struct YuvConstants* yuvconstants,

                                int width);

 void I422ToRGB565Row_Any_AVX2(const uint8* src_y,

                               const uint8* src_u,

                               const uint8* src_v,

                               uint8* dst_rgba,

+                              const struct YuvConstants* yuvconstants,

                               int width);

 void I422ToRGB24Row_Any_SSSE3(const uint8* src_y,

                               const uint8* src_u,

                               const uint8* src_v,

                               uint8* dst_argb,

+                              const struct YuvConstants* yuvconstants,

                               int width);

 void I422ToRGB24Row_Any_AVX2(const uint8* src_y,

                              const uint8* src_u,

                              const uint8* src_v,

                              uint8* dst_argb,

+                             const struct YuvConstants* yuvconstants,

                              int width);

-void I422ToRAWRow_Any_SSSE3(const uint8* src_y,

-                            const uint8* src_u,

-                            const uint8* src_v,

-                            uint8* dst_argb,

-                            int width);

-void I422ToRAWRow_Any_AVX2(const uint8* src_y,

-                           const uint8* src_u,

-                           const uint8* src_v,

-                           uint8* dst_argb,

-                           int width);

 void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width);

 void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);

@@ -1365,13 +1456,23 @@

 // ARGB preattenuated alpha blend.

 void ARGBBlendRow_SSSE3(const uint8* src_argb, const uint8* src_argb1,

                         uint8* dst_argb, int width);

-void ARGBBlendRow_SSE2(const uint8* src_argb, const uint8* src_argb1,

-                       uint8* dst_argb, int width);

 void ARGBBlendRow_NEON(const uint8* src_argb, const uint8* src_argb1,

                        uint8* dst_argb, int width);

 void ARGBBlendRow_C(const uint8* src_argb, const uint8* src_argb1,

                     uint8* dst_argb, int width);

+// Unattenuated planar alpha blend.

+void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,

+                         const uint8* alpha, uint8* dst, int width);

+void BlendPlaneRow_Any_SSSE3(const uint8* src0, const uint8* src1,

+                             const uint8* alpha, uint8* dst, int width);

+void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,

+                        const uint8* alpha, uint8* dst, int width);

+void BlendPlaneRow_Any_AVX2(const uint8* src0, const uint8* src1,

+                            const uint8* alpha, uint8* dst, int width);

+void BlendPlaneRow_C(const uint8* src0, const uint8* src1,

+                     const uint8* alpha, uint8* dst, int width);

 // ARGB multiply images. Same API as Blend, but these require

 // pointer and width alignment for SSE2.

 void ARGBMultiplyRow_C(const uint8* src_argb, const uint8* src_argb1,

@@ -1422,26 +1523,32 @@

 void ARGBSubtractRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,

                               uint8* dst_argb, int width);

-void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);

-void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);

-void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);

-void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);

-void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);

+void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);

+void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);

+void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);

+void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb,

+                                int width);

+void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb,

+                                int width);

 void ARGBToRGB565DitherRow_Any_SSE2(const uint8* src_argb, uint8* dst_rgb,

-                                    const uint32 dither4, int pix);

+                                    const uint32 dither4, int width);

 void ARGBToRGB565DitherRow_Any_AVX2(const uint8* src_argb, uint8* dst_rgb,

-                                    const uint32 dither4, int pix);

+                                    const uint32 dither4, int width);

-void ARGBToRGB565Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);

-void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);

-void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);

+void ARGBToRGB565Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);

+void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb,

+                                int width);

+void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb,

+                                int width);

-void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);

-void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);

-void ARGBToRGB565Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);

-void ARGBToARGB1555Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);

-void ARGBToARGB4444Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);

+void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width);

+void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width);

+void ARGBToRGB565Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width);

+void ARGBToARGB1555Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb,

+                                int width);

+void ARGBToARGB4444Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb,

+                                int width);

 void ARGBToRGB565DitherRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb,

                                     const uint32 dither4, int width);

@@ -1449,186 +1556,169 @@

                             const uint8* src_u,

                             const uint8* src_v,

                             uint8* dst_argb,

+                            const struct YuvConstants* yuvconstants,

                             int width);

 void I422ToARGBRow_Any_NEON(const uint8* src_y,

                             const uint8* src_u,

                             const uint8* src_v,

                             uint8* dst_argb,

+                            const struct YuvConstants* yuvconstants,

                             int width);

+void I422AlphaToARGBRow_Any_NEON(const uint8* src_y,

+                                 const uint8* src_u,

+                                 const uint8* src_v,

+                                 const uint8* src_a,

+                                 uint8* dst_argb,

+                                 const struct YuvConstants* yuvconstants,

+                                 int width);

 void I411ToARGBRow_Any_NEON(const uint8* src_y,

                             const uint8* src_u,

                             const uint8* src_v,

                             uint8* dst_argb,

+                            const struct YuvConstants* yuvconstants,

                             int width);

-void I422ToBGRARow_Any_NEON(const uint8* src_y,

-                            const uint8* src_u,

-                            const uint8* src_v,

-                            uint8* dst_argb,

-                            int width);

-void I422ToABGRRow_Any_NEON(const uint8* src_y,

-                            const uint8* src_u,

-                            const uint8* src_v,

-                            uint8* dst_argb,

-                            int width);

 void I422ToRGBARow_Any_NEON(const uint8* src_y,

                             const uint8* src_u,

                             const uint8* src_v,

                             uint8* dst_argb,

+                            const struct YuvConstants* yuvconstants,

                             int width);

 void I422ToRGB24Row_Any_NEON(const uint8* src_y,

                              const uint8* src_u,

                              const uint8* src_v,

                              uint8* dst_argb,

+                             const struct YuvConstants* yuvconstants,

                              int width);

-void I422ToRAWRow_Any_NEON(const uint8* src_y,

-                           const uint8* src_u,

-                           const uint8* src_v,

-                           uint8* dst_argb,

-                           int width);

 void I422ToARGB4444Row_Any_NEON(const uint8* src_y,

                                 const uint8* src_u,

                                 const uint8* src_v,

                                 uint8* dst_argb,

+                                const struct YuvConstants* yuvconstants,

                                 int width);

 void I422ToARGB1555Row_Any_NEON(const uint8* src_y,

                                 const uint8* src_u,

                                 const uint8* src_v,

                                 uint8* dst_argb,

+                                const struct YuvConstants* yuvconstants,

                                 int width);

 void I422ToRGB565Row_Any_NEON(const uint8* src_y,

                               const uint8* src_u,

                               const uint8* src_v,

                               uint8* dst_argb,

+                              const struct YuvConstants* yuvconstants,

                               int width);

 void NV12ToARGBRow_Any_NEON(const uint8* src_y,

                             const uint8* src_uv,

                             uint8* dst_argb,

+                            const struct YuvConstants* yuvconstants,

                             int width);

 void NV21ToARGBRow_Any_NEON(const uint8* src_y,

-                            const uint8* src_uv,

+                            const uint8* src_vu,

                             uint8* dst_argb,

+                            const struct YuvConstants* yuvconstants,

                             int width);

 void NV12ToRGB565Row_Any_NEON(const uint8* src_y,

                               const uint8* src_uv,

                               uint8* dst_argb,

+                              const struct YuvConstants* yuvconstants,

                               int width);

-void NV21ToRGB565Row_Any_NEON(const uint8* src_y,

-                              const uint8* src_uv,

-                              uint8* dst_argb,

-                              int width);

 void YUY2ToARGBRow_Any_NEON(const uint8* src_yuy2,

                             uint8* dst_argb,

+                            const struct YuvConstants* yuvconstants,

                             int width);

 void UYVYToARGBRow_Any_NEON(const uint8* src_uyvy,

                             uint8* dst_argb,

+                            const struct YuvConstants* yuvconstants,

                             int width);

-void I422ToARGBRow_MIPS_DSPR2(const uint8* src_y,

-                              const uint8* src_u,

-                              const uint8* src_v,

-                              uint8* dst_argb,

-                              int width);

-void I422ToBGRARow_MIPS_DSPR2(const uint8* src_y,

-                              const uint8* src_u,

-                              const uint8* src_v,

-                              uint8* dst_argb,

-                              int width);

-void I422ToABGRRow_MIPS_DSPR2(const uint8* src_y,

-                              const uint8* src_u,

-                              const uint8* src_v,

-                              uint8* dst_argb,

-                              int width);

-void I422ToARGBRow_MIPS_DSPR2(const uint8* src_y,

-                              const uint8* src_u,

-                              const uint8* src_v,

-                              uint8* dst_argb,

-                              int width);

-void I422ToBGRARow_MIPS_DSPR2(const uint8* src_y,

-                              const uint8* src_u,

-                              const uint8* src_v,

-                              uint8* dst_argb,

-                              int width);

-void I422ToABGRRow_MIPS_DSPR2(const uint8* src_y,

-                              const uint8* src_u,

-                              const uint8* src_v,

-                              uint8* dst_argb,

-                              int width);

+void I422ToARGBRow_DSPR2(const uint8* src_y,

+                         const uint8* src_u,

+                         const uint8* src_v,

+                         uint8* dst_argb,

+                         const struct YuvConstants* yuvconstants,

+                         int width);

+void I422ToARGBRow_DSPR2(const uint8* src_y,

+                         const uint8* src_u,

+                         const uint8* src_v,

+                         uint8* dst_argb,

+                         const struct YuvConstants* yuvconstants,

+                         int width);

-void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix);

+void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width);

 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,

-                      uint8* dst_u, uint8* dst_v, int pix);

+                      uint8* dst_u, uint8* dst_v, int width);

 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,

-                         uint8* dst_u, uint8* dst_v, int pix);

-void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);

+                         uint8* dst_u, uint8* dst_v, int width);

+void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width);

 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,

-                      uint8* dst_u, uint8* dst_v, int pix);

+                      uint8* dst_u, uint8* dst_v, int width);

 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,

-                         uint8* dst_u, uint8* dst_v, int pix);

-void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix);

+                         uint8* dst_u, uint8* dst_v, int width);

+void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width);

 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,

-                      uint8* dst_u, uint8* dst_v, int pix);

+                      uint8* dst_u, uint8* dst_v, int width);

 void YUY2ToUV422Row_NEON(const uint8* src_yuy2,

-                         uint8* dst_u, uint8* dst_v, int pix);

-void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int pix);

+                         uint8* dst_u, uint8* dst_v, int width);

+void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width);

 void YUY2ToUVRow_C(const uint8* src_yuy2, int stride_yuy2,

-                   uint8* dst_u, uint8* dst_v, int pix);

+                   uint8* dst_u, uint8* dst_v, int width);

 void YUY2ToUV422Row_C(const uint8* src_yuy2,

-                      uint8* dst_u, uint8* dst_v, int pix);

-void YUY2ToYRow_Any_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix);

+                      uint8* dst_u, uint8* dst_v, int width);

+void YUY2ToYRow_Any_AVX2(const uint8* src_yuy2, uint8* dst_y, int width);

 void YUY2ToUVRow_Any_AVX2(const uint8* src_yuy2, int stride_yuy2,

-                          uint8* dst_u, uint8* dst_v, int pix);

+                          uint8* dst_u, uint8* dst_v, int width);

 void YUY2ToUV422Row_Any_AVX2(const uint8* src_yuy2,

-                             uint8* dst_u, uint8* dst_v, int pix);

-void YUY2ToYRow_Any_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);

+                             uint8* dst_u, uint8* dst_v, int width);

+void YUY2ToYRow_Any_SSE2(const uint8* src_yuy2, uint8* dst_y, int width);

 void YUY2ToUVRow_Any_SSE2(const uint8* src_yuy2, int stride_yuy2,

-                          uint8* dst_u, uint8* dst_v, int pix);

+                          uint8* dst_u, uint8* dst_v, int width);

 void YUY2ToUV422Row_Any_SSE2(const uint8* src_yuy2,

-                             uint8* dst_u, uint8* dst_v, int pix);

-void YUY2ToYRow_Any_NEON(const uint8* src_yuy2, uint8* dst_y, int pix);

+                             uint8* dst_u, uint8* dst_v, int width);

+void YUY2ToYRow_Any_NEON(const uint8* src_yuy2, uint8* dst_y, int width);

 void YUY2ToUVRow_Any_NEON(const uint8* src_yuy2, int stride_yuy2,

-                          uint8* dst_u, uint8* dst_v, int pix);

+                          uint8* dst_u, uint8* dst_v, int width);

 void YUY2ToUV422Row_Any_NEON(const uint8* src_yuy2,

-                             uint8* dst_u, uint8* dst_v, int pix);

-void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix);

+                             uint8* dst_u, uint8* dst_v, int width);

+void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width);

 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,

-                      uint8* dst_u, uint8* dst_v, int pix);

+                      uint8* dst_u, uint8* dst_v, int width);

 void UYVYToUV422Row_AVX2(const uint8* src_uyvy,

-                         uint8* dst_u, uint8* dst_v, int pix);

-void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix);

+                         uint8* dst_u, uint8* dst_v, int width);

+void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width);

 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,

-                      uint8* dst_u, uint8* dst_v, int pix);

+                      uint8* dst_u, uint8* dst_v, int width);

 void UYVYToUV422Row_SSE2(const uint8* src_uyvy,

-                         uint8* dst_u, uint8* dst_v, int pix);

-void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix);

+                         uint8* dst_u, uint8* dst_v, int width);

+void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width);

 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,

-                      uint8* dst_u, uint8* dst_v, int pix);

+                      uint8* dst_u, uint8* dst_v, int width);

 void UYVYToUV422Row_AVX2(const uint8* src_uyvy,

-                         uint8* dst_u, uint8* dst_v, int pix);

-void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix);

+                         uint8* dst_u, uint8* dst_v, int width);

+void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width);

 void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,

-                      uint8* dst_u, uint8* dst_v, int pix);

+                      uint8* dst_u, uint8* dst_v, int width);

 void UYVYToUV422Row_NEON(const uint8* src_uyvy,

-                         uint8* dst_u, uint8* dst_v, int pix);

+                         uint8* dst_u, uint8* dst_v, int width);

-void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int pix);

+void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width);

 void UYVYToUVRow_C(const uint8* src_uyvy, int stride_uyvy,

-                   uint8* dst_u, uint8* dst_v, int pix);

+                   uint8* dst_u, uint8* dst_v, int width);

 void UYVYToUV422Row_C(const uint8* src_uyvy,

-                      uint8* dst_u, uint8* dst_v, int pix);

-void UYVYToYRow_Any_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix);

+                      uint8* dst_u, uint8* dst_v, int width);

+void UYVYToYRow_Any_AVX2(const uint8* src_uyvy, uint8* dst_y, int width);

 void UYVYToUVRow_Any_AVX2(const uint8* src_uyvy, int stride_uyvy,

-                          uint8* dst_u, uint8* dst_v, int pix);

+                          uint8* dst_u, uint8* dst_v, int width);

 void UYVYToUV422Row_Any_AVX2(const uint8* src_uyvy,

-                             uint8* dst_u, uint8* dst_v, int pix);

-void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix);

+                             uint8* dst_u, uint8* dst_v, int width);

+void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int width);

 void UYVYToUVRow_Any_SSE2(const uint8* src_uyvy, int stride_uyvy,

-                          uint8* dst_u, uint8* dst_v, int pix);

+                          uint8* dst_u, uint8* dst_v, int width);

 void UYVYToUV422Row_Any_SSE2(const uint8* src_uyvy,

-                             uint8* dst_u, uint8* dst_v, int pix);

-void UYVYToYRow_Any_NEON(const uint8* src_uyvy, uint8* dst_y, int pix);

+                             uint8* dst_u, uint8* dst_v, int width);

+void UYVYToYRow_Any_NEON(const uint8* src_uyvy, uint8* dst_y, int width);

 void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy,

-                          uint8* dst_u, uint8* dst_v, int pix);

+                          uint8* dst_u, uint8* dst_v, int width);

 void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy,

-                             uint8* dst_u, uint8* dst_v, int pix);

+                             uint8* dst_u, uint8* dst_v, int width);

 void I422ToYUY2Row_C(const uint8* src_y,

                      const uint8* src_u,

@@ -1673,7 +1763,6 @@

 // Effects related row functions.

 void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);

-void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);

 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);

 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);

 void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width);

@@ -1753,9 +1842,6 @@

 void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,

                       ptrdiff_t src_stride_ptr,

                       int width, int source_y_fraction);

-void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,

-                         ptrdiff_t src_stride_ptr, int width,

-                         int source_y_fraction);

 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,

                           ptrdiff_t src_stride_ptr, int width,

                           int source_y_fraction);

@@ -1765,15 +1851,12 @@

 void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr,

                          ptrdiff_t src_stride_ptr, int width,

                          int source_y_fraction);

-void InterpolateRow_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,

-                               ptrdiff_t src_stride_ptr, int width,

-                               int source_y_fraction);

+void InterpolateRow_DSPR2(uint8* dst_ptr, const uint8* src_ptr,

+                          ptrdiff_t src_stride_ptr, int width,

+                          int source_y_fraction);

 void InterpolateRow_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,

                              ptrdiff_t src_stride_ptr, int width,

                              int source_y_fraction);

-void InterpolateRow_Any_SSE2(uint8* dst_ptr, const uint8* src_ptr,

-                             ptrdiff_t src_stride_ptr, int width,

-                             int source_y_fraction);

 void InterpolateRow_Any_SSSE3(uint8* dst_ptr, const uint8* src_ptr,

                               ptrdiff_t src_stride_ptr, int width,

                               int source_y_fraction);

@@ -1780,9 +1863,9 @@

 void InterpolateRow_Any_AVX2(uint8* dst_ptr, const uint8* src_ptr,

                              ptrdiff_t src_stride_ptr, int width,

                              int source_y_fraction);

-void InterpolateRow_Any_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,

-                                   ptrdiff_t src_stride_ptr, int width,

-                                   int source_y_fraction);

+void InterpolateRow_Any_DSPR2(uint8* dst_ptr, const uint8* src_ptr,

+                              ptrdiff_t src_stride_ptr, int width,

+                              int source_y_fraction);

 void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,

                          ptrdiff_t src_stride_ptr,

--- a/third_party/libyuv/include/libyuv/scale_argb.h

+++ b/third_party/libyuv/include/libyuv/scale_argb.h

@@ -35,7 +35,6 @@

                   int clip_x, int clip_y, int clip_width, int clip_height,

                   enum FilterMode filtering);

-// TODO(fbarchard): Implement this.

 // Scale with YUV conversion to ARGB and clipping.

 LIBYUV_API

 int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,

--- a/third_party/libyuv/include/libyuv/scale_row.h

+++ b/third_party/libyuv/include/libyuv/scale_row.h

@@ -23,7 +23,27 @@

     (defined(__i386__) && !defined(__SSE2__))

 #define LIBYUV_DISABLE_X86

 #endif

+// MemorySanitizer does not support assembly code yet. http://crbug.com/344505

+#if defined(__has_feature)

+#if __has_feature(memory_sanitizer)

+#define LIBYUV_DISABLE_X86

+#endif

+#endif

+// GCC >= 4.7.0 required for AVX2.

+#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))

+#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))

+#define GCC_HAS_AVX2 1

+#endif  // GNUC >= 4.7

+#endif  // __GNUC__

+// clang >= 3.4.0 required for AVX2.

+#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))

+#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4))

+#define CLANG_HAS_AVX2 1

+#endif  // clang >= 3.4

+#endif  // __clang__

 // Visual C 2012 required for AVX2.

 #if defined(_M_IX86) && !defined(__clang__) && \

     defined(_MSC_VER) && _MSC_VER >= 1700

@@ -42,24 +62,23 @@

 #define HAS_SCALEARGBROWDOWNEVEN_SSE2

 #define HAS_SCALECOLSUP2_SSE2

 #define HAS_SCALEFILTERCOLS_SSSE3

-#define HAS_SCALEROWDOWN2_SSE2

+#define HAS_SCALEROWDOWN2_SSSE3

 #define HAS_SCALEROWDOWN34_SSSE3

 #define HAS_SCALEROWDOWN38_SSSE3

-#define HAS_SCALEROWDOWN4_SSE2

+#define HAS_SCALEROWDOWN4_SSSE3

+#define HAS_SCALEADDROW_SSE2

 #endif

-// The following are available on VS2012:

-#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)

+// The following are available on all x86 platforms, but

+// require VS2012, clang 3.4 or gcc 4.7.

+// The code supports NaCl but requires a new compiler and validator.

+#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \

+    defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))

 #define HAS_SCALEADDROW_AVX2

 #define HAS_SCALEROWDOWN2_AVX2

 #define HAS_SCALEROWDOWN4_AVX2

 #endif

-// The following are available on Visual C:

-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && !defined(__clang__)

-#define HAS_SCALEADDROW_SSE2

-#endif

 // The following are available on Neon platforms:

 #if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \

     (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))

@@ -77,10 +96,10 @@

 // The following are available on Mips platforms:

 #if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \

     defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)

-#define HAS_SCALEROWDOWN2_MIPS_DSPR2

-#define HAS_SCALEROWDOWN4_MIPS_DSPR2

-#define HAS_SCALEROWDOWN34_MIPS_DSPR2

-#define HAS_SCALEROWDOWN38_MIPS_DSPR2

+#define HAS_SCALEROWDOWN2_DSPR2

+#define HAS_SCALEROWDOWN4_DSPR2

+#define HAS_SCALEROWDOWN34_DSPR2

+#define HAS_SCALEROWDOWN38_DSPR2

 #endif

 // Scale ARGB vertically with bilinear interpolation.

@@ -133,6 +152,8 @@

                               uint16* dst, int dst_width);

 void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,

                         uint8* dst, int dst_width);

+void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride,

+                            uint8* dst, int dst_width);

 void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,

                            uint16* dst, int dst_width);

 void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,

@@ -214,12 +235,12 @@

                              int dst_width, int x, int dx);

 // Specialized scalers for x86.

-void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,

-                        uint8* dst_ptr, int dst_width);

-void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,

-                              uint8* dst_ptr, int dst_width);

-void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,

-                           uint8* dst_ptr, int dst_width);

+void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,

+                         uint8* dst_ptr, int dst_width);

+void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,

+                               uint8* dst_ptr, int dst_width);

+void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,

+                            uint8* dst_ptr, int dst_width);

 void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,

                         uint8* dst_ptr, int dst_width);

 void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,

@@ -226,10 +247,10 @@

                               uint8* dst_ptr, int dst_width);

 void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,

                            uint8* dst_ptr, int dst_width);

-void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,

-                        uint8* dst_ptr, int dst_width);

-void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,

-                           uint8* dst_ptr, int dst_width);

+void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,

+                         uint8* dst_ptr, int dst_width);

+void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,

+                            uint8* dst_ptr, int dst_width);

 void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,

                         uint8* dst_ptr, int dst_width);

 void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,

@@ -251,22 +272,26 @@

 void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,

                                 ptrdiff_t src_stride,

                                 uint8* dst_ptr, int dst_width);

-void ScaleRowDown2_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,

-                            uint8* dst_ptr, int dst_width);

-void ScaleRowDown2Linear_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,

-                                  uint8* dst_ptr, int dst_width);

-void ScaleRowDown2Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,

-                               uint8* dst_ptr, int dst_width);

+void ScaleRowDown2_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,

+                             uint8* dst_ptr, int dst_width);

+void ScaleRowDown2Linear_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,

+                                   uint8* dst_ptr, int dst_width);

+void ScaleRowDown2Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,

+                                uint8* dst_ptr, int dst_width);

+void ScaleRowDown2Box_Odd_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,

+                                uint8* dst_ptr, int dst_width);

 void ScaleRowDown2_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,

                             uint8* dst_ptr, int dst_width);

 void ScaleRowDown2Linear_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,

                                   uint8* dst_ptr, int dst_width);

 void ScaleRowDown2Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,

-                           uint8* dst_ptr, int dst_width);

-void ScaleRowDown4_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,

-                            uint8* dst_ptr, int dst_width);

-void ScaleRowDown4Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,

                                uint8* dst_ptr, int dst_width);

+void ScaleRowDown2Box_Odd_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,

+                               uint8* dst_ptr, int dst_width);

+void ScaleRowDown4_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,

+                             uint8* dst_ptr, int dst_width);

+void ScaleRowDown4Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,

+                                uint8* dst_ptr, int dst_width);

 void ScaleRowDown4_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,

                             uint8* dst_ptr, int dst_width);

 void ScaleRowDown4Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,

@@ -418,6 +443,8 @@

                                   uint8* dst, int dst_width);

 void ScaleRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,

                                uint8* dst, int dst_width);

+void ScaleRowDown2Box_Odd_NEON(const uint8* src_ptr, ptrdiff_t src_stride,

+                               uint8* dst, int dst_width);

 void ScaleRowDown4_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,

                             uint8* dst_ptr, int dst_width);

 void ScaleRowDown4Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,

@@ -447,28 +474,26 @@

 void ScaleFilterCols_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,

                               int dst_width, int x, int dx);

-void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,

-                              uint8* dst, int dst_width);

-void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,

-                                 uint8* dst, int dst_width);

-void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,

-                              uint8* dst, int dst_width);

-void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,

-                                 uint8* dst, int dst_width);

-void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,

-                               uint8* dst, int dst_width);

-void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,

-                                     uint8* d, int dst_width);

-void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,

-                                     uint8* d, int dst_width);

-void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,

-                               uint8* dst, int dst_width);

-void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,

-                                     uint8* dst_ptr, int dst_width);

-void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,

-                                     ptrdiff_t src_stride,

-                                     uint8* dst_ptr, int dst_width);

+void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,

+                         uint8* dst, int dst_width);

+void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,

+                            uint8* dst, int dst_width);

+void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,

+                         uint8* dst, int dst_width);

+void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,

+                            uint8* dst, int dst_width);

+void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,

+                          uint8* dst, int dst_width);

+void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,

+                                uint8* d, int dst_width);

+void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,

+                                uint8* d, int dst_width);

+void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,

+                          uint8* dst, int dst_width);

+void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,

+                                uint8* dst_ptr, int dst_width);

+void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,

+                                uint8* dst_ptr, int dst_width);

 #ifdef __cplusplus

 }  // extern "C"

--- a/third_party/libyuv/include/libyuv/version.h

+++ b/third_party/libyuv/include/libyuv/version.h

@@ -11,6 +11,6 @@

 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT

 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 1456

+#define LIBYUV_VERSION 1600

 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT

--- a/third_party/libyuv/include/libyuv/video_common.h

+++ b/third_party/libyuv/include/libyuv/video_common.h

@@ -62,7 +62,7 @@

   // 2 Secondary YUV formats: row biplanar.

   FOURCC_M420 = FOURCC('M', '4', '2', '0'),

-  FOURCC_Q420 = FOURCC('Q', '4', '2', '0'), // deprecated.

+  FOURCC_Q420 = FOURCC('Q', '4', '2', '0'),  // deprecated.

   // 9 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp.

   FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),

@@ -90,7 +90,8 @@

   FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),

   FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'),  // Linux version of I420.

   FOURCC_J420 = FOURCC('J', '4', '2', '0'),

-  FOURCC_J400 = FOURCC('J', '4', '0', '0'),

+  FOURCC_J400 = FOURCC('J', '4', '0', '0'),  // unofficial fourcc

+  FOURCC_H420 = FOURCC('H', '4', '2', '0'),  // unofficial fourcc

   // 14 Auxiliary aliases.  CanonicalFourCC() maps these to canonical fourcc.

   FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'),  // Alias for I420.

@@ -150,6 +151,7 @@

   FOURCC_BPP_YU12 = 12,

   FOURCC_BPP_J420 = 12,

   FOURCC_BPP_J400 = 8,

+  FOURCC_BPP_H420 = 12,

   FOURCC_BPP_MJPG = 0,  // 0 means unknown.

   FOURCC_BPP_H264 = 0,

   FOURCC_BPP_IYUV = 12,

--- a/third_party/libyuv/source/compare.cc

+++ b/third_party/libyuv/source/compare.cc

@@ -17,6 +17,7 @@

 #endif

 #include "libyuv/basic_types.h"

+#include "libyuv/compare_row.h"

 #include "libyuv/cpu_id.h"

 #include "libyuv/row.h"

 #include "libyuv/video_common.h"

@@ -27,29 +28,12 @@

 #endif

 // hash seed of 5381 recommended.

-// Internal C version of HashDjb2 with int sized count for efficiency.

-uint32 HashDjb2_C(const uint8* src, int count, uint32 seed);

-// This module is for Visual C x86

-#if !defined(LIBYUV_DISABLE_X86) && \

-    (defined(_M_IX86) || \

-    (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))))

-#define HAS_HASHDJB2_SSE41

-uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed);

-#ifdef VISUALC_HAS_AVX2

-#define HAS_HASHDJB2_AVX2

-uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed);

-#endif

-#endif  // HAS_HASHDJB2_SSE41

-// hash seed of 5381 recommended.

 LIBYUV_API

 uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {

   const int kBlockSize = 1 << 15;  // 32768;

   int remainder;

-  uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C;

+  uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) =

+      HashDjb2_C;

 #if defined(HAS_HASHDJB2_SSE41)

   if (TestCpuFlag(kCpuHasSSE41)) {

     HashDjb2_SSE = HashDjb2_SSE41;

@@ -126,23 +110,6 @@

   return fourcc;

-uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);

-#if !defined(LIBYUV_DISABLE_NEON) && \

-    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))

-#define HAS_SUMSQUAREERROR_NEON

-uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);

-#endif

-#if !defined(LIBYUV_DISABLE_X86) && \

-    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))

-#define HAS_SUMSQUAREERROR_SSE2

-uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);

-#endif

-#ifdef VISUALC_HAS_AVX2

-#define HAS_SUMSQUAREERROR_AVX2

-uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count);

-#endif

 // TODO(fbarchard): Refactor into row function.

 LIBYUV_API

--- a/third_party/libyuv/source/compare_common.cc

+++ b/third_party/libyuv/source/compare_common.cc

@@ -10,6 +10,8 @@

 #include "libyuv/basic_types.h"

+#include "libyuv/compare_row.h"

 #ifdef __cplusplus

 namespace libyuv {

 extern "C" {

--- a/third_party/libyuv/source/compare_gcc.cc

+++ b/third_party/libyuv/source/compare_gcc.cc

@@ -9,6 +9,8 @@

*/

 #include "libyuv/basic_types.h"

+#include "libyuv/compare_row.h"

 #include "libyuv/row.h"

 #ifdef __cplusplus

@@ -16,11 +18,13 @@

 extern "C" {

 #endif

-#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))

+// This module is for GCC x86 and x64.

+#if !defined(LIBYUV_DISABLE_X86) && \

+    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))

 uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {

   uint32 sse;

-  asm volatile (  // NOLINT

+  asm volatile (

     "pxor      %%xmm0,%%xmm0                   \n"

     "pxor      %%xmm5,%%xmm5                   \n"

     LABELALIGN

@@ -54,15 +58,10 @@

     "+r"(count),      // %2

     "=g"(sse)         // %3

   :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"

-  );  // NOLINT

+  );

   return sse;

-#endif  // defined(__x86_64__) || defined(__i386__)

-#if !defined(LIBYUV_DISABLE_X86) && \

-    (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))

-#define HAS_HASHDJB2_SSE41

 static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16

 static uvec32 kHashMul0 = {

   0x0c3525e1,  // 33 ^ 15

@@ -91,7 +90,7 @@

 uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {

   uint32 hash;

-  asm volatile (  // NOLINT

+  asm volatile (

     "movd      %2,%%xmm0                       \n"

     "pxor      %%xmm7,%%xmm7                   \n"

     "movdqa    %4,%%xmm6                       \n"

@@ -140,7 +139,7 @@

     "m"(kHashMul3)    // %8

   : "memory", "cc"

     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"

-  );  // NOLINT

+  );

   return hash;

 #endif  // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))

--- a/third_party/libyuv/source/compare_neon.cc

+++ b/third_party/libyuv/source/compare_neon.cc

@@ -9,6 +9,8 @@

*/

 #include "libyuv/basic_types.h"

+#include "libyuv/compare_row.h"

 #include "libyuv/row.h"

 #ifdef __cplusplus

@@ -27,7 +29,6 @@

     "vmov.u8    q9, #0                         \n"

     "vmov.u8    q11, #0                        \n"

-    ".p2align  2                               \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld1.8     {q0}, [%0]!                    \n"

--- a/third_party/libyuv/source/compare_neon64.cc

+++ b/third_party/libyuv/source/compare_neon64.cc

@@ -9,6 +9,8 @@

*/

 #include "libyuv/basic_types.h"

+#include "libyuv/compare_row.h"

 #include "libyuv/row.h"

 #ifdef __cplusplus

@@ -26,7 +28,6 @@

     "eor        v17.16b, v17.16b, v17.16b      \n"

     "eor        v19.16b, v19.16b, v19.16b      \n"

-    ".p2align  2                               \n"

   "1:                                          \n"

     MEMACCESS(0)

     "ld1        {v0.16b}, [%0], #16            \n"

--- a/third_party/libyuv/source/compare_win.cc

+++ b/third_party/libyuv/source/compare_win.cc

@@ -9,6 +9,8 @@

*/

 #include "libyuv/basic_types.h"

+#include "libyuv/compare_row.h"

 #include "libyuv/row.h"

 #ifdef __cplusplus

@@ -16,9 +18,8 @@

 extern "C" {

 #endif

-// This module is for Visual C x86.

-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \

-    defined(_MSC_VER) && !defined(__clang__)

+// This module is for 32-bit Visual C x86 and clang-cl.

+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

 __declspec(naked)

 uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {

@@ -100,27 +101,26 @@

 #endif  // _MSC_VER >= 1700

-#define HAS_HASHDJB2_SSE41

-static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16

-static uvec32 kHashMul0 = {

+uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16

+uvec32 kHashMul0 = {

   0x0c3525e1,  // 33 ^ 15

   0xa3476dc1,  // 33 ^ 14

   0x3b4039a1,  // 33 ^ 13

   0x4f5f0981,  // 33 ^ 12

};

-static uvec32 kHashMul1 = {

+uvec32 kHashMul1 = {

   0x30f35d61,  // 33 ^ 11

   0x855cb541,  // 33 ^ 10

   0x040a9121,  // 33 ^ 9

   0x747c7101,  // 33 ^ 8

};

-static uvec32 kHashMul2 = {

+uvec32 kHashMul2 = {

   0xec41d4e1,  // 33 ^ 7

   0x4cfa3cc1,  // 33 ^ 6

   0x025528a1,  // 33 ^ 5

   0x00121881,  // 33 ^ 4

};

-static uvec32 kHashMul3 = {

+uvec32 kHashMul3 = {

   0x00008c61,  // 33 ^ 3

   0x00000441,  // 33 ^ 2

   0x00000021,  // 33 ^ 1

@@ -127,14 +127,6 @@

   0x00000001,  // 33 ^ 0

};

-// 27: 66 0F 38 40 C6     pmulld      xmm0,xmm6

-// 44: 66 0F 38 40 DD     pmulld      xmm3,xmm5

-// 59: 66 0F 38 40 E5     pmulld      xmm4,xmm5

-// 72: 66 0F 38 40 D5     pmulld      xmm2,xmm5

-// 83: 66 0F 38 40 CD     pmulld      xmm1,xmm5

-#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \

-    _asm _emit 0x40 _asm _emit reg

 __declspec(naked)

 uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {

   __asm {

@@ -143,30 +135,30 @@

     movd       xmm0, [esp + 12]  // seed

     pxor       xmm7, xmm7        // constant 0 for unpck

-    movdqa     xmm6, kHash16x33

+    movdqa     xmm6, xmmword ptr kHash16x33

   wloop:

     movdqu     xmm1, [eax]       // src[0-15]

     lea        eax, [eax + 16]

-    pmulld(0xc6)                 // pmulld      xmm0,xmm6  hash *= 33 ^ 16

-    movdqa     xmm5, kHashMul0

+    pmulld     xmm0, xmm6        // hash *= 33 ^ 16

+    movdqa     xmm5, xmmword ptr kHashMul0

     movdqa     xmm2, xmm1

     punpcklbw  xmm2, xmm7        // src[0-7]

     movdqa     xmm3, xmm2

     punpcklwd  xmm3, xmm7        // src[0-3]

-    pmulld(0xdd)                 // pmulld     xmm3, xmm5

-    movdqa     xmm5, kHashMul1

+    pmulld     xmm3, xmm5

+    movdqa     xmm5, xmmword ptr kHashMul1

     movdqa     xmm4, xmm2

     punpckhwd  xmm4, xmm7        // src[4-7]

-    pmulld(0xe5)                 // pmulld     xmm4, xmm5

-    movdqa     xmm5, kHashMul2

+    pmulld     xmm4, xmm5

+    movdqa     xmm5, xmmword ptr kHashMul2

     punpckhbw  xmm1, xmm7        // src[8-15]

     movdqa     xmm2, xmm1

     punpcklwd  xmm2, xmm7        // src[8-11]

-    pmulld(0xd5)                 // pmulld     xmm2, xmm5

-    movdqa     xmm5, kHashMul3

+    pmulld     xmm2, xmm5

+    movdqa     xmm5, xmmword ptr kHashMul3

     punpckhwd  xmm1, xmm7        // src[12-15]

-    pmulld(0xcd)                 // pmulld     xmm1, xmm5

+    pmulld     xmm1, xmm5

     paddd      xmm3, xmm4        // add 16 results

     paddd      xmm1, xmm2

     paddd      xmm1, xmm3

@@ -191,36 +183,37 @@

   __asm {

     mov        eax, [esp + 4]    // src

     mov        ecx, [esp + 8]    // count

-    movd       xmm0, [esp + 12]  // seed

-    movdqa     xmm6, kHash16x33

+    vmovd      xmm0, [esp + 12]  // seed

   wloop:

-    vpmovzxbd  xmm3, dword ptr [eax]  // src[0-3]

-    pmulld     xmm0, xmm6  // hash *= 33 ^ 16

-    vpmovzxbd  xmm4, dword ptr [eax + 4]  // src[4-7]

-    pmulld     xmm3, kHashMul0

-    vpmovzxbd  xmm2, dword ptr [eax + 8]  // src[8-11]

-    pmulld     xmm4, kHashMul1

-    vpmovzxbd  xmm1, dword ptr [eax + 12]  // src[12-15]

-    pmulld     xmm2, kHashMul2

+    vpmovzxbd  xmm3, [eax]  // src[0-3]

+    vpmulld    xmm0, xmm0, xmmword ptr kHash16x33  // hash *= 33 ^ 16

+    vpmovzxbd  xmm4, [eax + 4]  // src[4-7]

+    vpmulld    xmm3, xmm3, xmmword ptr kHashMul0

+    vpmovzxbd  xmm2, [eax + 8]  // src[8-11]

+    vpmulld    xmm4, xmm4, xmmword ptr kHashMul1

+    vpmovzxbd  xmm1, [eax + 12]  // src[12-15]

+    vpmulld    xmm2, xmm2, xmmword ptr kHashMul2

     lea        eax, [eax + 16]

-    pmulld     xmm1, kHashMul3

-    paddd      xmm3, xmm4        // add 16 results

-    paddd      xmm1, xmm2

-    paddd      xmm1, xmm3

-    pshufd     xmm2, xmm1, 0x0e  // upper 2 dwords

-    paddd      xmm1, xmm2

-    pshufd     xmm2, xmm1, 0x01

-    paddd      xmm1, xmm2

-    paddd      xmm0, xmm1

+    vpmulld    xmm1, xmm1, xmmword ptr kHashMul3

+    vpaddd     xmm3, xmm3, xmm4        // add 16 results

+    vpaddd     xmm1, xmm1, xmm2

+    vpaddd     xmm1, xmm1, xmm3

+    vpshufd    xmm2, xmm1, 0x0e  // upper 2 dwords

+    vpaddd     xmm1, xmm1,xmm2

+    vpshufd    xmm2, xmm1, 0x01

+    vpaddd     xmm1, xmm1, xmm2

+    vpaddd     xmm0, xmm0, xmm1

     sub        ecx, 16

     jg         wloop

-    movd       eax, xmm0         // return hash

+    vmovd      eax, xmm0         // return hash

+    vzeroupper

ret

 #endif  // _MSC_VER >= 1700

 #endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

 #ifdef __cplusplus

--- a/third_party/libyuv/source/convert.cc

+++ b/third_party/libyuv/source/convert.cc

@@ -245,8 +245,8 @@

   int y;

   int halfwidth = (width + 1) >> 1;

   int halfheight = (height + 1) >> 1;

-  void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =

-      SplitUVRow_C;

+  void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v,

+                     int width) = SplitUVRow_C;

   if (!src_y || !src_uv ||

       !dst_y || !dst_u || !dst_v ||

       width <= 0 || height == 0) {

@@ -303,14 +303,14 @@

 #endif

-#if defined(HAS_SPLITUVROW_MIPS_DSPR2)

-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&

+#if defined(HAS_SPLITUVROW_DSPR2)

+  if (TestCpuFlag(kCpuHasDSPR2) &&

       IS_ALIGNED(src_uv, 4) && IS_ALIGNED(src_stride_uv, 4) &&

       IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) &&

       IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) {

-    SplitUVRow = SplitUVRow_Any_MIPS_DSPR2;

+    SplitUVRow = SplitUVRow_Any_DSPR2;

     if (IS_ALIGNED(halfwidth, 16)) {

-      SplitUVRow = SplitUVRow_MIPS_DSPR2;

+      SplitUVRow = SplitUVRow_DSPR2;

 #endif

@@ -390,9 +390,9 @@

                int width, int height) {

   int y;

   void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2,

-      uint8* dst_u, uint8* dst_v, int pix) = YUY2ToUVRow_C;

+      uint8* dst_u, uint8* dst_v, int width) = YUY2ToUVRow_C;

   void (*YUY2ToYRow)(const uint8* src_yuy2,

-      uint8* dst_y, int pix) = YUY2ToYRow_C;

+      uint8* dst_y, int width) = YUY2ToYRow_C;

   // Negative height means invert the image.

   if (height < 0) {

     height = -height;

@@ -455,9 +455,9 @@

                int width, int height) {

   int y;

   void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy,

-      uint8* dst_u, uint8* dst_v, int pix) = UYVYToUVRow_C;

+      uint8* dst_u, uint8* dst_v, int width) = UYVYToUVRow_C;

   void (*UYVYToYRow)(const uint8* src_uyvy,

-      uint8* dst_y, int pix) = UYVYToYRow_C;

+      uint8* dst_y, int width) = UYVYToYRow_C;

   // Negative height means invert the image.

   if (height < 0) {

     height = -height;

@@ -521,7 +521,7 @@

   int y;

   void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,

       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;

-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =

+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =

       ARGBToYRow_C;

   if (!src_argb ||

       !dst_y || !dst_u || !dst_v ||

@@ -597,7 +597,7 @@

   int y;

   void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra,

       uint8* dst_u, uint8* dst_v, int width) = BGRAToUVRow_C;

-  void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int pix) =

+  void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int width) =

       BGRAToYRow_C;

   if (!src_bgra ||

       !dst_y || !dst_u || !dst_v ||

@@ -663,7 +663,7 @@

   int y;

   void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr,

       uint8* dst_u, uint8* dst_v, int width) = ABGRToUVRow_C;

-  void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int pix) =

+  void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int width) =

       ABGRToYRow_C;

   if (!src_abgr ||

       !dst_y || !dst_u || !dst_v ||

@@ -729,7 +729,7 @@

   int y;

   void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba,

       uint8* dst_u, uint8* dst_v, int width) = RGBAToUVRow_C;

-  void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int pix) =

+  void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int width) =

       RGBAToYRow_C;

   if (!src_rgba ||

       !dst_y || !dst_u || !dst_v ||

@@ -796,14 +796,14 @@

 #if defined(HAS_RGB24TOYROW_NEON)

   void (*RGB24ToUVRow)(const uint8* src_rgb24, int src_stride_rgb24,

       uint8* dst_u, uint8* dst_v, int width) = RGB24ToUVRow_C;

-  void (*RGB24ToYRow)(const uint8* src_rgb24, uint8* dst_y, int pix) =

+  void (*RGB24ToYRow)(const uint8* src_rgb24, uint8* dst_y, int width) =

       RGB24ToYRow_C;

 #else

-  void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =

+  void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =

       RGB24ToARGBRow_C;

   void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,

       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;

-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =

+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =

       ARGBToYRow_C;

 #endif

   if (!src_rgb24 || !dst_y || !dst_u || !dst_v ||

@@ -910,14 +910,14 @@

 #if defined(HAS_RAWTOYROW_NEON)

   void (*RAWToUVRow)(const uint8* src_raw, int src_stride_raw,

       uint8* dst_u, uint8* dst_v, int width) = RAWToUVRow_C;

-  void (*RAWToYRow)(const uint8* src_raw, uint8* dst_y, int pix) =

+  void (*RAWToYRow)(const uint8* src_raw, uint8* dst_y, int width) =

       RAWToYRow_C;

 #else

-  void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =

+  void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =

       RAWToARGBRow_C;

   void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,

       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;

-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =

+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =

       ARGBToYRow_C;

 #endif

   if (!src_raw || !dst_y || !dst_u || !dst_v ||

@@ -1024,14 +1024,14 @@

 #if defined(HAS_RGB565TOYROW_NEON)

   void (*RGB565ToUVRow)(const uint8* src_rgb565, int src_stride_rgb565,

       uint8* dst_u, uint8* dst_v, int width) = RGB565ToUVRow_C;

-  void (*RGB565ToYRow)(const uint8* src_rgb565, uint8* dst_y, int pix) =

+  void (*RGB565ToYRow)(const uint8* src_rgb565, uint8* dst_y, int width) =

       RGB565ToYRow_C;

 #else

-  void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =

+  void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =

       RGB565ToARGBRow_C;

   void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,

       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;

-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =

+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =

       ARGBToYRow_C;

 #endif

   if (!src_rgb565 || !dst_y || !dst_u || !dst_v ||

@@ -1146,14 +1146,14 @@

 #if defined(HAS_ARGB1555TOYROW_NEON)

   void (*ARGB1555ToUVRow)(const uint8* src_argb1555, int src_stride_argb1555,

       uint8* dst_u, uint8* dst_v, int width) = ARGB1555ToUVRow_C;

-  void (*ARGB1555ToYRow)(const uint8* src_argb1555, uint8* dst_y, int pix) =

+  void (*ARGB1555ToYRow)(const uint8* src_argb1555, uint8* dst_y, int width) =

       ARGB1555ToYRow_C;

 #else

-  void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =

+  void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =

       ARGB1555ToARGBRow_C;

   void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,

       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;

-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =

+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =

       ARGBToYRow_C;

 #endif

   if (!src_argb1555 || !dst_y || !dst_u || !dst_v ||

@@ -1270,14 +1270,14 @@

 #if defined(HAS_ARGB4444TOYROW_NEON)

   void (*ARGB4444ToUVRow)(const uint8* src_argb4444, int src_stride_argb4444,

       uint8* dst_u, uint8* dst_v, int width) = ARGB4444ToUVRow_C;

-  void (*ARGB4444ToYRow)(const uint8* src_argb4444, uint8* dst_y, int pix) =

+  void (*ARGB4444ToYRow)(const uint8* src_argb4444, uint8* dst_y, int width) =

       ARGB4444ToYRow_C;

 #else

-  void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =

+  void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =

       ARGB4444ToARGBRow_C;

   void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,

       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;

-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =

+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =

       ARGBToYRow_C;

 #endif

   if (!src_argb4444 || !dst_y || !dst_u || !dst_v ||

--- a/third_party/libyuv/source/convert_argb.cc

+++ b/third_party/libyuv/source/convert_argb.cc

@@ -14,6 +14,7 @@

 #ifdef HAVE_JPEG

 #include "libyuv/mjpeg_decoder.h"

 #endif

+#include "libyuv/planar_functions.h"  // For CopyPlane and ARGBShuffle.

 #include "libyuv/rotate_argb.h"

 #include "libyuv/row.h"

 #include "libyuv/video_common.h"

@@ -44,21 +45,21 @@

   return 0;

-// Convert I444 to ARGB.

-LIBYUV_API

-int I444ToARGB(const uint8* src_y, int src_stride_y,

-               const uint8* src_u, int src_stride_u,

-               const uint8* src_v, int src_stride_v,

-               uint8* dst_argb, int dst_stride_argb,

-               int width, int height) {

+// Convert I422 to ARGB with matrix

+static int I420ToARGBMatrix(const uint8* src_y, int src_stride_y,

+                            const uint8* src_u, int src_stride_u,

+                            const uint8* src_v, int src_stride_v,

+                            uint8* dst_argb, int dst_stride_argb,

+                            const struct YuvConstants* yuvconstants,

+                            int width, int height) {

   int y;

-  void (*I444ToARGBRow)(const uint8* y_buf,

+  void (*I422ToARGBRow)(const uint8* y_buf,

                         const uint8* u_buf,

                         const uint8* v_buf,

                         uint8* rgb_buf,

-                        int width) = I444ToARGBRow_C;

-  if (!src_y || !src_u || !src_v ||

-      !dst_argb ||

+                        const struct YuvConstants* yuvconstants,

+                        int width) = I422ToARGBRow_C;

+  if (!src_y || !src_u || !src_v || !dst_argb ||

       width <= 0 || height == 0) {

     return -1;

@@ -68,62 +69,155 @@

     dst_argb = dst_argb + (height - 1) * dst_stride_argb;

     dst_stride_argb = -dst_stride_argb;

-  // Coalesce rows.

-  if (src_stride_y == width &&

-      src_stride_u == width &&

-      src_stride_v == width &&

-      dst_stride_argb == width * 4) {

-    width *= height;

-    height = 1;

-    src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;

-  }

-#if defined(HAS_I444TOARGBROW_SSSE3)

+#if defined(HAS_I422TOARGBROW_SSSE3)

   if (TestCpuFlag(kCpuHasSSSE3)) {

-    I444ToARGBRow = I444ToARGBRow_Any_SSSE3;

+    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;

     if (IS_ALIGNED(width, 8)) {

-      I444ToARGBRow = I444ToARGBRow_SSSE3;

+      I422ToARGBRow = I422ToARGBRow_SSSE3;

 #endif

-#if defined(HAS_I444TOARGBROW_AVX2)

+#if defined(HAS_I422TOARGBROW_AVX2)

   if (TestCpuFlag(kCpuHasAVX2)) {

-    I444ToARGBRow = I444ToARGBRow_Any_AVX2;

+    I422ToARGBRow = I422ToARGBRow_Any_AVX2;

     if (IS_ALIGNED(width, 16)) {

-      I444ToARGBRow = I444ToARGBRow_AVX2;

+      I422ToARGBRow = I422ToARGBRow_AVX2;

 #endif

-#if defined(HAS_I444TOARGBROW_NEON)

+#if defined(HAS_I422TOARGBROW_NEON)

   if (TestCpuFlag(kCpuHasNEON)) {

-    I444ToARGBRow = I444ToARGBRow_Any_NEON;

+    I422ToARGBRow = I422ToARGBRow_Any_NEON;

     if (IS_ALIGNED(width, 8)) {

-      I444ToARGBRow = I444ToARGBRow_NEON;

+      I422ToARGBRow = I422ToARGBRow_NEON;

 #endif

+#if defined(HAS_I422TOARGBROW_DSPR2)

+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&

+      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&

+      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&

+      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&

+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {

+    I422ToARGBRow = I422ToARGBRow_DSPR2;

+  }

+#endif

   for (y = 0; y < height; ++y) {

-    I444ToARGBRow(src_y, src_u, src_v, dst_argb, width);

+    I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);

     dst_argb += dst_stride_argb;

     src_y += src_stride_y;

-    src_u += src_stride_u;

-    src_v += src_stride_v;

+    if (y & 1) {

+      src_u += src_stride_u;

+      src_v += src_stride_v;

+    }

   return 0;

-// Convert I422 to ARGB.

+// Convert I420 to ARGB.

 LIBYUV_API

-int I422ToARGB(const uint8* src_y, int src_stride_y,

+int I420ToARGB(const uint8* src_y, int src_stride_y,

                const uint8* src_u, int src_stride_u,

                const uint8* src_v, int src_stride_v,

                uint8* dst_argb, int dst_stride_argb,

                int width, int height) {

+  return I420ToARGBMatrix(src_y, src_stride_y,

+                          src_u, src_stride_u,

+                          src_v, src_stride_v,

+                          dst_argb, dst_stride_argb,

+                          &kYuvI601Constants,

+                          width, height);

+}

+// Convert I420 to ABGR.

+LIBYUV_API

+int I420ToABGR(const uint8* src_y, int src_stride_y,

+               const uint8* src_u, int src_stride_u,

+               const uint8* src_v, int src_stride_v,

+               uint8* dst_abgr, int dst_stride_abgr,

+               int width, int height) {

+  return I420ToARGBMatrix(src_y, src_stride_y,

+                          src_v, src_stride_v,  // Swap U and V

+                          src_u, src_stride_u,

+                          dst_abgr, dst_stride_abgr,

+                          &kYvuI601Constants,  // Use Yvu matrix

+                          width, height);

+}

+// Convert J420 to ARGB.

+LIBYUV_API

+int J420ToARGB(const uint8* src_y, int src_stride_y,

+               const uint8* src_u, int src_stride_u,

+               const uint8* src_v, int src_stride_v,

+               uint8* dst_argb, int dst_stride_argb,

+               int width, int height) {

+  return I420ToARGBMatrix(src_y, src_stride_y,

+                          src_u, src_stride_u,

+                          src_v, src_stride_v,

+                          dst_argb, dst_stride_argb,

+                          &kYuvJPEGConstants,

+                          width, height);

+}

+// Convert J420 to ABGR.

+LIBYUV_API

+int J420ToABGR(const uint8* src_y, int src_stride_y,

+               const uint8* src_u, int src_stride_u,

+               const uint8* src_v, int src_stride_v,

+               uint8* dst_abgr, int dst_stride_abgr,

+               int width, int height) {

+  return I420ToARGBMatrix(src_y, src_stride_y,

+                          src_v, src_stride_v,  // Swap U and V

+                          src_u, src_stride_u,

+                          dst_abgr, dst_stride_abgr,

+                          &kYvuJPEGConstants,  // Use Yvu matrix

+                          width, height);

+}

+// Convert H420 to ARGB.

+LIBYUV_API

+int H420ToARGB(const uint8* src_y, int src_stride_y,

+               const uint8* src_u, int src_stride_u,

+               const uint8* src_v, int src_stride_v,

+               uint8* dst_argb, int dst_stride_argb,

+               int width, int height) {

+  return I420ToARGBMatrix(src_y, src_stride_y,

+                          src_u, src_stride_u,

+                          src_v, src_stride_v,

+                          dst_argb, dst_stride_argb,

+                          &kYuvH709Constants,

+                          width, height);

+}

+// Convert H420 to ABGR.

+LIBYUV_API

+int H420ToABGR(const uint8* src_y, int src_stride_y,

+               const uint8* src_u, int src_stride_u,

+               const uint8* src_v, int src_stride_v,

+               uint8* dst_abgr, int dst_stride_abgr,

+               int width, int height) {

+  return I420ToARGBMatrix(src_y, src_stride_y,

+                          src_v, src_stride_v,  // Swap U and V

+                          src_u, src_stride_u,

+                          dst_abgr, dst_stride_abgr,

+                          &kYvuH709Constants,  // Use Yvu matrix

+                          width, height);

+}

+// Convert I422 to ARGB with matrix

+static int I422ToARGBMatrix(const uint8* src_y, int src_stride_y,

+                            const uint8* src_u, int src_stride_u,

+                            const uint8* src_v, int src_stride_v,

+                            uint8* dst_argb, int dst_stride_argb,

+                            const struct YuvConstants* yuvconstants,

+                            int width, int height) {

   int y;

   void (*I422ToARGBRow)(const uint8* y_buf,

                         const uint8* u_buf,

                         const uint8* v_buf,

                         uint8* rgb_buf,

+                        const struct YuvConstants* yuvconstants,

                         int width) = I422ToARGBRow_C;

   if (!src_y || !src_u || !src_v ||

       !dst_argb ||

@@ -169,18 +263,18 @@

 #endif

-#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)

-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&

+#if defined(HAS_I422TOARGBROW_DSPR2)

+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&

       IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&

       IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&

       IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&

       IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {

-    I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;

+    I422ToARGBRow = I422ToARGBRow_DSPR2;

 #endif

   for (y = 0; y < height; ++y) {

-    I422ToARGBRow(src_y, src_u, src_v, dst_argb, width);

+    I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);

     dst_argb += dst_stride_argb;

     src_y += src_stride_y;

     src_u += src_stride_u;

@@ -189,6 +283,210 @@

   return 0;

+// Convert I422 to ARGB.

+LIBYUV_API

+int I422ToARGB(const uint8* src_y, int src_stride_y,

+               const uint8* src_u, int src_stride_u,

+               const uint8* src_v, int src_stride_v,

+               uint8* dst_argb, int dst_stride_argb,

+               int width, int height) {

+  return I422ToARGBMatrix(src_y, src_stride_y,

+                          src_u, src_stride_u,

+                          src_v, src_stride_v,

+                          dst_argb, dst_stride_argb,

+                          &kYuvI601Constants,

+                          width, height);

+}

+// Convert I422 to ABGR.

+LIBYUV_API

+int I422ToABGR(const uint8* src_y, int src_stride_y,

+               const uint8* src_u, int src_stride_u,

+               const uint8* src_v, int src_stride_v,

+               uint8* dst_abgr, int dst_stride_abgr,

+               int width, int height) {

+  return I422ToARGBMatrix(src_y, src_stride_y,

+                          src_v, src_stride_v,  // Swap U and V

+                          src_u, src_stride_u,

+                          dst_abgr, dst_stride_abgr,

+                          &kYvuI601Constants,  // Use Yvu matrix

+                          width, height);

+}

+// Convert J422 to ARGB.

+LIBYUV_API

+int J422ToARGB(const uint8* src_y, int src_stride_y,

+               const uint8* src_u, int src_stride_u,

+               const uint8* src_v, int src_stride_v,

+               uint8* dst_argb, int dst_stride_argb,

+               int width, int height) {

+  return I422ToARGBMatrix(src_y, src_stride_y,

+                          src_u, src_stride_u,

+                          src_v, src_stride_v,

+                          dst_argb, dst_stride_argb,

+                          &kYuvJPEGConstants,

+                          width, height);

+}

+// Convert J422 to ABGR.

+LIBYUV_API

+int J422ToABGR(const uint8* src_y, int src_stride_y,

+               const uint8* src_u, int src_stride_u,

+               const uint8* src_v, int src_stride_v,

+               uint8* dst_abgr, int dst_stride_abgr,

+               int width, int height) {

+  return I422ToARGBMatrix(src_y, src_stride_y,

+                          src_v, src_stride_v,  // Swap U and V

+                          src_u, src_stride_u,

+                          dst_abgr, dst_stride_abgr,

+                          &kYvuJPEGConstants,  // Use Yvu matrix

+                          width, height);

+}

+// Convert H422 to ARGB.

+LIBYUV_API

+int H422ToARGB(const uint8* src_y, int src_stride_y,

+               const uint8* src_u, int src_stride_u,

+               const uint8* src_v, int src_stride_v,

+               uint8* dst_argb, int dst_stride_argb,

+               int width, int height) {

+  return I422ToARGBMatrix(src_y, src_stride_y,

+                          src_u, src_stride_u,

+                          src_v, src_stride_v,

+                          dst_argb, dst_stride_argb,

+                          &kYuvH709Constants,

+                          width, height);

+}

+// Convert H422 to ABGR.

+LIBYUV_API

+int H422ToABGR(const uint8* src_y, int src_stride_y,

+               const uint8* src_u, int src_stride_u,

+               const uint8* src_v, int src_stride_v,

+               uint8* dst_abgr, int dst_stride_abgr,

+               int width, int height) {

+  return I422ToARGBMatrix(src_y, src_stride_y,

+                          src_v, src_stride_v,  // Swap U and V

+                          src_u, src_stride_u,

+                          dst_abgr, dst_stride_abgr,

+                          &kYvuH709Constants,  // Use Yvu matrix

+                          width, height);

+}

+// Convert I444 to ARGB with matrix

+static int I444ToARGBMatrix(const uint8* src_y, int src_stride_y,

+                            const uint8* src_u, int src_stride_u,

+                            const uint8* src_v, int src_stride_v,

+                            uint8* dst_argb, int dst_stride_argb,

+                            const struct YuvConstants* yuvconstants,

+                            int width, int height) {

+  int y;

+  void (*I444ToARGBRow)(const uint8* y_buf,

+                        const uint8* u_buf,

+                        const uint8* v_buf,

+                        uint8* rgb_buf,

+                        const struct YuvConstants* yuvconstants,

+                        int width) = I444ToARGBRow_C;

+  if (!src_y || !src_u || !src_v ||

+      !dst_argb ||

+      width <= 0 || height == 0) {

+    return -1;

+  }

+  // Negative height means invert the image.

+  if (height < 0) {

+    height = -height;

+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;

+    dst_stride_argb = -dst_stride_argb;

+  }

+  // Coalesce rows.

+  if (src_stride_y == width &&

+      src_stride_u == width &&

+      src_stride_v == width &&

+      dst_stride_argb == width * 4) {

+    width *= height;

+    height = 1;

+    src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;

+  }

+#if defined(HAS_I444TOARGBROW_SSSE3)

+  if (TestCpuFlag(kCpuHasSSSE3)) {

+    I444ToARGBRow = I444ToARGBRow_Any_SSSE3;

+    if (IS_ALIGNED(width, 8)) {

+      I444ToARGBRow = I444ToARGBRow_SSSE3;

+    }

+  }

+#endif

+#if defined(HAS_I444TOARGBROW_AVX2)

+  if (TestCpuFlag(kCpuHasAVX2)) {

+    I444ToARGBRow = I444ToARGBRow_Any_AVX2;

+    if (IS_ALIGNED(width, 16)) {

+      I444ToARGBRow = I444ToARGBRow_AVX2;

+    }

+  }

+#endif

+#if defined(HAS_I444TOARGBROW_NEON)

+  if (TestCpuFlag(kCpuHasNEON)) {

+    I444ToARGBRow = I444ToARGBRow_Any_NEON;

+    if (IS_ALIGNED(width, 8)) {

+      I444ToARGBRow = I444ToARGBRow_NEON;

+    }

+  }

+#endif

+  for (y = 0; y < height; ++y) {

+    I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);

+    dst_argb += dst_stride_argb;

+    src_y += src_stride_y;

+    src_u += src_stride_u;

+    src_v += src_stride_v;

+  }

+  return 0;

+}

+// Convert I444 to ARGB.

+LIBYUV_API

+int I444ToARGB(const uint8* src_y, int src_stride_y,

+               const uint8* src_u, int src_stride_u,

+               const uint8* src_v, int src_stride_v,

+               uint8* dst_argb, int dst_stride_argb,

+               int width, int height) {

+  return I444ToARGBMatrix(src_y, src_stride_y,

+                          src_u, src_stride_u,

+                          src_v, src_stride_v,

+                          dst_argb, dst_stride_argb,

+                          &kYuvI601Constants,

+                          width, height);

+}

+// Convert I444 to ABGR.

+LIBYUV_API

+int I444ToABGR(const uint8* src_y, int src_stride_y,

+               const uint8* src_u, int src_stride_u,

+               const uint8* src_v, int src_stride_v,

+               uint8* dst_abgr, int dst_stride_abgr,

+               int width, int height) {

+  return I444ToARGBMatrix(src_y, src_stride_y,

+                          src_v, src_stride_v,  // Swap U and V

+                          src_u, src_stride_u,

+                          dst_abgr, dst_stride_abgr,

+                          &kYvuI601Constants,  // Use Yvu matrix

+                          width, height);

+}

+// Convert J444 to ARGB.

+LIBYUV_API

+int J444ToARGB(const uint8* src_y, int src_stride_y,

+               const uint8* src_u, int src_stride_u,

+               const uint8* src_v, int src_stride_v,

+               uint8* dst_argb, int dst_stride_argb,

+               int width, int height) {

+  return I444ToARGBMatrix(src_y, src_stride_y,

+                          src_u, src_stride_u,

+                          src_v, src_stride_v,

+                          dst_argb, dst_stride_argb,

+                          &kYuvJPEGConstants,

+                          width, height);

+}

 // Convert I411 to ARGB.

 LIBYUV_API

 int I411ToARGB(const uint8* src_y, int src_stride_y,

@@ -201,6 +499,7 @@

                         const uint8* u_buf,

                         const uint8* v_buf,

                         uint8* rgb_buf,

+                        const struct YuvConstants* yuvconstants,

                         int width) = I411ToARGBRow_C;

   if (!src_y || !src_u || !src_v ||

       !dst_argb ||

@@ -248,7 +547,7 @@

 #endif

   for (y = 0; y < height; ++y) {

-    I411ToARGBRow(src_y, src_u, src_v, dst_argb, width);

+    I411ToARGBRow(src_y, src_u, src_v, dst_argb, &kYuvI601Constants, width);

     dst_argb += dst_stride_argb;

     src_y += src_stride_y;

     src_u += src_stride_u;

@@ -257,6 +556,143 @@

   return 0;

+// Convert I420 with Alpha to ARGB with matrix, attenuating if requested.

+static int I420AlphaToARGBMatrix(const uint8* src_y, int src_stride_y,

+                                 const uint8* src_u, int src_stride_u,

+                                 const uint8* src_v, int src_stride_v,

+                                 const uint8* src_a, int src_stride_a,

+                                 uint8* dst_argb, int dst_stride_argb,

+                                 const struct YuvConstants* yuvconstants,

+                                 int width, int height, int attenuate) {

+  int y;

+  void (*I422AlphaToARGBRow)(const uint8* y_buf,

+                             const uint8* u_buf,

+                             const uint8* v_buf,

+                             const uint8* a_buf,

+                             uint8* dst_argb,

+                             const struct YuvConstants* yuvconstants,

+                             int width) = I422AlphaToARGBRow_C;

+  void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb,

+                           int width) = ARGBAttenuateRow_C;

+  if (!src_y || !src_u || !src_v || !dst_argb ||

+      width <= 0 || height == 0) {

+    return -1;

+  }

+  // Negative height means invert the image.

+  if (height < 0) {

+    height = -height;

+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;

+    dst_stride_argb = -dst_stride_argb;

+  }

+#if defined(HAS_I422ALPHATOARGBROW_SSSE3)

+  if (TestCpuFlag(kCpuHasSSSE3)) {

+    I422AlphaToARGBRow = I422AlphaToARGBRow_Any_SSSE3;

+    if (IS_ALIGNED(width, 8)) {

+      I422AlphaToARGBRow = I422AlphaToARGBRow_SSSE3;

+    }

+  }

+#endif

+#if defined(HAS_I422ALPHATOARGBROW_AVX2)

+  if (TestCpuFlag(kCpuHasAVX2)) {

+    I422AlphaToARGBRow = I422AlphaToARGBRow_Any_AVX2;

+    if (IS_ALIGNED(width, 16)) {

+      I422AlphaToARGBRow = I422AlphaToARGBRow_AVX2;

+    }

+  }

+#endif

+#if defined(HAS_I422ALPHATOARGBROW_NEON)

+  if (TestCpuFlag(kCpuHasNEON)) {

+    I422AlphaToARGBRow = I422AlphaToARGBRow_Any_NEON;

+    if (IS_ALIGNED(width, 8)) {

+      I422AlphaToARGBRow = I422AlphaToARGBRow_NEON;

+    }

+  }

+#endif

+#if defined(HAS_I422ALPHATOARGBROW_DSPR2)

+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&

+      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&

+      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&

+      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&

+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {

+    I422AlphaToARGBRow = I422AlphaToARGBRow_DSPR2;

+  }

+#endif

+#if defined(HAS_ARGBATTENUATEROW_SSSE3)

+  if (TestCpuFlag(kCpuHasSSSE3)) {

+    ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;

+    if (IS_ALIGNED(width, 4)) {

+      ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;

+    }

+  }

+#endif

+#if defined(HAS_ARGBATTENUATEROW_AVX2)

+  if (TestCpuFlag(kCpuHasAVX2)) {

+    ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;

+    if (IS_ALIGNED(width, 8)) {

+      ARGBAttenuateRow = ARGBAttenuateRow_AVX2;

+    }

+  }

+#endif

+#if defined(HAS_ARGBATTENUATEROW_NEON)

+  if (TestCpuFlag(kCpuHasNEON)) {

+    ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;

+    if (IS_ALIGNED(width, 8)) {

+      ARGBAttenuateRow = ARGBAttenuateRow_NEON;

+    }

+  }

+#endif

+  for (y = 0; y < height; ++y) {

+    I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,

+                       width);

+    if (attenuate) {

+      ARGBAttenuateRow(dst_argb, dst_argb, width);

+    }

+    dst_argb += dst_stride_argb;

+    src_a += src_stride_a;

+    src_y += src_stride_y;

+    if (y & 1) {

+      src_u += src_stride_u;

+      src_v += src_stride_v;

+    }

+  }

+  return 0;

+}

+// Convert I420 with Alpha to ARGB.

+LIBYUV_API

+int I420AlphaToARGB(const uint8* src_y, int src_stride_y,

+                    const uint8* src_u, int src_stride_u,

+                    const uint8* src_v, int src_stride_v,

+                    const uint8* src_a, int src_stride_a,

+                    uint8* dst_argb, int dst_stride_argb,

+                    int width, int height, int attenuate) {

+  return I420AlphaToARGBMatrix(src_y, src_stride_y,

+                               src_u, src_stride_u,

+                               src_v, src_stride_v,

+                               src_a, src_stride_a,

+                               dst_argb, dst_stride_argb,

+                               &kYuvI601Constants,

+                               width, height, attenuate);

+}

+// Convert I420 with Alpha to ABGR.

+LIBYUV_API

+int I420AlphaToABGR(const uint8* src_y, int src_stride_y,

+                    const uint8* src_u, int src_stride_u,

+                    const uint8* src_v, int src_stride_v,

+                    const uint8* src_a, int src_stride_a,

+                    uint8* dst_abgr, int dst_stride_abgr,

+                    int width, int height, int attenuate) {

+  return I420AlphaToARGBMatrix(src_y, src_stride_y,

+                               src_v, src_stride_v,  // Swap U and V

+                               src_u, src_stride_u,

+                               src_a, src_stride_a,

+                               dst_abgr, dst_stride_abgr,

+                               &kYvuI601Constants,  // Use Yvu matrix

+                               width, height, attenuate);

+}

 // Convert I400 to ARGB.

 LIBYUV_API

 int I400ToARGB(const uint8* src_y, int src_stride_y,

@@ -322,7 +758,7 @@

                uint8* dst_argb, int dst_stride_argb,

                int width, int height) {

   int y;

-  void (*J400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) =

+  void (*J400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int width) =

       J400ToARGBRow_C;

   if (!src_y || !dst_argb ||

       width <= 0 || height == 0) {

@@ -449,7 +885,7 @@

                 uint8* dst_argb, int dst_stride_argb,

                 int width, int height) {

   int y;

-  void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =

+  void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =

       RGB24ToARGBRow_C;

   if (!src_rgb24 || !dst_argb ||

       width <= 0 || height == 0) {

@@ -499,7 +935,7 @@

               uint8* dst_argb, int dst_stride_argb,

               int width, int height) {

   int y;

-  void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =

+  void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =

       RAWToARGBRow_C;

   if (!src_raw || !dst_argb ||

       width <= 0 || height == 0) {

@@ -549,7 +985,7 @@

                  uint8* dst_argb, int dst_stride_argb,

                  int width, int height) {

   int y;

-  void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int pix) =

+  void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int width) =

       RGB565ToARGBRow_C;

   if (!src_rgb565 || !dst_argb ||

       width <= 0 || height == 0) {

@@ -608,7 +1044,7 @@

                    int width, int height) {

   int y;

   void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb,

-      int pix) = ARGB1555ToARGBRow_C;

+      int width) = ARGB1555ToARGBRow_C;

   if (!src_argb1555 || !dst_argb ||

       width <= 0 || height == 0) {

     return -1;

@@ -666,7 +1102,7 @@

                    int width, int height) {

   int y;

   void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb,

-      int pix) = ARGB4444ToARGBRow_C;

+      int width) = ARGB4444ToARGBRow_C;

   if (!src_argb4444 || !dst_argb ||

       width <= 0 || height == 0) {

     return -1;

@@ -727,6 +1163,7 @@

   void (*NV12ToARGBRow)(const uint8* y_buf,

                         const uint8* uv_buf,

                         uint8* rgb_buf,

+                        const struct YuvConstants* yuvconstants,

                         int width) = NV12ToARGBRow_C;

   if (!src_y || !src_uv || !dst_argb ||

       width <= 0 || height == 0) {

@@ -764,7 +1201,7 @@

 #endif

   for (y = 0; y < height; ++y) {

-    NV12ToARGBRow(src_y, src_uv, dst_argb, width);

+    NV12ToARGBRow(src_y, src_uv, dst_argb, &kYuvI601Constants, width);

     dst_argb += dst_stride_argb;

     src_y += src_stride_y;

     if (y & 1) {

@@ -784,6 +1221,7 @@

   void (*NV21ToARGBRow)(const uint8* y_buf,

                         const uint8* uv_buf,

                         uint8* rgb_buf,

+                        const struct YuvConstants* yuvconstants,

                         int width) = NV21ToARGBRow_C;

   if (!src_y || !src_uv || !dst_argb ||

       width <= 0 || height == 0) {

@@ -821,7 +1259,7 @@

 #endif

   for (y = 0; y < height; ++y) {

-    NV21ToARGBRow(src_y, src_uv, dst_argb, width);

+    NV21ToARGBRow(src_y, src_uv, dst_argb, &kYuvI601Constants, width);

     dst_argb += dst_stride_argb;

     src_y += src_stride_y;

     if (y & 1) {

@@ -840,6 +1278,7 @@

   void (*NV12ToARGBRow)(const uint8* y_buf,

                         const uint8* uv_buf,

                         uint8* rgb_buf,

+                        const struct YuvConstants* yuvconstants,

                         int width) = NV12ToARGBRow_C;

   if (!src_m420 || !dst_argb ||

       width <= 0 || height == 0) {

@@ -877,14 +1316,16 @@

 #endif

   for (y = 0; y < height - 1; y += 2) {

-    NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width);

+    NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb,

+                  &kYuvI601Constants, width);

     NV12ToARGBRow(src_m420 + src_stride_m420, src_m420 + src_stride_m420 * 2,

-                  dst_argb + dst_stride_argb, width);

+                  dst_argb + dst_stride_argb, &kYuvI601Constants, width);

     dst_argb += dst_stride_argb * 2;

     src_m420 += src_stride_m420 * 3;

   if (height & 1) {

-    NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width);

+    NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb,

+                  &kYuvI601Constants, width);

   return 0;

@@ -895,7 +1336,10 @@

                uint8* dst_argb, int dst_stride_argb,

                int width, int height) {

   int y;

-  void (*YUY2ToARGBRow)(const uint8* src_yuy2, uint8* dst_argb, int pix) =

+  void (*YUY2ToARGBRow)(const uint8* src_yuy2,

+                        uint8* dst_argb,

+                        const struct YuvConstants* yuvconstants,

+                        int width) =

       YUY2ToARGBRow_C;

   if (!src_yuy2 || !dst_argb ||

       width <= 0 || height == 0) {

@@ -939,7 +1383,7 @@

 #endif

   for (y = 0; y < height; ++y) {

-    YUY2ToARGBRow(src_yuy2, dst_argb, width);

+    YUY2ToARGBRow(src_yuy2, dst_argb, &kYuvI601Constants, width);

     src_yuy2 += src_stride_yuy2;

     dst_argb += dst_stride_argb;

@@ -952,7 +1396,10 @@

                uint8* dst_argb, int dst_stride_argb,

                int width, int height) {

   int y;

-  void (*UYVYToARGBRow)(const uint8* src_uyvy, uint8* dst_argb, int pix) =

+  void (*UYVYToARGBRow)(const uint8* src_uyvy,

+                        uint8* dst_argb,

+                        const struct YuvConstants* yuvconstants,

+                        int width) =

       UYVYToARGBRow_C;

   if (!src_uyvy || !dst_argb ||

       width <= 0 || height == 0) {

@@ -996,155 +1443,9 @@

 #endif

   for (y = 0; y < height; ++y) {

-    UYVYToARGBRow(src_uyvy, dst_argb, width);

+    UYVYToARGBRow(src_uyvy, dst_argb, &kYuvI601Constants, width);

     src_uyvy += src_stride_uyvy;

     dst_argb += dst_stride_argb;

-  }

-  return 0;

-}

-// Convert J420 to ARGB.

-LIBYUV_API

-int J420ToARGB(const uint8* src_y, int src_stride_y,

-               const uint8* src_u, int src_stride_u,

-               const uint8* src_v, int src_stride_v,

-               uint8* dst_argb, int dst_stride_argb,

-               int width, int height) {

-  int y;

-  void (*J422ToARGBRow)(const uint8* y_buf,

-                        const uint8* u_buf,

-                        const uint8* v_buf,

-                        uint8* rgb_buf,

-                        int width) = J422ToARGBRow_C;

-  if (!src_y || !src_u || !src_v || !dst_argb ||

-      width <= 0 || height == 0) {

-    return -1;

-  }

-  // Negative height means invert the image.

-  if (height < 0) {

-    height = -height;

-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;

-    dst_stride_argb = -dst_stride_argb;

-  }

-#if defined(HAS_J422TOARGBROW_SSSE3)

-  if (TestCpuFlag(kCpuHasSSSE3)) {

-    J422ToARGBRow = J422ToARGBRow_Any_SSSE3;

-    if (IS_ALIGNED(width, 8)) {

-      J422ToARGBRow = J422ToARGBRow_SSSE3;

-    }

-  }

-#endif

-#if defined(HAS_J422TOARGBROW_AVX2)

-  if (TestCpuFlag(kCpuHasAVX2)) {

-    J422ToARGBRow = J422ToARGBRow_Any_AVX2;

-    if (IS_ALIGNED(width, 16)) {

-      J422ToARGBRow = J422ToARGBRow_AVX2;

-    }

-  }

-#endif

-#if defined(HAS_J422TOARGBROW_NEON)

-  if (TestCpuFlag(kCpuHasNEON)) {

-    J422ToARGBRow = J422ToARGBRow_Any_NEON;

-    if (IS_ALIGNED(width, 8)) {

-      J422ToARGBRow = J422ToARGBRow_NEON;

-    }

-  }

-#endif

-#if defined(HAS_J422TOARGBROW_MIPS_DSPR2)

-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&

-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&

-      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&

-      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&

-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {

-    J422ToARGBRow = J422ToARGBRow_MIPS_DSPR2;

-  }

-#endif

-  for (y = 0; y < height; ++y) {

-    J422ToARGBRow(src_y, src_u, src_v, dst_argb, width);

-    dst_argb += dst_stride_argb;

-    src_y += src_stride_y;

-    if (y & 1) {

-      src_u += src_stride_u;

-      src_v += src_stride_v;

-    }

-  }

-  return 0;

-}

-// Convert J422 to ARGB.

-LIBYUV_API

-int J422ToARGB(const uint8* src_y, int src_stride_y,

-               const uint8* src_u, int src_stride_u,

-               const uint8* src_v, int src_stride_v,

-               uint8* dst_argb, int dst_stride_argb,

-               int width, int height) {

-  int y;

-  void (*J422ToARGBRow)(const uint8* y_buf,

-                        const uint8* u_buf,

-                        const uint8* v_buf,

-                        uint8* rgb_buf,

-                        int width) = J422ToARGBRow_C;

-  if (!src_y || !src_u || !src_v ||

-      !dst_argb ||

-      width <= 0 || height == 0) {

-    return -1;

-  }

-  // Negative height means invert the image.

-  if (height < 0) {

-    height = -height;

-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;

-    dst_stride_argb = -dst_stride_argb;

-  }

-  // Coalesce rows.

-  if (src_stride_y == width &&

-      src_stride_u * 2 == width &&

-      src_stride_v * 2 == width &&

-      dst_stride_argb == width * 4) {

-    width *= height;

-    height = 1;

-    src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;

-  }

-#if defined(HAS_J422TOARGBROW_SSSE3)

-  if (TestCpuFlag(kCpuHasSSSE3)) {

-    J422ToARGBRow = J422ToARGBRow_Any_SSSE3;

-    if (IS_ALIGNED(width, 8)) {

-      J422ToARGBRow = J422ToARGBRow_SSSE3;

-    }

-  }

-#endif

-#if defined(HAS_J422TOARGBROW_AVX2)

-  if (TestCpuFlag(kCpuHasAVX2)) {

-    J422ToARGBRow = J422ToARGBRow_Any_AVX2;

-    if (IS_ALIGNED(width, 16)) {

-      J422ToARGBRow = J422ToARGBRow_AVX2;

-    }

-  }

-#endif

-#if defined(HAS_J422TOARGBROW_NEON)

-  if (TestCpuFlag(kCpuHasNEON)) {

-    J422ToARGBRow = J422ToARGBRow_Any_NEON;

-    if (IS_ALIGNED(width, 8)) {

-      J422ToARGBRow = J422ToARGBRow_NEON;

-    }

-  }

-#endif

-#if defined(HAS_J422TOARGBROW_MIPS_DSPR2)

-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&

-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&

-      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&

-      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&

-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {

-    J422ToARGBRow = J422ToARGBRow_MIPS_DSPR2;

-  }

-#endif

-  for (y = 0; y < height; ++y) {

-    J422ToARGBRow(src_y, src_u, src_v, dst_argb, width);

-    dst_argb += dst_stride_argb;

-    src_y += src_stride_y;

-    src_u += src_stride_u;

-    src_v += src_stride_v;

   return 0;

--- a/third_party/libyuv/source/convert_from.cc

+++ b/third_party/libyuv/source/convert_from.cc

@@ -445,25 +445,26 @@

   return I420ToNV12(src_y, src_stride_y,

                     src_v, src_stride_v,

                     src_u, src_stride_u,

-                    dst_y, src_stride_y,

+                    dst_y, dst_stride_y,

                     dst_vu, dst_stride_vu,

                     width, height);

-// Convert I420 to ARGB.

-LIBYUV_API

-int I420ToARGB(const uint8* src_y, int src_stride_y,

-               const uint8* src_u, int src_stride_u,

-               const uint8* src_v, int src_stride_v,

-               uint8* dst_argb, int dst_stride_argb,

-               int width, int height) {

+// Convert I420 to RGBA with matrix

+static int I420ToRGBAMatrix(const uint8* src_y, int src_stride_y,

+                            const uint8* src_u, int src_stride_u,

+                            const uint8* src_v, int src_stride_v,

+                            uint8* dst_rgba, int dst_stride_rgba,

+                            const struct YuvConstants* yuvconstants,

+                            int width, int height) {

   int y;

-  void (*I422ToARGBRow)(const uint8* y_buf,

+  void (*I422ToRGBARow)(const uint8* y_buf,

                         const uint8* u_buf,

                         const uint8* v_buf,

                         uint8* rgb_buf,

-                        int width) = I422ToARGBRow_C;

-  if (!src_y || !src_u || !src_v || !dst_argb ||

+                        const struct YuvConstants* yuvconstants,

+                        int width) = I422ToRGBARow_C;

+  if (!src_y || !src_u || !src_v || !dst_rgba ||

       width <= 0 || height == 0) {

     return -1;

@@ -470,46 +471,46 @@

   // Negative height means invert the image.

   if (height < 0) {

     height = -height;

-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;

-    dst_stride_argb = -dst_stride_argb;

+    dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;

+    dst_stride_rgba = -dst_stride_rgba;

-#if defined(HAS_I422TOARGBROW_SSSE3)

+#if defined(HAS_I422TORGBAROW_SSSE3)

   if (TestCpuFlag(kCpuHasSSSE3)) {

-    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;

+    I422ToRGBARow = I422ToRGBARow_Any_SSSE3;

     if (IS_ALIGNED(width, 8)) {

-      I422ToARGBRow = I422ToARGBRow_SSSE3;

+      I422ToRGBARow = I422ToRGBARow_SSSE3;

 #endif

-#if defined(HAS_I422TOARGBROW_AVX2)

+#if defined(HAS_I422TORGBAROW_AVX2)

   if (TestCpuFlag(kCpuHasAVX2)) {

-    I422ToARGBRow = I422ToARGBRow_Any_AVX2;

+    I422ToRGBARow = I422ToRGBARow_Any_AVX2;

     if (IS_ALIGNED(width, 16)) {

-      I422ToARGBRow = I422ToARGBRow_AVX2;

+      I422ToRGBARow = I422ToRGBARow_AVX2;

 #endif

-#if defined(HAS_I422TOARGBROW_NEON)

+#if defined(HAS_I422TORGBAROW_NEON)

   if (TestCpuFlag(kCpuHasNEON)) {

-    I422ToARGBRow = I422ToARGBRow_Any_NEON;

+    I422ToRGBARow = I422ToRGBARow_Any_NEON;

     if (IS_ALIGNED(width, 8)) {

-      I422ToARGBRow = I422ToARGBRow_NEON;

+      I422ToRGBARow = I422ToRGBARow_NEON;

 #endif

-#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)

-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&

+#if defined(HAS_I422TORGBAROW_DSPR2)

+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&

       IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&

       IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&

       IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&

-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {

-    I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;

+      IS_ALIGNED(dst_rgba, 4) && IS_ALIGNED(dst_stride_rgba, 4)) {

+    I422ToRGBARow = I422ToRGBARow_DSPR2;

 #endif

   for (y = 0; y < height; ++y) {

-    I422ToARGBRow(src_y, src_u, src_v, dst_argb, width);

-    dst_argb += dst_stride_argb;

+    I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);

+    dst_rgba += dst_stride_rgba;

     src_y += src_stride_y;

     if (y & 1) {

       src_u += src_stride_u;

@@ -519,207 +520,49 @@

   return 0;

-// Convert I420 to BGRA.

+// Convert I420 to RGBA.

 LIBYUV_API

-int I420ToBGRA(const uint8* src_y, int src_stride_y,

+int I420ToRGBA(const uint8* src_y, int src_stride_y,

                const uint8* src_u, int src_stride_u,

                const uint8* src_v, int src_stride_v,

-               uint8* dst_bgra, int dst_stride_bgra,

+               uint8* dst_rgba, int dst_stride_rgba,

                int width, int height) {

-  int y;

-  void (*I422ToBGRARow)(const uint8* y_buf,

-                        const uint8* u_buf,

-                        const uint8* v_buf,

-                        uint8* rgb_buf,

-                        int width) = I422ToBGRARow_C;

-  if (!src_y || !src_u || !src_v || !dst_bgra ||

-      width <= 0 || height == 0) {

-    return -1;

-  }

-  // Negative height means invert the image.

-  if (height < 0) {

-    height = -height;

-    dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra;

-    dst_stride_bgra = -dst_stride_bgra;

-  }

-#if defined(HAS_I422TOBGRAROW_SSSE3)

-  if (TestCpuFlag(kCpuHasSSSE3)) {

-    I422ToBGRARow = I422ToBGRARow_Any_SSSE3;

-    if (IS_ALIGNED(width, 8)) {

-      I422ToBGRARow = I422ToBGRARow_SSSE3;

-    }

-  }

-#endif

-#if defined(HAS_I422TOBGRAROW_AVX2)

-  if (TestCpuFlag(kCpuHasAVX2)) {

-    I422ToBGRARow = I422ToBGRARow_Any_AVX2;

-    if (IS_ALIGNED(width, 16)) {

-      I422ToBGRARow = I422ToBGRARow_AVX2;

-    }

-  }

-#endif

-#if defined(HAS_I422TOBGRAROW_NEON)

-  if (TestCpuFlag(kCpuHasNEON)) {

-    I422ToBGRARow = I422ToBGRARow_Any_NEON;

-    if (IS_ALIGNED(width, 8)) {

-      I422ToBGRARow = I422ToBGRARow_NEON;

-    }

-  }

-#endif

-#if defined(HAS_I422TOBGRAROW_MIPS_DSPR2)

-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&

-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&

-      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&

-      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&

-      IS_ALIGNED(dst_bgra, 4) && IS_ALIGNED(dst_stride_bgra, 4)) {

-    I422ToBGRARow = I422ToBGRARow_MIPS_DSPR2;

-  }

-#endif

-  for (y = 0; y < height; ++y) {

-    I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width);

-    dst_bgra += dst_stride_bgra;

-    src_y += src_stride_y;

-    if (y & 1) {

-      src_u += src_stride_u;

-      src_v += src_stride_v;

-    }

-  }

-  return 0;

+  return I420ToRGBAMatrix(src_y, src_stride_y,

+                          src_u, src_stride_u,

+                          src_v, src_stride_v,

+                          dst_rgba, dst_stride_rgba,

+                          &kYuvI601Constants,

+                          width, height);

-// Convert I420 to ABGR.

+// Convert I420 to BGRA.

 LIBYUV_API

-int I420ToABGR(const uint8* src_y, int src_stride_y,

+int I420ToBGRA(const uint8* src_y, int src_stride_y,

                const uint8* src_u, int src_stride_u,

                const uint8* src_v, int src_stride_v,

-               uint8* dst_abgr, int dst_stride_abgr,

+               uint8* dst_bgra, int dst_stride_bgra,

                int width, int height) {

-  int y;

-  void (*I422ToABGRRow)(const uint8* y_buf,

-                        const uint8* u_buf,

-                        const uint8* v_buf,

-                        uint8* rgb_buf,

-                        int width) = I422ToABGRRow_C;

-  if (!src_y || !src_u || !src_v || !dst_abgr ||

-      width <= 0 || height == 0) {

-    return -1;

-  }

-  // Negative height means invert the image.

-  if (height < 0) {

-    height = -height;

-    dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr;

-    dst_stride_abgr = -dst_stride_abgr;

-  }

-#if defined(HAS_I422TOABGRROW_SSSE3)

-  if (TestCpuFlag(kCpuHasSSSE3)) {

-    I422ToABGRRow = I422ToABGRRow_Any_SSSE3;

-    if (IS_ALIGNED(width, 8)) {

-      I422ToABGRRow = I422ToABGRRow_SSSE3;

-    }

-  }

-#endif

-#if defined(HAS_I422TOABGRROW_AVX2)

-  if (TestCpuFlag(kCpuHasAVX2)) {

-    I422ToABGRRow = I422ToABGRRow_Any_AVX2;

-    if (IS_ALIGNED(width, 16)) {

-      I422ToABGRRow = I422ToABGRRow_AVX2;

-    }

-  }

-#endif

-#if defined(HAS_I422TOABGRROW_NEON)

-  if (TestCpuFlag(kCpuHasNEON)) {

-    I422ToABGRRow = I422ToABGRRow_Any_NEON;

-    if (IS_ALIGNED(width, 8)) {

-      I422ToABGRRow = I422ToABGRRow_NEON;

-    }

-  }

-#endif

-  for (y = 0; y < height; ++y) {

-    I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width);

-    dst_abgr += dst_stride_abgr;

-    src_y += src_stride_y;

-    if (y & 1) {

-      src_u += src_stride_u;

-      src_v += src_stride_v;

-    }

-  }

-  return 0;

+  return I420ToRGBAMatrix(src_y, src_stride_y,

+                          src_v, src_stride_v,  // Swap U and V

+                          src_u, src_stride_u,

+                          dst_bgra, dst_stride_bgra,

+                          &kYvuI601Constants,  // Use Yvu matrix

+                          width, height);

-// Convert I420 to RGBA.

-LIBYUV_API

-int I420ToRGBA(const uint8* src_y, int src_stride_y,

-               const uint8* src_u, int src_stride_u,

-               const uint8* src_v, int src_stride_v,

-               uint8* dst_rgba, int dst_stride_rgba,

-               int width, int height) {

+// Convert I420 to RGB24 with matrix

+static int I420ToRGB24Matrix(const uint8* src_y, int src_stride_y,

+                             const uint8* src_u, int src_stride_u,

+                             const uint8* src_v, int src_stride_v,

+                             uint8* dst_rgb24, int dst_stride_rgb24,

+                             const struct YuvConstants* yuvconstants,

+                             int width, int height) {

   int y;

-  void (*I422ToRGBARow)(const uint8* y_buf,

-                        const uint8* u_buf,

-                        const uint8* v_buf,

-                        uint8* rgb_buf,

-                        int width) = I422ToRGBARow_C;

-  if (!src_y || !src_u || !src_v || !dst_rgba ||

-      width <= 0 || height == 0) {

-    return -1;

-  }

-  // Negative height means invert the image.

-  if (height < 0) {

-    height = -height;

-    dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;

-    dst_stride_rgba = -dst_stride_rgba;

-  }

-#if defined(HAS_I422TORGBAROW_SSSE3)

-  if (TestCpuFlag(kCpuHasSSSE3)) {

-    I422ToRGBARow = I422ToRGBARow_Any_SSSE3;

-    if (IS_ALIGNED(width, 8)) {

-      I422ToRGBARow = I422ToRGBARow_SSSE3;

-    }

-  }

-#endif

-#if defined(HAS_I422TORGBAROW_AVX2)

-  if (TestCpuFlag(kCpuHasAVX2)) {

-    I422ToRGBARow = I422ToRGBARow_Any_AVX2;

-    if (IS_ALIGNED(width, 16)) {

-      I422ToRGBARow = I422ToRGBARow_AVX2;

-    }

-  }

-#endif

-#if defined(HAS_I422TORGBAROW_NEON)

-  if (TestCpuFlag(kCpuHasNEON)) {

-    I422ToRGBARow = I422ToRGBARow_Any_NEON;

-    if (IS_ALIGNED(width, 8)) {

-      I422ToRGBARow = I422ToRGBARow_NEON;

-    }

-  }

-#endif

-  for (y = 0; y < height; ++y) {

-    I422ToRGBARow(src_y, src_u, src_v, dst_rgba, width);

-    dst_rgba += dst_stride_rgba;

-    src_y += src_stride_y;

-    if (y & 1) {

-      src_u += src_stride_u;

-      src_v += src_stride_v;

-    }

-  }

-  return 0;

-}

-// Convert I420 to RGB24.

-LIBYUV_API

-int I420ToRGB24(const uint8* src_y, int src_stride_y,

-                const uint8* src_u, int src_stride_u,

-                const uint8* src_v, int src_stride_v,

-                uint8* dst_rgb24, int dst_stride_rgb24,

-                int width, int height) {

-  int y;

   void (*I422ToRGB24Row)(const uint8* y_buf,

                          const uint8* u_buf,

                          const uint8* v_buf,

                          uint8* rgb_buf,

+                         const struct YuvConstants* yuvconstants,

                          int width) = I422ToRGB24Row_C;

   if (!src_y || !src_u || !src_v || !dst_rgb24 ||

       width <= 0 || height == 0) {

@@ -757,7 +600,7 @@

 #endif

   for (y = 0; y < height; ++y) {

-    I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, width);

+    I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);

     dst_rgb24 += dst_stride_rgb24;

     src_y += src_stride_y;

     if (y & 1) {

@@ -768,64 +611,34 @@

   return 0;

-// Convert I420 to RAW.

+// Convert I420 to RGB24.

 LIBYUV_API

-int I420ToRAW(const uint8* src_y, int src_stride_y,

+int I420ToRGB24(const uint8* src_y, int src_stride_y,

                 const uint8* src_u, int src_stride_u,

                 const uint8* src_v, int src_stride_v,

-                uint8* dst_raw, int dst_stride_raw,

+                uint8* dst_rgb24, int dst_stride_rgb24,

                 int width, int height) {

-  int y;

-  void (*I422ToRAWRow)(const uint8* y_buf,

-                       const uint8* u_buf,

-                       const uint8* v_buf,

-                       uint8* rgb_buf,

-                       int width) = I422ToRAWRow_C;

-  if (!src_y || !src_u || !src_v || !dst_raw ||

-      width <= 0 || height == 0) {

-    return -1;

-  }

-  // Negative height means invert the image.

-  if (height < 0) {

-    height = -height;

-    dst_raw = dst_raw + (height - 1) * dst_stride_raw;

-    dst_stride_raw = -dst_stride_raw;

-  }

-#if defined(HAS_I422TORAWROW_SSSE3)

-  if (TestCpuFlag(kCpuHasSSSE3)) {

-    I422ToRAWRow = I422ToRAWRow_Any_SSSE3;

-    if (IS_ALIGNED(width, 8)) {

-      I422ToRAWRow = I422ToRAWRow_SSSE3;

-    }

-  }

-#endif

-#if defined(HAS_I422TORAWROW_AVX2)

-  if (TestCpuFlag(kCpuHasAVX2)) {

-    I422ToRAWRow = I422ToRAWRow_Any_AVX2;

-    if (IS_ALIGNED(width, 16)) {

-      I422ToRAWRow = I422ToRAWRow_AVX2;

-    }

-  }

-#endif

-#if defined(HAS_I422TORAWROW_NEON)

-  if (TestCpuFlag(kCpuHasNEON)) {

-    I422ToRAWRow = I422ToRAWRow_Any_NEON;

-    if (IS_ALIGNED(width, 8)) {

-      I422ToRAWRow = I422ToRAWRow_NEON;

-    }

-  }

-#endif

+  return I420ToRGB24Matrix(src_y, src_stride_y,

+                           src_u, src_stride_u,

+                           src_v, src_stride_v,

+                           dst_rgb24, dst_stride_rgb24,

+                           &kYuvI601Constants,

+                           width, height);

+}

-  for (y = 0; y < height; ++y) {

-    I422ToRAWRow(src_y, src_u, src_v, dst_raw, width);

-    dst_raw += dst_stride_raw;

-    src_y += src_stride_y;

-    if (y & 1) {

-      src_u += src_stride_u;

-      src_v += src_stride_v;

-    }

-  }

-  return 0;

+// Convert I420 to RAW.

+LIBYUV_API

+int I420ToRAW(const uint8* src_y, int src_stride_y,

+              const uint8* src_u, int src_stride_u,

+              const uint8* src_v, int src_stride_v,

+              uint8* dst_raw, int dst_stride_raw,

+              int width, int height) {

+  return I420ToRGB24Matrix(src_y, src_stride_y,

+                           src_v, src_stride_v,  // Swap U and V

+                           src_u, src_stride_u,

+                           dst_raw, dst_stride_raw,

+                           &kYvuI601Constants,  // Use Yvu matrix

+                           width, height);

 // Convert I420 to ARGB1555.

@@ -840,6 +653,7 @@

                             const uint8* u_buf,

                             const uint8* v_buf,

                             uint8* rgb_buf,

+                            const struct YuvConstants* yuvconstants,

                             int width) = I422ToARGB1555Row_C;

   if (!src_y || !src_u || !src_v || !dst_argb1555 ||

       width <= 0 || height == 0) {

@@ -877,7 +691,8 @@

 #endif

   for (y = 0; y < height; ++y) {

-    I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, width);

+    I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants,

+                      width);

     dst_argb1555 += dst_stride_argb1555;

     src_y += src_stride_y;

     if (y & 1) {

@@ -901,6 +716,7 @@

                             const uint8* u_buf,

                             const uint8* v_buf,

                             uint8* rgb_buf,

+                            const struct YuvConstants* yuvconstants,

                             int width) = I422ToARGB4444Row_C;

   if (!src_y || !src_u || !src_v || !dst_argb4444 ||

       width <= 0 || height == 0) {

@@ -938,7 +754,8 @@

 #endif

   for (y = 0; y < height; ++y) {

-    I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, width);

+    I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants,

+                      width);

     dst_argb4444 += dst_stride_argb4444;

     src_y += src_stride_y;

     if (y & 1) {

@@ -961,6 +778,7 @@

                           const uint8* u_buf,

                           const uint8* v_buf,

                           uint8* rgb_buf,

+                          const struct YuvConstants* yuvconstants,

                           int width) = I422ToRGB565Row_C;

   if (!src_y || !src_u || !src_v || !dst_rgb565 ||

       width <= 0 || height == 0) {

@@ -998,7 +816,7 @@

 #endif

   for (y = 0; y < height; ++y) {

-    I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, width);

+    I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width);

     dst_rgb565 += dst_stride_rgb565;

     src_y += src_stride_y;

     if (y & 1) {

@@ -1029,9 +847,10 @@

                         const uint8* u_buf,

                         const uint8* v_buf,

                         uint8* rgb_buf,

+                        const struct YuvConstants* yuvconstants,

                         int width) = I422ToARGBRow_C;

   void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,

-      const uint32 dither4, int pix) = ARGBToRGB565DitherRow_C;

+      const uint32 dither4, int width) = ARGBToRGB565DitherRow_C;

   if (!src_y || !src_u || !src_v || !dst_rgb565 ||

       width <= 0 || height == 0) {

     return -1;

@@ -1069,12 +888,12 @@

 #endif

-#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)

-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&

+#if defined(HAS_I422TOARGBROW_DSPR2)

+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&

       IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&

       IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&

       IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) {

-    I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;

+    I422ToARGBRow = I422ToARGBRow_DSPR2;

 #endif

 #if defined(HAS_ARGBTORGB565DITHERROW_SSE2)

@@ -1105,7 +924,7 @@

     // Allocate a row of argb.

     align_buffer_64(row_argb, width * 4);

     for (y = 0; y < height; ++y) {

-      I422ToARGBRow(src_y, src_u, src_v, row_argb, width);

+      I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width);

       ARGBToRGB565DitherRow(row_argb, dst_rgb565,

                             *(uint32*)(dither4x4 + ((y & 3) << 2)), width);

       dst_rgb565 += dst_stride_rgb565;

@@ -1258,7 +1077,6 @@

     // Triplanar formats

     // TODO(fbarchard): halfstride instead of halfwidth

     case FOURCC_I420:

-    case FOURCC_YU12:

     case FOURCC_YV12: {

       int halfwidth = (width + 1) / 2;

       int halfheight = (height + 1) / 2;

--- a/third_party/libyuv/source/convert_from_argb.cc

+++ b/third_party/libyuv/source/convert_from_argb.cc

@@ -28,10 +28,10 @@

                uint8* dst_v, int dst_stride_v,

                int width, int height) {

   int y;

-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =

+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =

       ARGBToYRow_C;

   void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,

-      int pix) = ARGBToUV444Row_C;

+      int width) = ARGBToUV444Row_C;

   if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {

     return -1;

@@ -109,13 +109,16 @@

                uint8* dst_v, int dst_stride_v,

                int width, int height) {

   int y;

-  void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,

-      int pix) = ARGBToUV422Row_C;

-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =

+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,

+      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;

+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =

       ARGBToYRow_C;

-  if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {

+  if (!src_argb ||

+      !dst_y || !dst_u || !dst_v ||

+      width <= 0 || height == 0) {

     return -1;

+  // Negative height means invert the image.

   if (height < 0) {

     height = -height;

     src_argb = src_argb + (height - 1) * src_stride_argb;

@@ -130,34 +133,22 @@

     height = 1;

     src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;

-#if defined(HAS_ARGBTOUV422ROW_SSSE3)

+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)

   if (TestCpuFlag(kCpuHasSSSE3)) {

-    ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;

-    if (IS_ALIGNED(width, 16)) {

-      ARGBToUV422Row = ARGBToUV422Row_SSSE3;

-    }

-  }

-#endif

-#if defined(HAS_ARGBTOUV422ROW_NEON)

-  if (TestCpuFlag(kCpuHasNEON)) {

-    ARGBToUV422Row = ARGBToUV422Row_Any_NEON;

-    if (IS_ALIGNED(width, 16)) {

-      ARGBToUV422Row = ARGBToUV422Row_NEON;

-    }

-  }

-#endif

-#if defined(HAS_ARGBTOYROW_SSSE3)

-  if (TestCpuFlag(kCpuHasSSSE3)) {

+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;

     ARGBToYRow = ARGBToYRow_Any_SSSE3;

     if (IS_ALIGNED(width, 16)) {

+      ARGBToUVRow = ARGBToUVRow_SSSE3;

       ARGBToYRow = ARGBToYRow_SSSE3;

 #endif

-#if defined(HAS_ARGBTOYROW_AVX2)

+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)

   if (TestCpuFlag(kCpuHasAVX2)) {

+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;

     ARGBToYRow = ARGBToYRow_Any_AVX2;

     if (IS_ALIGNED(width, 32)) {

+      ARGBToUVRow = ARGBToUVRow_AVX2;

       ARGBToYRow = ARGBToYRow_AVX2;

@@ -170,9 +161,17 @@

 #endif

+#if defined(HAS_ARGBTOUVROW_NEON)

+  if (TestCpuFlag(kCpuHasNEON)) {

+    ARGBToUVRow = ARGBToUVRow_Any_NEON;

+    if (IS_ALIGNED(width, 16)) {

+      ARGBToUVRow = ARGBToUVRow_NEON;

+    }

+  }

+#endif

   for (y = 0; y < height; ++y) {

-    ARGBToUV422Row(src_argb, dst_u, dst_v, width);

+    ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);

     ARGBToYRow(src_argb, dst_y, width);

     src_argb += src_stride_argb;

     dst_y += dst_stride_y;

@@ -191,8 +190,8 @@

                int width, int height) {

   int y;

   void (*ARGBToUV411Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,

-      int pix) = ARGBToUV411Row_C;

-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =

+      int width) = ARGBToUV411Row_C;

+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =

       ARGBToYRow_C;

   if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {

     return -1;

@@ -264,7 +263,7 @@

   int halfwidth = (width + 1) >> 1;

   void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,

                       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;

-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =

+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =

       ARGBToYRow_C;

   void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,

                       int width) = MergeUVRow_C;

@@ -373,7 +372,7 @@

   int halfwidth = (width + 1) >> 1;

   void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,

                       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;

-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =

+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =

       ARGBToYRow_C;

   void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,

                       int width) = MergeUVRow_C;

@@ -478,9 +477,9 @@

                uint8* dst_yuy2, int dst_stride_yuy2,

                int width, int height) {

   int y;

-  void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,

-      int pix) = ARGBToUV422Row_C;

-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =

+  void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb,

+      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;

+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =

       ARGBToYRow_C;

   void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,

       const uint8* src_v, uint8* dst_yuy2, int width) = I422ToYUY2Row_C;

@@ -502,34 +501,22 @@

     height = 1;

     src_stride_argb = dst_stride_yuy2 = 0;

-#if defined(HAS_ARGBTOUV422ROW_SSSE3)

+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)

   if (TestCpuFlag(kCpuHasSSSE3)) {

-    ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;

-    if (IS_ALIGNED(width, 16)) {

-      ARGBToUV422Row = ARGBToUV422Row_SSSE3;

-    }

-  }

-#endif

-#if defined(HAS_ARGBTOUV422ROW_NEON)

-  if (TestCpuFlag(kCpuHasNEON)) {

-    ARGBToUV422Row = ARGBToUV422Row_Any_NEON;

-    if (IS_ALIGNED(width, 16)) {

-      ARGBToUV422Row = ARGBToUV422Row_NEON;

-    }

-  }

-#endif

-#if defined(HAS_ARGBTOYROW_SSSE3)

-  if (TestCpuFlag(kCpuHasSSSE3)) {

+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;

     ARGBToYRow = ARGBToYRow_Any_SSSE3;

     if (IS_ALIGNED(width, 16)) {

+      ARGBToUVRow = ARGBToUVRow_SSSE3;

       ARGBToYRow = ARGBToYRow_SSSE3;

 #endif

-#if defined(HAS_ARGBTOYROW_AVX2)

+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)

   if (TestCpuFlag(kCpuHasAVX2)) {

+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;

     ARGBToYRow = ARGBToYRow_Any_AVX2;

     if (IS_ALIGNED(width, 32)) {

+      ARGBToUVRow = ARGBToUVRow_AVX2;

       ARGBToYRow = ARGBToYRow_AVX2;

@@ -542,7 +529,14 @@

 #endif

+#if defined(HAS_ARGBTOUVROW_NEON)

+  if (TestCpuFlag(kCpuHasNEON)) {

+    ARGBToUVRow = ARGBToUVRow_Any_NEON;

+    if (IS_ALIGNED(width, 16)) {

+      ARGBToUVRow = ARGBToUVRow_NEON;

+    }

+  }

+#endif

 #if defined(HAS_I422TOYUY2ROW_SSE2)

   if (TestCpuFlag(kCpuHasSSE2)) {

     I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;

@@ -567,7 +561,7 @@

     uint8* row_v = row_u + ((width + 63) & ~63) / 2;

     for (y = 0; y < height; ++y) {

-      ARGBToUV422Row(src_argb, row_u, row_v, width);

+      ARGBToUVRow(src_argb, 0, row_u, row_v, width);

       ARGBToYRow(src_argb, row_y, width);

       I422ToYUY2Row(row_y, row_u, row_v, dst_yuy2, width);

       src_argb += src_stride_argb;

@@ -585,9 +579,9 @@

                uint8* dst_uyvy, int dst_stride_uyvy,

                int width, int height) {

   int y;

-  void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,

-      int pix) = ARGBToUV422Row_C;

-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =

+  void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb,

+      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;

+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =

       ARGBToYRow_C;

   void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,

       const uint8* src_v, uint8* dst_uyvy, int width) = I422ToUYVYRow_C;

@@ -609,34 +603,22 @@

     height = 1;

     src_stride_argb = dst_stride_uyvy = 0;

-#if defined(HAS_ARGBTOUV422ROW_SSSE3)

+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)

   if (TestCpuFlag(kCpuHasSSSE3)) {

-    ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;

-    if (IS_ALIGNED(width, 16)) {

-      ARGBToUV422Row = ARGBToUV422Row_SSSE3;

-    }

-  }

-#endif

-#if defined(HAS_ARGBTOUV422ROW_NEON)

-  if (TestCpuFlag(kCpuHasNEON)) {

-    ARGBToUV422Row = ARGBToUV422Row_Any_NEON;

-    if (IS_ALIGNED(width, 16)) {

-      ARGBToUV422Row = ARGBToUV422Row_NEON;

-    }

-  }

-#endif

-#if defined(HAS_ARGBTOYROW_SSSE3)

-  if (TestCpuFlag(kCpuHasSSSE3)) {

+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;

     ARGBToYRow = ARGBToYRow_Any_SSSE3;

     if (IS_ALIGNED(width, 16)) {

+      ARGBToUVRow = ARGBToUVRow_SSSE3;

       ARGBToYRow = ARGBToYRow_SSSE3;

 #endif

-#if defined(HAS_ARGBTOYROW_AVX2)

+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)

   if (TestCpuFlag(kCpuHasAVX2)) {

+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;

     ARGBToYRow = ARGBToYRow_Any_AVX2;

     if (IS_ALIGNED(width, 32)) {

+      ARGBToUVRow = ARGBToUVRow_AVX2;

       ARGBToYRow = ARGBToYRow_AVX2;

@@ -649,7 +631,14 @@

 #endif

+#if defined(HAS_ARGBTOUVROW_NEON)

+  if (TestCpuFlag(kCpuHasNEON)) {

+    ARGBToUVRow = ARGBToUVRow_Any_NEON;

+    if (IS_ALIGNED(width, 16)) {

+      ARGBToUVRow = ARGBToUVRow_NEON;

+    }

+  }

+#endif

 #if defined(HAS_I422TOUYVYROW_SSE2)

   if (TestCpuFlag(kCpuHasSSE2)) {

     I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;

@@ -674,7 +663,7 @@

     uint8* row_v = row_u + ((width + 63) & ~63) / 2;

     for (y = 0; y < height; ++y) {

-      ARGBToUV422Row(src_argb, row_u, row_v, width);

+      ARGBToUVRow(src_argb, 0, row_u, row_v, width);

       ARGBToYRow(src_argb, row_y, width);

       I422ToUYVYRow(row_y, row_u, row_v, dst_uyvy, width);

       src_argb += src_stride_argb;

@@ -692,7 +681,7 @@

                uint8* dst_y, int dst_stride_y,

                int width, int height) {

   int y;

-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =

+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =

       ARGBToYRow_C;

   if (!src_argb || !dst_y || width <= 0 || height == 0) {

     return -1;

@@ -764,7 +753,7 @@

                 uint8* dst_rgb24, int dst_stride_rgb24,

                 int width, int height) {

   int y;

-  void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =

+  void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int width) =

       ARGBToRGB24Row_C;

   if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) {

     return -1;

@@ -812,7 +801,7 @@

               uint8* dst_raw, int dst_stride_raw,

               int width, int height) {

   int y;

-  void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix) =

+  void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int width) =

       ARGBToRAWRow_C;

   if (!src_argb || !dst_raw || width <= 0 || height == 0) {

     return -1;

@@ -869,7 +858,7 @@

                        const uint8* dither4x4, int width, int height) {

   int y;

   void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,

-      const uint32 dither4, int pix) = ARGBToRGB565DitherRow_C;

+      const uint32 dither4, int width) = ARGBToRGB565DitherRow_C;

   if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {

     return -1;

@@ -921,7 +910,7 @@

                  uint8* dst_rgb565, int dst_stride_rgb565,

                  int width, int height) {

   int y;

-  void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =

+  void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int width) =

       ARGBToRGB565Row_C;

   if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {

     return -1;

@@ -977,7 +966,7 @@

                    uint8* dst_argb1555, int dst_stride_argb1555,

                    int width, int height) {

   int y;

-  void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =

+  void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int width) =

       ARGBToARGB1555Row_C;

   if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) {

     return -1;

@@ -1033,7 +1022,7 @@

                    uint8* dst_argb4444, int dst_stride_argb4444,

                    int width, int height) {

   int y;

-  void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =

+  void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int width) =

       ARGBToARGB4444Row_C;

   if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) {

     return -1;

@@ -1093,7 +1082,7 @@

   int y;

   void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,

                        uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;

-  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) =

+  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =

       ARGBToYJRow_C;

   if (!src_argb ||

       !dst_yj || !dst_u || !dst_v ||

@@ -1157,21 +1146,24 @@

   return 0;

-// ARGB little endian (bgra in memory) to J422

+// Convert ARGB to J422 (JPEG full-range I422).

 LIBYUV_API

 int ARGBToJ422(const uint8* src_argb, int src_stride_argb,

-               uint8* dst_y, int dst_stride_y,

+               uint8* dst_yj, int dst_stride_yj,

                uint8* dst_u, int dst_stride_u,

                uint8* dst_v, int dst_stride_v,

                int width, int height) {

   int y;

-  void (*ARGBToUVJ422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,

-      int pix) = ARGBToUVJ422Row_C;

-  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_y, int pix) =

+  void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,

+                       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;

+  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =

       ARGBToYJRow_C;

-  if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {

+  if (!src_argb ||

+      !dst_yj || !dst_u || !dst_v ||

+      width <= 0 || height == 0) {

     return -1;

+  // Negative height means invert the image.

   if (height < 0) {

     height = -height;

     src_argb = src_argb + (height - 1) * src_stride_argb;

@@ -1179,34 +1171,19 @@

   // Coalesce rows.

   if (src_stride_argb == width * 4 &&

-      dst_stride_y == width &&

+      dst_stride_yj == width &&

       dst_stride_u * 2 == width &&

       dst_stride_v * 2 == width) {

     width *= height;

     height = 1;

-    src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;

+    src_stride_argb = dst_stride_yj = dst_stride_u = dst_stride_v = 0;

-#if defined(HAS_ARGBTOUVJ422ROW_SSSE3)

+#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)

   if (TestCpuFlag(kCpuHasSSSE3)) {

-    ARGBToUVJ422Row = ARGBToUVJ422Row_Any_SSSE3;

-    if (IS_ALIGNED(width, 16)) {

-      ARGBToUVJ422Row = ARGBToUVJ422Row_SSSE3;

-    }

-  }

-#endif

-#if defined(HAS_ARGBTOUVJ422ROW_NEON)

-  if (TestCpuFlag(kCpuHasNEON)) {

-    ARGBToUVJ422Row = ARGBToUVJ422Row_Any_NEON;

-    if (IS_ALIGNED(width, 16)) {

-      ARGBToUVJ422Row = ARGBToUVJ422Row_NEON;

-    }

-  }

-#endif

-#if defined(HAS_ARGBTOYJROW_SSSE3)

-  if (TestCpuFlag(kCpuHasSSSE3)) {

+    ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;

     ARGBToYJRow = ARGBToYJRow_Any_SSSE3;

     if (IS_ALIGNED(width, 16)) {

+      ARGBToUVJRow = ARGBToUVJRow_SSSE3;

       ARGBToYJRow = ARGBToYJRow_SSSE3;

@@ -1227,12 +1204,20 @@

 #endif

+#if defined(HAS_ARGBTOUVJROW_NEON)

+  if (TestCpuFlag(kCpuHasNEON)) {

+    ARGBToUVJRow = ARGBToUVJRow_Any_NEON;

+    if (IS_ALIGNED(width, 16)) {

+      ARGBToUVJRow = ARGBToUVJRow_NEON;

+    }

+  }

+#endif

   for (y = 0; y < height; ++y) {

-    ARGBToUVJ422Row(src_argb, dst_u, dst_v, width);

-    ARGBToYJRow(src_argb, dst_y, width);

+    ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width);

+    ARGBToYJRow(src_argb, dst_yj, width);

     src_argb += src_stride_argb;

-    dst_y += dst_stride_y;

+    dst_yj += dst_stride_yj;

     dst_u += dst_stride_u;

     dst_v += dst_stride_v;

@@ -1245,7 +1230,7 @@

                uint8* dst_yj, int dst_stride_yj,

                int width, int height) {

   int y;

-  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) =

+  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =

       ARGBToYJRow_C;

   if (!src_argb || !dst_yj || width <= 0 || height == 0) {

     return -1;

--- a/third_party/libyuv/source/convert_jpeg.cc

+++ b/third_party/libyuv/source/convert_jpeg.cc

@@ -9,6 +9,7 @@

*/

 #include "libyuv/convert.h"

+#include "libyuv/convert_argb.h"

 #ifdef HAVE_JPEG

 #include "libyuv/mjpeg_decoder.h"

--- a/third_party/libyuv/source/convert_to_argb.cc

+++ b/third_party/libyuv/source/convert_to_argb.cc

@@ -23,7 +23,7 @@

 extern "C" {

 #endif

-// Convert camera sample to I420 with cropping, rotation and vertical flip.

+// Convert camera sample to ARGB with cropping, rotation and vertical flip.

 // src_width is used for source stride computation

 // src_height is used to compute location of planes, and indicate inversion

 // sample_size is measured in bytes and is the size of the frame.

@@ -51,8 +51,8 @@

   // also enable temporary buffer.

   LIBYUV_BOOL need_buf = (rotation && format != FOURCC_ARGB) ||

       crop_argb == sample;

-  uint8* tmp_argb = crop_argb;

-  int tmp_argb_stride = argb_stride;

+  uint8* dest_argb = crop_argb;

+  int dest_argb_stride = argb_stride;

   uint8* rotate_buffer = NULL;

   int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;

@@ -66,13 +66,13 @@

   if (need_buf) {

-    int argb_size = crop_width * abs_crop_height * 4;

+    int argb_size = crop_width * 4 * abs_crop_height;

     rotate_buffer = (uint8*)malloc(argb_size);

     if (!rotate_buffer) {

       return 1;  // Out of memory runtime error.

     crop_argb = rotate_buffer;

-    argb_stride = crop_width;

+    argb_stride = crop_width * 4;

   switch (format) {

@@ -176,7 +176,6 @@

       break;

     // Triplanar formats

     case FOURCC_I420:

-    case FOURCC_YU12:

     case FOURCC_YV12: {

       const uint8* src_y = sample + (src_width * crop_y + crop_x);

       const uint8* src_u;

@@ -291,7 +290,7 @@

   if (need_buf) {

     if (!r) {

       r = ARGBRotate(crop_argb, argb_stride,

-                     tmp_argb, tmp_argb_stride,

+                     dest_argb, dest_argb_stride,

                      crop_width, abs_crop_height, rotation);

     free(rotate_buffer);

--- a/third_party/libyuv/source/convert_to_i420.cc

+++ b/third_party/libyuv/source/convert_to_i420.cc

@@ -39,12 +39,13 @@

   int aligned_src_width = (src_width + 1) & ~1;

   const uint8* src;

   const uint8* src_uv;

-  int abs_src_height = (src_height < 0) ? -src_height : src_height;

-  int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;

+  const int abs_src_height = (src_height < 0) ? -src_height : src_height;

+  // TODO(nisse): Why allow crop_height < 0?

+  const int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;

   int r = 0;

   LIBYUV_BOOL need_buf = (rotation && format != FOURCC_I420 &&

       format != FOURCC_NV12 && format != FOURCC_NV21 &&

-      format != FOURCC_YU12 && format != FOURCC_YV12) || y == sample;

+      format != FOURCC_YV12) || y == sample;

   uint8* tmp_y = y;

   uint8* tmp_u = u;

   uint8* tmp_v = v;

@@ -52,7 +53,8 @@

   int tmp_u_stride = u_stride;

   int tmp_v_stride = v_stride;

   uint8* rotate_buffer = NULL;

-  int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;

+  const int inv_crop_height =

+      (src_height < 0) ? -abs_crop_height : abs_crop_height;

   if (!y || !u || !v || !sample ||

       src_width <= 0 || crop_width <= 0  ||

@@ -59,9 +61,6 @@

       src_height == 0 || crop_height == 0) {

     return -1;

-  if (src_height < 0) {

-    inv_crop_height = -inv_crop_height;

-  }

   // One pass rotation is available for some formats. For the rest, convert

   // to I420 (with optional vertical flipping) into a temporary I420 buffer,

@@ -214,7 +213,6 @@

       break;

     // Triplanar formats

     case FOURCC_I420:

-    case FOURCC_YU12:

     case FOURCC_YV12: {

       const uint8* src_y = sample + (src_width * crop_y + crop_x);

       const uint8* src_u;

--- a/third_party/libyuv/source/cpu_id.cc

+++ b/third_party/libyuv/source/cpu_id.cc

@@ -10,12 +10,12 @@

 #include "libyuv/cpu_id.h"

-#if (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)

+#if defined(_MSC_VER)

 #include <intrin.h>  // For __cpuidex()

 #endif

 #if !defined(__pnacl__) && !defined(__CLR_VER) && \

     !defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \

-    defined(_MSC_VER) && !defined(__clang__) && (_MSC_FULL_VER >= 160040219)

+    defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)

 #include <immintrin.h>  // For _xgetbv()

 #endif

@@ -36,7 +36,8 @@

 // For functions that use the stack and have runtime checks for overflow,

 // use SAFEBUFFERS to avoid additional check.

-#if (defined(_MSC_VER) && !defined(__clang__)) && (_MSC_FULL_VER >= 160040219)

+#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) && \

+    !defined(__clang__)

 #define SAFEBUFFERS __declspec(safebuffers)

 #else

 #define SAFEBUFFERS

@@ -48,9 +49,9 @@

     !defined(__pnacl__) && !defined(__CLR_VER)

 LIBYUV_API

 void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {

-#if (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)

+#if defined(_MSC_VER)

 // Visual C version uses intrinsic or inline x86 assembly.

-#if (_MSC_FULL_VER >= 160040219)

+#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)

   __cpuidex((int*)(cpu_info), info_eax, info_ecx);

 #elif defined(_M_IX86)

   __asm {

@@ -63,7 +64,7 @@

     mov        [edi + 8], ecx

     mov        [edi + 12], edx

-#else

+#else  // Visual C but not x86

   if (info_ecx == 0) {

     __cpuid((int*)(cpu_info), info_eax);

   } else {

@@ -71,9 +72,9 @@

 #endif

 // GCC version uses inline x86 assembly.

-#else  // (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)

+#else  // defined(_MSC_VER)

   uint32 info_ebx, info_edx;

-  asm volatile (  // NOLINT

+  asm volatile (

 #if defined( __i386__) && defined(__PIC__)

     // Preserve ebx for fpic 32 bit.

     "mov %%ebx, %%edi                          \n"

@@ -89,7 +90,7 @@

   cpu_info[1] = info_ebx;

   cpu_info[2] = info_ecx;

   cpu_info[3] = info_edx;

-#endif  // (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)

+#endif  // defined(_MSC_VER)

 #else  // (defined(_M_IX86) || defined(_M_X64) ...

 LIBYUV_API

@@ -98,28 +99,37 @@

 #endif

-// TODO(fbarchard): Enable xgetbv when validator supports it.

+// For VS2010 and earlier emit can be used:

+//   _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0  // For VS2010 and earlier.

+//  __asm {

+//    xor        ecx, ecx    // xcr 0

+//    xgetbv

+//    mov        xcr0, eax

+//  }

+// For 32-bit builds with VS2013 and earlier, the optimizer produces bad code for _xgetbv(0).

+// https://code.google.com/p/libyuv/issues/detail?id=529

+#if defined(_M_IX86) && (_MSC_VER < 1900)

+#pragma optimize("g", off)

+#endif

 #if (defined(_M_IX86) || defined(_M_X64) || \

     defined(__i386__) || defined(__x86_64__)) && \

     !defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__)

 #define HAS_XGETBV

 // X86 CPUs have xgetbv to detect OS saves high parts of ymm registers.

-int TestOsSaveYmm() {

+int GetXCR0() {

   uint32 xcr0 = 0u;

-#if (defined(_MSC_VER) && !defined(__clang__)) && (_MSC_FULL_VER >= 160040219)

+#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)

   xcr0 = (uint32)(_xgetbv(0));  // VS2010 SP1 required.

-#elif defined(_M_IX86) && defined(_MSC_VER) && !defined(__clang__)

-  __asm {

-    xor        ecx, ecx    // xcr 0

-    _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0  // For VS2010 and earlier.

-    mov        xcr0, eax

-  }

 #elif defined(__i386__) || defined(__x86_64__)

   asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");

 #endif  // defined(__i386__) || defined(__x86_64__)

-  return((xcr0 & 6) == 6);  // Is ymm saved?

+  return xcr0;

 #endif  // defined(_M_IX86) || defined(_M_X64) ..

+// Return optimization to previous setting.

+#if defined(_M_IX86) && (_MSC_VER < 1900)

+#pragma optimize("g", on)

+#endif

 // based on libvpx arm_cpudetect.c

 // For Arm, but public to allow testing on any CPU

@@ -151,30 +161,9 @@

   return 0;

-#if defined(__mips__) && defined(__linux__)

-static int MipsCpuCaps(const char* search_string) {

-  char cpuinfo_line[512];

-  const char* file_name = "/proc/cpuinfo";

-  FILE* f = fopen(file_name, "r");

-  if (!f) {

-    // Assume DSP if /proc/cpuinfo is unavailable.

-    // This will occur for Chrome sandbox for Pepper or Render process.

-    return kCpuHasMIPS_DSP;

-  }

-  while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f) != NULL) {

-    if (strstr(cpuinfo_line, search_string) != NULL) {

-      fclose(f);

-      return kCpuHasMIPS_DSP;

-    }

-  }

-  fclose(f);

-  return 0;

-}

-#endif

 // CPU detect function for SIMD instruction sets.

 LIBYUV_API

-int cpu_info_ = kCpuInit;  // cpu_info is not initialized yet.

+int cpu_info_ = 0;  // cpu_info is not initialized yet.

 // Test environment variable for disabling CPU features. Any non-zero value

 // to disable. Zero ignored to make it easy to set the variable on/off.

@@ -197,8 +186,9 @@

 LIBYUV_API SAFEBUFFERS

 int InitCpuFlags(void) {

+  // TODO(fbarchard): swap kCpuInit logic so 0 means uninitialized.

+  int cpu_info = 0;

 #if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86)

   uint32 cpu_info0[4] = { 0, 0, 0, 0 };

   uint32 cpu_info1[4] = { 0, 0, 0, 0 };

   uint32 cpu_info7[4] = { 0, 0, 0, 0 };

@@ -207,67 +197,67 @@

   if (cpu_info0[0] >= 7) {

     CpuId(7, 0, cpu_info7);

-  cpu_info_ = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |

-              ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |

-              ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |

-              ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |

-              ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) |

-              ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |

-              kCpuHasX86;

+  cpu_info = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |

+             ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |

+             ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |

+             ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |

+             ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) |

+             ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |

+             kCpuHasX86;

 #ifdef HAS_XGETBV

-  if ((cpu_info1[2] & 0x18000000) == 0x18000000 &&  // AVX and OSSave

-      TestOsSaveYmm()) {  // Saves YMM.

-    cpu_info_ |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) |

-                 kCpuHasAVX;

+  // AVX requires that the CPU reports AVX, XSAVE and OSXSAVE (the latter two are needed for xgetbv).

+  if (((cpu_info1[2] & 0x1c000000) == 0x1c000000) &&  // AVX and OSXSave

+      ((GetXCR0() & 6) == 6)) {  // Test OS saves YMM registers

+    cpu_info |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | kCpuHasAVX;

+    // Detect AVX512bw

+    if ((GetXCR0() & 0xe0) == 0xe0) {

+      cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX3 : 0;

+    }

 #endif

   // Environment variable overrides for testing.

   if (TestEnv("LIBYUV_DISABLE_X86")) {

-    cpu_info_ &= ~kCpuHasX86;

+    cpu_info &= ~kCpuHasX86;

   if (TestEnv("LIBYUV_DISABLE_SSE2")) {

-    cpu_info_ &= ~kCpuHasSSE2;

+    cpu_info &= ~kCpuHasSSE2;

   if (TestEnv("LIBYUV_DISABLE_SSSE3")) {

-    cpu_info_ &= ~kCpuHasSSSE3;

+    cpu_info &= ~kCpuHasSSSE3;

   if (TestEnv("LIBYUV_DISABLE_SSE41")) {

-    cpu_info_ &= ~kCpuHasSSE41;

+    cpu_info &= ~kCpuHasSSE41;

   if (TestEnv("LIBYUV_DISABLE_SSE42")) {

-    cpu_info_ &= ~kCpuHasSSE42;

+    cpu_info &= ~kCpuHasSSE42;

   if (TestEnv("LIBYUV_DISABLE_AVX")) {

-    cpu_info_ &= ~kCpuHasAVX;

+    cpu_info &= ~kCpuHasAVX;

   if (TestEnv("LIBYUV_DISABLE_AVX2")) {

-    cpu_info_ &= ~kCpuHasAVX2;

+    cpu_info &= ~kCpuHasAVX2;

   if (TestEnv("LIBYUV_DISABLE_ERMS")) {

-    cpu_info_ &= ~kCpuHasERMS;

+    cpu_info &= ~kCpuHasERMS;

   if (TestEnv("LIBYUV_DISABLE_FMA3")) {

-    cpu_info_ &= ~kCpuHasFMA3;

+    cpu_info &= ~kCpuHasFMA3;

+  if (TestEnv("LIBYUV_DISABLE_AVX3")) {

+    cpu_info &= ~kCpuHasAVX3;

+  }

 #endif

 #if defined(__mips__) && defined(__linux__)

-  // Linux mips parse text file for dsp detect.

-  cpu_info_ = MipsCpuCaps("dsp");  // set kCpuHasMIPS_DSP.

 #if defined(__mips_dspr2)

-  cpu_info_ |= kCpuHasMIPS_DSPR2;

+  cpu_info |= kCpuHasDSPR2;

 #endif

-  cpu_info_ |= kCpuHasMIPS;

-  if (getenv("LIBYUV_DISABLE_MIPS")) {

-    cpu_info_ &= ~kCpuHasMIPS;

+  cpu_info |= kCpuHasMIPS;

+  if (getenv("LIBYUV_DISABLE_DSPR2")) {

+    cpu_info &= ~kCpuHasDSPR2;

-  if (getenv("LIBYUV_DISABLE_MIPS_DSP")) {

-    cpu_info_ &= ~kCpuHasMIPS_DSP;

-  }

-  if (getenv("LIBYUV_DISABLE_MIPS_DSPR2")) {

-    cpu_info_ &= ~kCpuHasMIPS_DSPR2;

-  }

 #endif

 #if defined(__arm__) || defined(__aarch64__)

 // gcc -mfpu=neon defines __ARM_NEON__

@@ -274,28 +264,31 @@

 // __ARM_NEON__ generates code that requires Neon.  NaCL also requires Neon.

 // For Linux, /proc/cpuinfo can be tested but without that assume Neon.

 #if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__)

-  cpu_info_ = kCpuHasNEON;

+  cpu_info = kCpuHasNEON;

 // For aarch64(arm64), /proc/cpuinfo's feature is not complete, e.g. no neon

 // flag in it.

 // So for aarch64, neon enabling is hard coded here.

 #endif

 #if defined(__aarch64__)

-  cpu_info_ = kCpuHasNEON;

+  cpu_info = kCpuHasNEON;

 #else

   // Linux arm parse text file for neon detect.

-  cpu_info_ = ArmCpuCaps("/proc/cpuinfo");

+  cpu_info = ArmCpuCaps("/proc/cpuinfo");

 #endif

-  cpu_info_ |= kCpuHasARM;

+  cpu_info |= kCpuHasARM;

   if (TestEnv("LIBYUV_DISABLE_NEON")) {

-    cpu_info_ &= ~kCpuHasNEON;

+    cpu_info &= ~kCpuHasNEON;

 #endif  // __arm__

   if (TestEnv("LIBYUV_DISABLE_ASM")) {

-    cpu_info_ = 0;

+    cpu_info = 0;

-  return cpu_info_;

+  cpu_info  |= kCpuInitialized;

+  cpu_info_ = cpu_info;

+  return cpu_info;

+// Note that use of this function is not thread safe.

 LIBYUV_API

 void MaskCpuFlags(int enable_flags) {

   cpu_info_ = InitCpuFlags() & enable_flags;

--- a/third_party/libyuv/source/mjpeg_decoder.cc

+++ b/third_party/libyuv/source/mjpeg_decoder.cc

@@ -59,8 +59,7 @@

 // Methods that are passed to jpeglib.

 boolean fill_input_buffer(jpeg_decompress_struct* cinfo);

 void init_source(jpeg_decompress_struct* cinfo);

-void skip_input_data(jpeg_decompress_struct* cinfo,

-                     long num_bytes);  // NOLINT

+void skip_input_data(jpeg_decompress_struct* cinfo, long num_bytes);  // NOLINT

 void term_source(jpeg_decompress_struct* cinfo);

 void ErrorHandler(jpeg_common_struct* cinfo);

@@ -429,8 +428,7 @@

   return TRUE;

-void skip_input_data(j_decompress_ptr cinfo,

-                     long num_bytes) {  // NOLINT

+void skip_input_data(j_decompress_ptr cinfo, long num_bytes) {  // NOLINT

   cinfo->src->next_input_byte += num_bytes;

--- a/third_party/libyuv/source/mjpeg_validate.cc

+++ b/third_party/libyuv/source/mjpeg_validate.cc

@@ -17,51 +17,22 @@

 extern "C" {

 #endif

-// Enable this to try scasb implementation.

-// #define ENABLE_SCASB 1

-#ifdef ENABLE_SCASB

-// Multiple of 1.

-__declspec(naked)

-const uint8* ScanRow_ERMS(const uint8* src, uint32 val, int count) {

-  __asm {

-    mov        edx, edi

-    mov        edi, [esp + 4]   // src

-    mov        eax, [esp + 8]   // val

-    mov        ecx, [esp + 12]  // count

-    repne scasb

-    jne        sr99

-    mov        eax, edi

-    sub        eax, 1

-    mov        edi, edx

-    ret

-  sr99:

-    mov        eax, 0

-    mov        edi, edx

-    ret

-  }

-}

-#endif

-// Helper function to scan for EOI marker.

+// Helper function to scan for EOI marker (0xff 0xd9).

 static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) {

-  const uint8* end = sample + sample_size - 1;

-  const uint8* it = sample;

-  for (;;) {

-#ifdef ENABLE_SCASB

-    it = ScanRow_ERMS(it, 0xff, end - it);

-#else

-    it = static_cast<const uint8*>(memchr(it, 0xff, end - it));

-#endif

-    if (it == NULL) {

-      break;

+  if (sample_size >= 2) {

+    const uint8* end = sample + sample_size - 1;

+    const uint8* it = sample;

+    while (it < end) {

+      // TODO(fbarchard): scan for 0xd9 instead.

+      it = static_cast<const uint8 *>(memchr(it, 0xff, end - it));

+      if (it == NULL) {

+        break;

+      }

+      if (it[1] == 0xd9) {

+        return LIBYUV_TRUE;  // Success: Valid jpeg.

+      }

+      ++it;  // Skip over current 0xff.

-    if (it[1] == 0xd9) {

-      return LIBYUV_TRUE;  // Success: Valid jpeg.

-    }

-    ++it;  // Skip over current 0xff.

   // ERROR: Invalid jpeg end code not found. Size sample_size

   return LIBYUV_FALSE;

@@ -69,20 +40,19 @@

 // Helper function to validate the jpeg appears intact.

 LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) {

+  // Maximum size that ValidateJpeg will consider valid.

+  const size_t kMaxJpegSize = 0x7fffffffull;

   const size_t kBackSearchSize = 1024;

-  if (sample_size < 64) {

+  if (sample_size < 64 || sample_size > kMaxJpegSize || !sample) {

     // ERROR: Invalid jpeg size: sample_size

     return LIBYUV_FALSE;

-  if (sample[0] != 0xff || sample[1] != 0xd8) {  // Start Of Image

+  if (sample[0] != 0xff || sample[1] != 0xd8) {  // SOI marker

     // ERROR: Invalid jpeg initial start code

     return LIBYUV_FALSE;

-  // Step over SOI marker.

-  sample += 2;

-  sample_size -= 2;

-  // Look for the End Of Image (EOI) marker in the end kilobyte of the buffer.

+  // Look for the End Of Image (EOI) marker near the end of the buffer.

   if (sample_size > kBackSearchSize) {

     if (ScanEOI(sample + sample_size - kBackSearchSize, kBackSearchSize)) {

       return LIBYUV_TRUE;  // Success: Valid jpeg.

@@ -90,8 +60,8 @@

     // Reduce search size for forward search.

     sample_size = sample_size - kBackSearchSize + 1;

-  return ScanEOI(sample, sample_size);

+  // Step over SOI marker and scan for EOI.

+  return ScanEOI(sample + 2, sample_size - 2);

 #ifdef __cplusplus

--- a/third_party/libyuv/source/planar_functions.cc

+++ b/third_party/libyuv/source/planar_functions.cc

@@ -17,6 +17,7 @@

 #include "libyuv/mjpeg_decoder.h"

 #endif

 #include "libyuv/row.h"

+#include "libyuv/scale_row.h"  // for ScaleRowDown2

 #ifdef __cplusplus

 namespace libyuv {

@@ -237,14 +238,6 @@

 #endif

-#if defined(HAS_MIRRORROW_SSE2)

-  if (TestCpuFlag(kCpuHasSSE2)) {

-    MirrorRow = MirrorRow_Any_SSE2;

-    if (IS_ALIGNED(width, 16)) {

-      MirrorRow = MirrorRow_SSE2;

-    }

-  }

-#endif

 #if defined(HAS_MIRRORROW_SSSE3)

   if (TestCpuFlag(kCpuHasSSSE3)) {

     MirrorRow = MirrorRow_Any_SSSE3;

@@ -262,11 +255,11 @@

 #endif

 // TODO(fbarchard): Mirror on mips handle unaligned memory.

-#if defined(HAS_MIRRORROW_MIPS_DSPR2)

-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&

+#if defined(HAS_MIRRORROW_DSPR2)

+  if (TestCpuFlag(kCpuHasDSPR2) &&

       IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&

       IS_ALIGNED(dst_y, 4) && IS_ALIGNED(dst_stride_y, 4)) {

-    MirrorRow = MirrorRow_MIPS_DSPR2;

+    MirrorRow = MirrorRow_DSPR2;

 #endif

@@ -287,9 +280,9 @@

                int width, int height) {

   int y;

   void (*YUY2ToUV422Row)(const uint8* src_yuy2,

-                         uint8* dst_u, uint8* dst_v, int pix) =

+                         uint8* dst_u, uint8* dst_v, int width) =

       YUY2ToUV422Row_C;

-  void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int pix) =

+  void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int width) =

       YUY2ToYRow_C;

   // Negative height means invert the image.

   if (height < 0) {

@@ -359,10 +352,10 @@

                int width, int height) {

   int y;

   void (*UYVYToUV422Row)(const uint8* src_uyvy,

-                         uint8* dst_u, uint8* dst_v, int pix) =

+                         uint8* dst_u, uint8* dst_v, int width) =

       UYVYToUV422Row_C;

   void (*UYVYToYRow)(const uint8* src_uyvy,

-                     uint8* dst_y, int pix) = UYVYToYRow_C;

+                     uint8* dst_y, int width) = UYVYToYRow_C;

   // Negative height means invert the image.

   if (height < 0) {

     height = -height;

@@ -541,11 +534,6 @@

     return ARGBBlendRow;

 #endif

-#if defined(HAS_ARGBBLENDROW_SSE2)

-  if (TestCpuFlag(kCpuHasSSE2)) {

-    ARGBBlendRow = ARGBBlendRow_SSE2;

-  }

-#endif

 #if defined(HAS_ARGBBLENDROW_NEON)

   if (TestCpuFlag(kCpuHasNEON)) {

     ARGBBlendRow = ARGBBlendRow_NEON;

@@ -590,6 +578,179 @@

   return 0;

+// Alpha blend one plane: combines src_y0 and src_y1 row by row via BlendPlaneRow using the alpha plane, storing to dst_y. Returns 0 on success, -1 on a null pointer, width <= 0 or height == 0.

+LIBYUV_API

+int BlendPlane(const uint8* src_y0, int src_stride_y0,

+               const uint8* src_y1, int src_stride_y1,

+               const uint8* alpha, int alpha_stride,

+               uint8* dst_y, int dst_stride_y,

+               int width, int height) {

+  int y;  // Row counter.

+  void (*BlendPlaneRow)(const uint8* src0, const uint8* src1,

+      const uint8* alpha, uint8* dst, int width) = BlendPlaneRow_C;  // Portable C row function is the default.

+  if (!src_y0 || !src_y1 || !alpha || !dst_y || width <= 0 || height == 0) {

+    return -1;  // Invalid arguments.

+  }

+  // Negative height means invert the image: only the destination is flipped, by starting at its last row and stepping backwards.

+  if (height < 0) {

+    height = -height;

+    dst_y = dst_y + (height - 1) * dst_stride_y;

+    dst_stride_y = -dst_stride_y;

+  }

+  // Coalesce rows: when every stride equals width the planes are contiguous, so process them as one long row.

+  if (src_stride_y0 == width &&

+      src_stride_y1 == width &&

+      alpha_stride == width &&

+      dst_stride_y == width) {

+    width *= height;

+    height = 1;

+    src_stride_y0 = src_stride_y1 = alpha_stride = dst_stride_y = 0;  // Single pass, so strides are no longer needed.

+  }

+#if defined(HAS_BLENDPLANEROW_SSSE3)

+  if (TestCpuFlag(kCpuHasSSSE3)) {

+  BlendPlaneRow = BlendPlaneRow_Any_SSSE3;  // NOTE(review): _Any_ variant presumably accepts arbitrary widths - confirm.

+    if (IS_ALIGNED(width, 8)) {

+      BlendPlaneRow = BlendPlaneRow_SSSE3;  // Chosen only when width is a multiple of 8.

+    }

+  }

+#endif

+#if defined(HAS_BLENDPLANEROW_AVX2)

+  if (TestCpuFlag(kCpuHasAVX2)) {

+  BlendPlaneRow = BlendPlaneRow_Any_AVX2;  // Overrides any SSSE3 choice made above.

+    if (IS_ALIGNED(width, 32)) {

+      BlendPlaneRow = BlendPlaneRow_AVX2;  // Chosen only when width is a multiple of 32.

+    }

+  }

+#endif

+  for (y = 0; y < height; ++y) {

+    BlendPlaneRow(src_y0, src_y1, alpha, dst_y, width);

+    src_y0 += src_stride_y0;

+    src_y1 += src_stride_y1;

+    alpha += alpha_stride;

+    dst_y += dst_stride_y;

+  }

+  return 0;

+}

+#define MAXTWIDTH 2048

+// Alpha blend two I420 images and store to destination: Y is blended at full resolution via BlendPlane, U and V with alpha box-downsampled 2x2. Returns 0 on success, -1 on invalid arguments.

+LIBYUV_API

+int I420Blend(const uint8* src_y0, int src_stride_y0,

+              const uint8* src_u0, int src_stride_u0,

+              const uint8* src_v0, int src_stride_v0,

+              const uint8* src_y1, int src_stride_y1,

+              const uint8* src_u1, int src_stride_u1,

+              const uint8* src_v1, int src_stride_v1,

+              const uint8* alpha, int alpha_stride,

+              uint8* dst_y, int dst_stride_y,

+              uint8* dst_u, int dst_stride_u,

+              uint8* dst_v, int dst_stride_v,

+              int width, int height) {

+  int y;  // Luma row counter; the chroma loop below advances it by 2.

+  // Half width (rounded up) of the subsampled U/V planes; half height is handled by stepping y by 2.

+  int halfwidth = (width + 1) >> 1;

+  void (*BlendPlaneRow)(const uint8* src0, const uint8* src1,

+      const uint8* alpha, uint8* dst, int width) = BlendPlaneRow_C;  // Portable C default for the chroma rows.

+  void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,

+                        uint8* dst_ptr, int dst_width) = ScaleRowDown2Box_C;  // Portable C default for alpha downsampling.

+  if (!src_y0 || !src_u0 || !src_v0 || !src_y1 || !src_u1 || !src_v1 ||

+      !alpha || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {

+    return -1;  // Invalid arguments.

+  }

+  // Negative height means invert the image. NOTE(review): only dst_y is flipped here; dst_u/dst_v are still written top-down - confirm this is intended.

+  if (height < 0) {

+    height = -height;

+    dst_y = dst_y + (height - 1) * dst_stride_y;

+    dst_stride_y = -dst_stride_y;

+  }

+  // Blend Y plane at full resolution (the return value of BlendPlane is not checked).

+  BlendPlane(src_y0, src_stride_y0,

+             src_y1, src_stride_y1,

+             alpha, alpha_stride,

+             dst_y, dst_stride_y,

+             width, height);

+#if defined(HAS_BLENDPLANEROW_SSSE3)

+  if (TestCpuFlag(kCpuHasSSSE3)) {

+    BlendPlaneRow = BlendPlaneRow_Any_SSSE3;

+    if (IS_ALIGNED(halfwidth, 8)) {

+      BlendPlaneRow = BlendPlaneRow_SSSE3;  // Chroma rows are halfwidth wide, so alignment is tested on halfwidth.

+    }

+  }

+#endif

+#if defined(HAS_BLENDPLANEROW_AVX2)

+  if (TestCpuFlag(kCpuHasAVX2)) {

+    BlendPlaneRow = BlendPlaneRow_Any_AVX2;  // Overrides any SSSE3 choice made above.

+    if (IS_ALIGNED(halfwidth, 32)) {

+      BlendPlaneRow = BlendPlaneRow_AVX2;

+    }

+  }

+#endif

+  if (!IS_ALIGNED(width, 2)) {

+    ScaleRowDown2 = ScaleRowDown2Box_Odd_C;  // Odd source width selects the _Odd_ scaler variant.

+  }

+#if defined(HAS_SCALEROWDOWN2_NEON)

+  if (TestCpuFlag(kCpuHasNEON)) {

+    ScaleRowDown2 = ScaleRowDown2Box_Odd_NEON;  // Used as-is when width is odd.

+    if (IS_ALIGNED(width, 2)) {

+      ScaleRowDown2 = ScaleRowDown2Box_Any_NEON;

+      if (IS_ALIGNED(halfwidth, 16)) {

+        ScaleRowDown2 = ScaleRowDown2Box_NEON;

+      }

+    }

+  }

+#endif

+#if defined(HAS_SCALEROWDOWN2_SSSE3)

+  if (TestCpuFlag(kCpuHasSSSE3)) {

+    ScaleRowDown2 = ScaleRowDown2Box_Odd_SSSE3;  // Used as-is when width is odd.

+    if (IS_ALIGNED(width, 2)) {

+      ScaleRowDown2 = ScaleRowDown2Box_Any_SSSE3;

+      if (IS_ALIGNED(halfwidth, 16)) {

+        ScaleRowDown2 = ScaleRowDown2Box_SSSE3;

+      }

+    }

+  }

+#endif

+#if defined(HAS_SCALEROWDOWN2_AVX2)

+  if (TestCpuFlag(kCpuHasAVX2)) {

+    ScaleRowDown2 = ScaleRowDown2Box_Odd_AVX2;  // Used as-is when width is odd; overrides the SSSE3 choice.

+    if (IS_ALIGNED(width, 2)) {

+      ScaleRowDown2 = ScaleRowDown2Box_Any_AVX2;

+      if (IS_ALIGNED(halfwidth, 32)) {

+        ScaleRowDown2 = ScaleRowDown2Box_AVX2;

+      }

+    }

+  }

+#endif

+  // Row buffer for intermediate (half-width) alpha pixels; released by free_aligned_buffer_64 below.

+  align_buffer_64(halfalpha, halfwidth);

+  for (y = 0; y < height; y += 2) {

+    // The last row of an odd-height image uses 1 row of alpha instead of 2.

+    if (y == (height - 1)) {

+      alpha_stride = 0;  // Stride 0 makes the scaler see the same alpha row twice.

+    }

+    // Subsample 2 rows of alpha to one half-width row matching the U/V resolution.

+    ScaleRowDown2(alpha, alpha_stride, halfalpha, halfwidth);

+    alpha += alpha_stride * 2;  // Advance past the two full-resolution alpha rows just consumed.

+    BlendPlaneRow(src_u0, src_u1, halfalpha, dst_u, halfwidth);

+    BlendPlaneRow(src_v0, src_v1, halfalpha, dst_v, halfwidth);

+    src_u0 += src_stride_u0;

+    src_u1 += src_stride_u1;

+    dst_u += dst_stride_u;

+    src_v0 += src_stride_v0;

+    src_v1 += src_stride_v1;

+    dst_v += dst_stride_v;

+  }

+  free_aligned_buffer_64(halfalpha);

+  return 0;

+}

 // Multiply 2 ARGB images and store to destination.

 LIBYUV_API

 int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,

@@ -777,22 +938,21 @@

   return 0;

-// Convert I422 to BGRA.

-LIBYUV_API

-int I422ToBGRA(const uint8* src_y, int src_stride_y,

-               const uint8* src_u, int src_stride_u,

-               const uint8* src_v, int src_stride_v,

-               uint8* dst_bgra, int dst_stride_bgra,

-               int width, int height) {

+// Convert I422 to RGBA with matrix

+static int I422ToRGBAMatrix(const uint8* src_y, int src_stride_y,

+                            const uint8* src_u, int src_stride_u,

+                            const uint8* src_v, int src_stride_v,

+                            uint8* dst_rgba, int dst_stride_rgba,

+                            const struct YuvConstants* yuvconstants,

+                            int width, int height) {

   int y;

-  void (*I422ToBGRARow)(const uint8* y_buf,

+  void (*I422ToRGBARow)(const uint8* y_buf,

                         const uint8* u_buf,

                         const uint8* v_buf,

                         uint8* rgb_buf,

-                        int width) = I422ToBGRARow_C;

-  if (!src_y || !src_u || !src_v ||

-      !dst_bgra ||

+                        const struct YuvConstants* yuvconstants,

+                        int width) = I422ToRGBARow_C;

+  if (!src_y || !src_u || !src_v || !dst_rgba ||

       width <= 0 || height == 0) {

     return -1;

@@ -799,55 +959,46 @@

   // Negative height means invert the image.

   if (height < 0) {

     height = -height;

-    dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra;

-    dst_stride_bgra = -dst_stride_bgra;

+    dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;

+    dst_stride_rgba = -dst_stride_rgba;

-  // Coalesce rows.

-  if (src_stride_y == width &&

-      src_stride_u * 2 == width &&

-      src_stride_v * 2 == width &&

-      dst_stride_bgra == width * 4) {

-    width *= height;

-    height = 1;

-    src_stride_y = src_stride_u = src_stride_v = dst_stride_bgra = 0;

-  }

-#if defined(HAS_I422TOBGRAROW_SSSE3)

+#if defined(HAS_I422TORGBAROW_SSSE3)

   if (TestCpuFlag(kCpuHasSSSE3)) {

-    I422ToBGRARow = I422ToBGRARow_Any_SSSE3;

+    I422ToRGBARow = I422ToRGBARow_Any_SSSE3;

     if (IS_ALIGNED(width, 8)) {

-      I422ToBGRARow = I422ToBGRARow_SSSE3;

+      I422ToRGBARow = I422ToRGBARow_SSSE3;

 #endif

-#if defined(HAS_I422TOBGRAROW_AVX2)

+#if defined(HAS_I422TORGBAROW_AVX2)

   if (TestCpuFlag(kCpuHasAVX2)) {

-    I422ToBGRARow = I422ToBGRARow_Any_AVX2;

+    I422ToRGBARow = I422ToRGBARow_Any_AVX2;

     if (IS_ALIGNED(width, 16)) {

-      I422ToBGRARow = I422ToBGRARow_AVX2;

+      I422ToRGBARow = I422ToRGBARow_AVX2;

 #endif

-#if defined(HAS_I422TOBGRAROW_NEON)

+#if defined(HAS_I422TORGBAROW_NEON)

   if (TestCpuFlag(kCpuHasNEON)) {

-    I422ToBGRARow = I422ToBGRARow_Any_NEON;

+    I422ToRGBARow = I422ToRGBARow_Any_NEON;

     if (IS_ALIGNED(width, 8)) {

-      I422ToBGRARow = I422ToBGRARow_NEON;

+      I422ToRGBARow = I422ToRGBARow_NEON;

 #endif

-#if defined(HAS_I422TOBGRAROW_MIPS_DSPR2)

-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&

+#if defined(HAS_I422TORGBAROW_DSPR2)

+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&

       IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&

       IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&

       IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&

-      IS_ALIGNED(dst_bgra, 4) && IS_ALIGNED(dst_stride_bgra, 4)) {

-    I422ToBGRARow = I422ToBGRARow_MIPS_DSPR2;

+      IS_ALIGNED(dst_rgba, 4) && IS_ALIGNED(dst_stride_rgba, 4)) {

+    I422ToRGBARow = I422ToRGBARow_DSPR2;

 #endif

   for (y = 0; y < height; ++y) {

-    I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width);

-    dst_bgra += dst_stride_bgra;

+    I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);

+    dst_rgba += dst_stride_rgba;

     src_y += src_stride_y;

     src_u += src_stride_u;

     src_v += src_stride_v;

@@ -855,140 +1006,34 @@

   return 0;

-// Convert I422 to ABGR.

+// Convert I422 to RGBA.

 LIBYUV_API

-int I422ToABGR(const uint8* src_y, int src_stride_y,

+int I422ToRGBA(const uint8* src_y, int src_stride_y,

                const uint8* src_u, int src_stride_u,

                const uint8* src_v, int src_stride_v,

-               uint8* dst_abgr, int dst_stride_abgr,

+               uint8* dst_rgba, int dst_stride_rgba,

                int width, int height) {

-  int y;

-  void (*I422ToABGRRow)(const uint8* y_buf,

-                        const uint8* u_buf,

-                        const uint8* v_buf,

-                        uint8* rgb_buf,

-                        int width) = I422ToABGRRow_C;

-  if (!src_y || !src_u || !src_v ||

-      !dst_abgr ||

-      width <= 0 || height == 0) {

-    return -1;

-  }

-  // Negative height means invert the image.

-  if (height < 0) {

-    height = -height;

-    dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr;

-    dst_stride_abgr = -dst_stride_abgr;

-  }

-  // Coalesce rows.

-  if (src_stride_y == width &&

-      src_stride_u * 2 == width &&

-      src_stride_v * 2 == width &&

-      dst_stride_abgr == width * 4) {

-    width *= height;

-    height = 1;

-    src_stride_y = src_stride_u = src_stride_v = dst_stride_abgr = 0;

-  }

-#if defined(HAS_I422TOABGRROW_NEON)

-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {

-    I422ToABGRRow = I422ToABGRRow_Any_NEON;

-    if (IS_ALIGNED(width, 8)) {

-      I422ToABGRRow = I422ToABGRRow_NEON;

-    }

-  }

-#endif

-#if defined(HAS_I422TOABGRROW_SSSE3)

-  if (TestCpuFlag(kCpuHasSSSE3)) {

-    I422ToABGRRow = I422ToABGRRow_Any_SSSE3;

-    if (IS_ALIGNED(width, 8)) {

-      I422ToABGRRow = I422ToABGRRow_SSSE3;

-    }

-  }

-#endif

-#if defined(HAS_I422TOABGRROW_AVX2)

-  if (TestCpuFlag(kCpuHasAVX2)) {

-    I422ToABGRRow = I422ToABGRRow_Any_AVX2;

-    if (IS_ALIGNED(width, 16)) {

-      I422ToABGRRow = I422ToABGRRow_AVX2;

-    }

-  }

-#endif

-  for (y = 0; y < height; ++y) {

-    I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width);

-    dst_abgr += dst_stride_abgr;

-    src_y += src_stride_y;

-    src_u += src_stride_u;

-    src_v += src_stride_v;

-  }

-  return 0;

+  return I422ToRGBAMatrix(src_y, src_stride_y,

+                          src_u, src_stride_u,

+                          src_v, src_stride_v,

+                          dst_rgba, dst_stride_rgba,

+                          &kYuvI601Constants,

+                          width, height);

-// Convert I422 to RGBA.

+// Convert I422 to BGRA.

 LIBYUV_API

-int I422ToRGBA(const uint8* src_y, int src_stride_y,

+int I422ToBGRA(const uint8* src_y, int src_stride_y,

                const uint8* src_u, int src_stride_u,

                const uint8* src_v, int src_stride_v,

-               uint8* dst_rgba, int dst_stride_rgba,

+               uint8* dst_bgra, int dst_stride_bgra,

                int width, int height) {

-  int y;

-  void (*I422ToRGBARow)(const uint8* y_buf,

-                        const uint8* u_buf,

-                        const uint8* v_buf,

-                        uint8* rgb_buf,

-                        int width) = I422ToRGBARow_C;

-  if (!src_y || !src_u || !src_v ||

-      !dst_rgba ||

-      width <= 0 || height == 0) {

-    return -1;

-  }

-  // Negative height means invert the image.

-  if (height < 0) {

-    height = -height;

-    dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;

-    dst_stride_rgba = -dst_stride_rgba;

-  }

-  // Coalesce rows.

-  if (src_stride_y == width &&

-      src_stride_u * 2 == width &&

-      src_stride_v * 2 == width &&

-      dst_stride_rgba == width * 4) {

-    width *= height;

-    height = 1;

-    src_stride_y = src_stride_u = src_stride_v = dst_stride_rgba = 0;

-  }

-#if defined(HAS_I422TORGBAROW_NEON)

-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {

-    I422ToRGBARow = I422ToRGBARow_Any_NEON;

-    if (IS_ALIGNED(width, 8)) {

-      I422ToRGBARow = I422ToRGBARow_NEON;

-    }

-  }

-#endif

-#if defined(HAS_I422TORGBAROW_SSSE3)

-  if (TestCpuFlag(kCpuHasSSSE3)) {

-    I422ToRGBARow = I422ToRGBARow_Any_SSSE3;

-    if (IS_ALIGNED(width, 8)) {

-      I422ToRGBARow = I422ToRGBARow_SSSE3;

-    }

-  }

-#endif

-#if defined(HAS_I422TORGBAROW_AVX2)

-  if (TestCpuFlag(kCpuHasAVX2)) {

-    I422ToRGBARow = I422ToRGBARow_Any_AVX2;

-    if (IS_ALIGNED(width, 16)) {

-      I422ToRGBARow = I422ToRGBARow_AVX2;

-    }

-  }

-#endif

-  for (y = 0; y < height; ++y) {

-    I422ToRGBARow(src_y, src_u, src_v, dst_rgba, width);

-    dst_rgba += dst_stride_rgba;

-    src_y += src_stride_y;

-    src_u += src_stride_u;

-    src_v += src_stride_v;

-  }

-  return 0;

+  return I422ToRGBAMatrix(src_y, src_stride_y,

+                          src_v, src_stride_v,  // Swap U and V

+                          src_u, src_stride_u,

+                          dst_bgra, dst_stride_bgra,

+                          &kYvuI601Constants,  // Use Yvu matrix

+                          width, height);

 // Convert NV12 to RGB565.

@@ -1001,6 +1046,7 @@

   void (*NV12ToRGB565Row)(const uint8* y_buf,

                           const uint8* uv_buf,

                           uint8* rgb_buf,

+                          const struct YuvConstants* yuvconstants,

                           int width) = NV12ToRGB565Row_C;

   if (!src_y || !src_uv || !dst_rgb565 ||

       width <= 0 || height == 0) {

@@ -1038,7 +1084,7 @@

 #endif

   for (y = 0; y < height; ++y) {

-    NV12ToRGB565Row(src_y, src_uv, dst_rgb565, width);

+    NV12ToRGB565Row(src_y, src_uv, dst_rgb565, &kYuvI601Constants, width);

     dst_rgb565 += dst_stride_rgb565;

     src_y += src_stride_y;

     if (y & 1) {

@@ -1048,18 +1094,15 @@

   return 0;

-// Convert NV21 to RGB565.

+// Convert RAW to RGB24.

 LIBYUV_API

-int NV21ToRGB565(const uint8* src_y, int src_stride_y,

-                 const uint8* src_vu, int src_stride_vu,

-                 uint8* dst_rgb565, int dst_stride_rgb565,

-                 int width, int height) {

+int RAWToRGB24(const uint8* src_raw, int src_stride_raw,

+               uint8* dst_rgb24, int dst_stride_rgb24,

+               int width, int height) {

   int y;

-  void (*NV21ToRGB565Row)(const uint8* y_buf,

-                          const uint8* src_vu,

-                          uint8* rgb_buf,

-                          int width) = NV21ToRGB565Row_C;

-  if (!src_y || !src_vu || !dst_rgb565 ||

+  void (*RAWToRGB24Row)(const uint8* src_rgb, uint8* dst_rgb24, int width) =

+      RAWToRGB24Row_C;

+  if (!src_raw || !dst_rgb24 ||

       width <= 0 || height == 0) {

     return -1;

@@ -1066,41 +1109,37 @@

   // Negative height means invert the image.

   if (height < 0) {

     height = -height;

-    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;

-    dst_stride_rgb565 = -dst_stride_rgb565;

+    src_raw = src_raw + (height - 1) * src_stride_raw;

+    src_stride_raw = -src_stride_raw;

-#if defined(HAS_NV21TORGB565ROW_SSSE3)

+  // Coalesce rows.

+  if (src_stride_raw == width * 3 &&

+      dst_stride_rgb24 == width * 3) {

+    width *= height;

+    height = 1;

+    src_stride_raw = dst_stride_rgb24 = 0;

+  }

+#if defined(HAS_RAWTORGB24ROW_SSSE3)

   if (TestCpuFlag(kCpuHasSSSE3)) {

-    NV21ToRGB565Row = NV21ToRGB565Row_Any_SSSE3;

+    RAWToRGB24Row = RAWToRGB24Row_Any_SSSE3;

     if (IS_ALIGNED(width, 8)) {

-      NV21ToRGB565Row = NV21ToRGB565Row_SSSE3;

+      RAWToRGB24Row = RAWToRGB24Row_SSSE3;

 #endif

-#if defined(HAS_NV21TORGB565ROW_AVX2)

-  if (TestCpuFlag(kCpuHasAVX2)) {

-    NV21ToRGB565Row = NV21ToRGB565Row_Any_AVX2;

-    if (IS_ALIGNED(width, 16)) {

-      NV21ToRGB565Row = NV21ToRGB565Row_AVX2;

-    }

-  }

-#endif

-#if defined(HAS_NV21TORGB565ROW_NEON)

+#if defined(HAS_RAWTORGB24ROW_NEON)

   if (TestCpuFlag(kCpuHasNEON)) {

-    NV21ToRGB565Row = NV21ToRGB565Row_Any_NEON;

+    RAWToRGB24Row = RAWToRGB24Row_Any_NEON;

     if (IS_ALIGNED(width, 8)) {

-      NV21ToRGB565Row = NV21ToRGB565Row_NEON;

+      RAWToRGB24Row = RAWToRGB24Row_NEON;

 #endif

   for (y = 0; y < height; ++y) {

-    NV21ToRGB565Row(src_y, src_vu, dst_rgb565, width);

-    dst_rgb565 += dst_stride_rgb565;

-    src_y += src_stride_y;

-    if (y & 1) {

-      src_vu += src_stride_vu;

-    }

+    RAWToRGB24Row(src_raw, dst_rgb24, width);

+    src_raw += src_stride_raw;

+    dst_rgb24 += dst_stride_rgb24;

   return 0;

@@ -1110,7 +1149,7 @@

               int width, int height,

               uint32 value) {

   int y;

-  void (*SetRow)(uint8* dst, uint8 value, int pix) = SetRow_C;

+  void (*SetRow)(uint8* dst, uint8 value, int width) = SetRow_C;

   if (height < 0) {

     height = -height;

     dst_y = dst_y + (height - 1) * dst_stride_y;

@@ -1186,7 +1225,7 @@

              int width, int height,

              uint32 value) {

   int y;

-  void (*ARGBSetRow)(uint8* dst_argb, uint32 value, int pix) = ARGBSetRow_C;

+  void (*ARGBSetRow)(uint8* dst_argb, uint32 value, int width) = ARGBSetRow_C;

   if (!dst_argb ||

       width <= 0 || height == 0 ||

       dst_x < 0 || dst_y < 0) {

@@ -1262,14 +1301,6 @@

     height = 1;

     src_stride_argb = dst_stride_argb = 0;

-#if defined(HAS_ARGBATTENUATEROW_SSE2)

-  if (TestCpuFlag(kCpuHasSSE2)) {

-    ARGBAttenuateRow = ARGBAttenuateRow_Any_SSE2;

-    if (IS_ALIGNED(width, 4)) {

-      ARGBAttenuateRow = ARGBAttenuateRow_SSE2;

-    }

-  }

-#endif

 #if defined(HAS_ARGBATTENUATEROW_SSSE3)

   if (TestCpuFlag(kCpuHasSSSE3)) {

     ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;

@@ -1824,45 +1855,37 @@

   return 0;

-// Interpolate 2 ARGB images by specified amount (0 to 255).

+// Interpolate 2 planes by specified amount (0 to 255).

 LIBYUV_API

-int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,

-                    const uint8* src_argb1, int src_stride_argb1,

-                    uint8* dst_argb, int dst_stride_argb,

-                    int width, int height, int interpolation) {

+int InterpolatePlane(const uint8* src0, int src_stride0,

+                     const uint8* src1, int src_stride1,

+                     uint8* dst, int dst_stride,

+                     int width, int height, int interpolation) {

   int y;

   void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,

                          ptrdiff_t src_stride, int dst_width,

                          int source_y_fraction) = InterpolateRow_C;

-  if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {

+  if (!src0 || !src1 || !dst || width <= 0 || height == 0) {

     return -1;

   // Negative height means invert the image.

   if (height < 0) {

     height = -height;

-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;

-    dst_stride_argb = -dst_stride_argb;

+    dst = dst + (height - 1) * dst_stride;

+    dst_stride = -dst_stride;

   // Coalesce rows.

-  if (src_stride_argb0 == width * 4 &&

-      src_stride_argb1 == width * 4 &&

-      dst_stride_argb == width * 4) {

+  if (src_stride0 == width &&

+      src_stride1 == width &&

+      dst_stride == width) {

     width *= height;

     height = 1;

-    src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;

+    src_stride0 = src_stride1 = dst_stride = 0;

-#if defined(HAS_INTERPOLATEROW_SSE2)

-  if (TestCpuFlag(kCpuHasSSE2)) {

-    InterpolateRow = InterpolateRow_Any_SSE2;

-    if (IS_ALIGNED(width, 4)) {

-      InterpolateRow = InterpolateRow_SSE2;

-    }

-  }

-#endif

 #if defined(HAS_INTERPOLATEROW_SSSE3)

   if (TestCpuFlag(kCpuHasSSSE3)) {

     InterpolateRow = InterpolateRow_Any_SSSE3;

-    if (IS_ALIGNED(width, 4)) {

+    if (IS_ALIGNED(width, 16)) {

       InterpolateRow = InterpolateRow_SSSE3;

@@ -1870,7 +1893,7 @@

 #if defined(HAS_INTERPOLATEROW_AVX2)

   if (TestCpuFlag(kCpuHasAVX2)) {

     InterpolateRow = InterpolateRow_Any_AVX2;

-    if (IS_ALIGNED(width, 8)) {

+    if (IS_ALIGNED(width, 32)) {

       InterpolateRow = InterpolateRow_AVX2;

@@ -1878,30 +1901,77 @@

 #if defined(HAS_INTERPOLATEROW_NEON)

   if (TestCpuFlag(kCpuHasNEON)) {

     InterpolateRow = InterpolateRow_Any_NEON;

-    if (IS_ALIGNED(width, 4)) {

+    if (IS_ALIGNED(width, 16)) {

       InterpolateRow = InterpolateRow_NEON;

 #endif

-#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)

-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&

-      IS_ALIGNED(src_argb0, 4) && IS_ALIGNED(src_stride_argb0, 4) &&

-      IS_ALIGNED(src_argb1, 4) && IS_ALIGNED(src_stride_argb1, 4) &&

-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {

-    InterpolateRow = InterpolateRow_MIPS_DSPR2;

+#if defined(HAS_INTERPOLATEROW_DSPR2)

+  if (TestCpuFlag(kCpuHasDSPR2) &&

+      IS_ALIGNED(src0, 4) && IS_ALIGNED(src_stride0, 4) &&

+      IS_ALIGNED(src1, 4) && IS_ALIGNED(src_stride1, 4) &&

+      IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4) &&

+      IS_ALIGNED(width, 4)) {

+    InterpolateRow = InterpolateRow_DSPR2;

 #endif

   for (y = 0; y < height; ++y) {

-    InterpolateRow(dst_argb, src_argb0, src_argb1 - src_argb0,

-                   width * 4, interpolation);

-    src_argb0 += src_stride_argb0;

-    src_argb1 += src_stride_argb1;

-    dst_argb += dst_stride_argb;

+    InterpolateRow(dst, src0, src1 - src0, width, interpolation);

+    src0 += src_stride0;

+    src1 += src_stride1;

+    dst += dst_stride;

   return 0;

+// Interpolate between 2 ARGB images by the specified amount (0 to 255).

+// The rows are forwarded to InterpolatePlane as planes of width * 4 bytes.

+LIBYUV_API

+int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,

+                    const uint8* src_argb1, int src_stride_argb1,

+                    uint8* dst_argb, int dst_stride_argb,

+                    int width, int height, int interpolation) {

+  const int row_bytes = width * 4;

+  return InterpolatePlane(src_argb0, src_stride_argb0,

+                          src_argb1, src_stride_argb1,

+                          dst_argb, dst_stride_argb,

+                          row_bytes, height, interpolation);

+}

+// Interpolate 2 I420 images by specified amount (0 to 255).

+// The Y planes are interpolated at width x height; the U and V planes at

+// half width and half height, both rounded up.

+// A negative height is forwarded to InterpolatePlane, which inverts the image.

+// Returns 0 on success, or -1 if any plane pointer is NULL, width <= 0 or

+// height == 0.

+LIBYUV_API

+int I420Interpolate(const uint8* src0_y, int src0_stride_y,

+                    const uint8* src0_u, int src0_stride_u,

+                    const uint8* src0_v, int src0_stride_v,

+                    const uint8* src1_y, int src1_stride_y,

+                    const uint8* src1_u, int src1_stride_u,

+                    const uint8* src1_v, int src1_stride_v,

+                    uint8* dst_y, int dst_stride_y,

+                    uint8* dst_u, int dst_stride_u,

+                    uint8* dst_v, int dst_stride_v,

+                    int width, int height, int interpolation) {

+  int halfwidth = (width + 1) >> 1;

+  // Round the chroma height away from zero and keep its sign. A plain

+  // (height + 1) >> 1 yields 0 for height == -1, which InterpolatePlane

+  // rejects, and drops one chroma row for every other odd negative height.

+  int halfheight = (height < 0) ? -((1 - height) >> 1) : ((height + 1) >> 1);

+  if (!src0_y || !src0_u || !src0_v ||

+      !src1_y || !src1_u || !src1_v ||

+      !dst_y || !dst_u || !dst_v ||

+      width <= 0 || height == 0) {

+    return -1;

+  }

+  // Full resolution luma plane.

+  InterpolatePlane(src0_y, src0_stride_y,

+                   src1_y, src1_stride_y,

+                   dst_y, dst_stride_y,

+                   width, height, interpolation);

+  // Half resolution chroma planes.

+  InterpolatePlane(src0_u, src0_stride_u,

+                   src1_u, src1_stride_u,

+                   dst_u, dst_stride_u,

+                   halfwidth, halfheight, interpolation);

+  InterpolatePlane(src0_v, src0_stride_v,

+                   src1_v, src1_stride_v,

+                   dst_v, dst_stride_v,

+                   halfwidth, halfheight, interpolation);

+  return 0;

+}

 // Shuffle ARGB channel order.  e.g. BGRA to ARGB.

 LIBYUV_API

 int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,

@@ -1909,7 +1979,7 @@

                 const uint8* shuffler, int width, int height) {

   int y;

   void (*ARGBShuffleRow)(const uint8* src_bgra, uint8* dst_argb,

-                         const uint8* shuffler, int pix) = ARGBShuffleRow_C;

+                         const uint8* shuffler, int width) = ARGBShuffleRow_C;

   if (!src_bgra || !dst_argb ||

       width <= 0 || height == 0) {

     return -1;

@@ -1976,7 +2046,7 @@

                                          const uint8* src_sobely,

                                          uint8* dst, int width)) {

   int y;

-  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_g, int pix) =

+  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_g, int width) =

       ARGBToYJRow_C;

   void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1,

                     uint8* dst_sobely, int width) = SobelYRow_C;

@@ -2280,13 +2350,19 @@

     src_stride_argb = dst_stride_argb = 0;

 #if defined(HAS_ARGBCOPYALPHAROW_SSE2)

-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {

-    ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2;

+  if (TestCpuFlag(kCpuHasSSE2)) {

+    ARGBCopyAlphaRow = ARGBCopyAlphaRow_Any_SSE2;

+    if (IS_ALIGNED(width, 8)) {

+      ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2;

+    }

 #endif

 #if defined(HAS_ARGBCOPYALPHAROW_AVX2)

-  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 16)) {

-    ARGBCopyAlphaRow = ARGBCopyAlphaRow_AVX2;

+  if (TestCpuFlag(kCpuHasAVX2)) {

+    ARGBCopyAlphaRow = ARGBCopyAlphaRow_Any_AVX2;

+    if (IS_ALIGNED(width, 16)) {

+      ARGBCopyAlphaRow = ARGBCopyAlphaRow_AVX2;

+    }

 #endif

@@ -2298,6 +2374,49 @@

   return 0;

+// Copy only the alpha channel of an ARGB image into the plane dst_a.

+// Returns 0 on success, or -1 if a pointer is NULL, width <= 0 or

+// height == 0. A negative height reads the source rows bottom-up.

+LIBYUV_API

+int ARGBExtractAlpha(const uint8* src_argb, int src_stride,

+                     uint8* dst_a, int dst_stride,

+                     int width, int height) {

+  int y;

+  // Row kernel; starts as the C version and is replaced below when a SIMD

+  // variant is compiled in and the CPU reports support for it.

+  void (*ARGBExtractAlphaRow)(const uint8* src_argb, uint8* dst_a,

+                              int width) = ARGBExtractAlphaRow_C;

+  if (!src_argb || !dst_a ||

+      width <= 0 || height == 0) {

+    return -1;

+  }

+  // Negative height means invert the image: start at the last source row

+  // and step backwards.

+  if (height < 0) {

+    height = -height;

+    src_argb = src_argb + (height - 1) * src_stride;

+    src_stride = -src_stride;

+  }

+  // Coalesce rows: when both buffers are contiguous, process everything as

+  // a single long row.

+  if (src_stride == width * 4 &&

+      dst_stride == width) {

+    width *= height;

+    height = 1;

+    src_stride = dst_stride = 0;

+  }

+#if defined(HAS_ARGBEXTRACTALPHAROW_SSE2)

+  if (TestCpuFlag(kCpuHasSSE2)) {

+    ARGBExtractAlphaRow = ARGBExtractAlphaRow_Any_SSE2;

+    if (IS_ALIGNED(width, 8)) {

+      ARGBExtractAlphaRow = ARGBExtractAlphaRow_SSE2;

+    }

+  }

+#endif

+#if defined(HAS_ARGBEXTRACTALPHAROW_NEON)

+  if (TestCpuFlag(kCpuHasNEON)) {

+    ARGBExtractAlphaRow = ARGBExtractAlphaRow_Any_NEON;

+    if (IS_ALIGNED(width, 16)) {

+      ARGBExtractAlphaRow = ARGBExtractAlphaRow_NEON;

+    }

+  }

+#endif

+  for (y = 0; y < height; ++y) {

+    ARGBExtractAlphaRow(src_argb, dst_a, width);

+    src_argb += src_stride;

+    dst_a += dst_stride;

+  }

+  return 0;

+}

 // Copy a planar Y channel to the alpha channel of a destination ARGB image.

 LIBYUV_API

 int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,

@@ -2323,13 +2442,19 @@

     src_stride_y = dst_stride_argb = 0;

 #if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2)

-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {

-    ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2;

+  if (TestCpuFlag(kCpuHasSSE2)) {

+    ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_SSE2;

+    if (IS_ALIGNED(width, 8)) {

+      ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2;

+    }

 #endif

 #if defined(HAS_ARGBCOPYYTOALPHAROW_AVX2)

-  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 16)) {

-    ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_AVX2;

+  if (TestCpuFlag(kCpuHasAVX2)) {

+    ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_AVX2;

+    if (IS_ALIGNED(width, 16)) {

+      ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_AVX2;

+    }

 #endif

@@ -2341,6 +2466,9 @@

   return 0;

+// TODO(fbarchard): Consider that if width is even, the Y channel can be split

+// directly. A SplitUVRow_Odd function could copy the remaining chroma.

 LIBYUV_API

 int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,

                uint8* dst_y, int dst_stride_y,

@@ -2348,8 +2476,8 @@

                int width, int height) {

   int y;

   int halfwidth = (width + 1) >> 1;

-  void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =

-      SplitUVRow_C;

+  void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v,

+                     int width) = SplitUVRow_C;

   void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,

                          ptrdiff_t src_stride, int dst_width,

                          int source_y_fraction) = InterpolateRow_C;

@@ -2388,14 +2516,6 @@

 #endif

-#if defined(HAS_INTERPOLATEROW_SSE2)

-  if (TestCpuFlag(kCpuHasSSE2)) {

-    InterpolateRow = InterpolateRow_Any_SSE2;

-    if (IS_ALIGNED(width, 16)) {

-      InterpolateRow = InterpolateRow_SSE2;

-    }

-  }

-#endif

 #if defined(HAS_INTERPOLATEROW_SSSE3)

   if (TestCpuFlag(kCpuHasSSSE3)) {

     InterpolateRow = InterpolateRow_Any_SSSE3;

@@ -2423,15 +2543,16 @@

     int awidth = halfwidth * 2;

-    // 2 rows of uv

-    align_buffer_64(rows, awidth * 2);

+    // row of y and 2 rows of uv

+    align_buffer_64(rows, awidth * 3);

     for (y = 0; y < height - 1; y += 2) {

       // Split Y from UV.

-      SplitUVRow(src_yuy2, dst_y, rows, awidth);

-      SplitUVRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y,

-                 rows + awidth, awidth);

-      InterpolateRow(dst_uv, rows, awidth, awidth, 128);

+      SplitUVRow(src_yuy2, rows, rows + awidth, awidth);

+      memcpy(dst_y, rows, width);

+      SplitUVRow(src_yuy2 + src_stride_yuy2, rows, rows + awidth * 2, awidth);

+      memcpy(dst_y + dst_stride_y, rows, width);

+      InterpolateRow(dst_uv, rows + awidth, awidth, awidth, 128);

       src_yuy2 += src_stride_yuy2 * 2;

       dst_y += dst_stride_y * 2;

       dst_uv += dst_stride_uv;

@@ -2438,7 +2559,8 @@

     if (height & 1) {

       // Split Y from UV.

-      SplitUVRow(src_yuy2, dst_y, dst_uv, width);

+      SplitUVRow(src_yuy2, rows, dst_uv, awidth);

+      memcpy(dst_y, rows, width);

     free_aligned_buffer_64(rows);

@@ -2452,8 +2574,8 @@

                int width, int height) {

   int y;

   int halfwidth = (width + 1) >> 1;

-  void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =

-      SplitUVRow_C;

+  void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v,

+                     int width) = SplitUVRow_C;

   void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,

                          ptrdiff_t src_stride, int dst_width,

                          int source_y_fraction) = InterpolateRow_C;

@@ -2492,14 +2614,6 @@

 #endif

-#if defined(HAS_INTERPOLATEROW_SSE2)

-  if (TestCpuFlag(kCpuHasSSE2)) {

-    InterpolateRow = InterpolateRow_Any_SSE2;

-    if (IS_ALIGNED(width, 16)) {

-      InterpolateRow = InterpolateRow_SSE2;

-    }

-  }

-#endif

 #if defined(HAS_INTERPOLATEROW_SSSE3)

   if (TestCpuFlag(kCpuHasSSSE3)) {

     InterpolateRow = InterpolateRow_Any_SSSE3;

@@ -2527,15 +2641,16 @@

     int awidth = halfwidth * 2;

-    // 2 rows of uv

-    align_buffer_64(rows, awidth * 2);

+    // row of y and 2 rows of uv

+    align_buffer_64(rows, awidth * 3);

     for (y = 0; y < height - 1; y += 2) {

       // Split Y from UV.

-      SplitUVRow(src_uyvy, rows, dst_y, awidth);

-      SplitUVRow(src_uyvy + src_stride_uyvy, rows + awidth,

-                 dst_y + dst_stride_y, awidth);

-      InterpolateRow(dst_uv, rows, awidth, awidth, 128);

+      SplitUVRow(src_uyvy, rows + awidth, rows, awidth);

+      memcpy(dst_y, rows, width);

+      SplitUVRow(src_uyvy + src_stride_uyvy, rows + awidth * 2, rows, awidth);

+      memcpy(dst_y + dst_stride_y, rows, width);

+      InterpolateRow(dst_uv, rows + awidth, awidth, awidth, 128);

       src_uyvy += src_stride_uyvy * 2;

       dst_y += dst_stride_y * 2;

       dst_uv += dst_stride_uv;

@@ -2542,7 +2657,8 @@

     if (height & 1) {

       // Split Y from UV.

-      SplitUVRow(src_uyvy, dst_y, dst_uv, width);

+      SplitUVRow(src_uyvy, dst_uv, rows, awidth);

+      memcpy(dst_y, rows, width);

     free_aligned_buffer_64(rows);

--- a/third_party/libyuv/source/rotate.cc

+++ b/third_party/libyuv/source/rotate.cc

@@ -49,13 +49,13 @@

 #endif

-#if defined(HAS_TRANSPOSEWX8_MIPS_DSPR2)

-  if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {

+#if defined(HAS_TRANSPOSEWX8_DSPR2)

+  if (TestCpuFlag(kCpuHasDSPR2)) {

     if (IS_ALIGNED(width, 4) &&

         IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {

-      TransposeWx8 = TransposeWx8_Fast_MIPS_DSPR2;

+      TransposeWx8 = TransposeWx8_Fast_DSPR2;

     } else {

-      TransposeWx8 = TransposeWx8_MIPS_DSPR2;

+      TransposeWx8 = TransposeWx8_DSPR2;

 #endif

@@ -117,14 +117,6 @@

 #endif

-#if defined(HAS_MIRRORROW_SSE2)

-  if (TestCpuFlag(kCpuHasSSE2)) {

-    MirrorRow = MirrorRow_Any_SSE2;

-    if (IS_ALIGNED(width, 16)) {

-      MirrorRow = MirrorRow_SSE2;

-    }

-  }

-#endif

 #if defined(HAS_MIRRORROW_SSSE3)

   if (TestCpuFlag(kCpuHasSSSE3)) {

     MirrorRow = MirrorRow_Any_SSSE3;

@@ -142,11 +134,11 @@

 #endif

 // TODO(fbarchard): Mirror on mips handle unaligned memory.

-#if defined(HAS_MIRRORROW_MIPS_DSPR2)

-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&

+#if defined(HAS_MIRRORROW_DSPR2)

+  if (TestCpuFlag(kCpuHasDSPR2) &&

       IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&

       IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) {

-    MirrorRow = MirrorRow_MIPS_DSPR2;

+    MirrorRow = MirrorRow_DSPR2;

 #endif

 #if defined(HAS_COPYROW_SSE2)

@@ -204,14 +196,17 @@

 #endif

 #if defined(HAS_TRANSPOSEUVWX8_SSE2)

-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {

-    TransposeUVWx8 = TransposeUVWx8_SSE2;

+  if (TestCpuFlag(kCpuHasSSE2)) {

+    TransposeUVWx8 = TransposeUVWx8_Any_SSE2;

+    if (IS_ALIGNED(width, 8)) {

+      TransposeUVWx8 = TransposeUVWx8_SSE2;

+    }

 #endif

-#if defined(HAS_TRANSPOSEUVWx8_MIPS_DSPR2)

-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) &&

+#if defined(HAS_TRANSPOSEUVWX8_DSPR2)

+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 2) &&

       IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {

-    TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2;

+    TransposeUVWx8 = TransposeUVWx8_DSPR2;

 #endif

@@ -272,22 +267,22 @@

                  uint8* dst_b, int dst_stride_b,

                  int width, int height) {

   int i;

-  void (*MirrorRowUV)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =

+  void (*MirrorUVRow)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =

       MirrorUVRow_C;

 #if defined(HAS_MIRRORUVROW_NEON)

   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {

-    MirrorRowUV = MirrorUVRow_NEON;

+    MirrorUVRow = MirrorUVRow_NEON;

 #endif

-#if defined(HAS_MIRRORROW_UV_SSSE3)

+#if defined(HAS_MIRRORUVROW_SSSE3)

   if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {

-    MirrorRowUV = MirrorUVRow_SSSE3;

+    MirrorUVRow = MirrorUVRow_SSSE3;

 #endif

-#if defined(HAS_MIRRORUVROW_MIPS_DSPR2)

-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&

+#if defined(HAS_MIRRORUVROW_DSPR2)

+  if (TestCpuFlag(kCpuHasDSPR2) &&

       IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {

-    MirrorRowUV = MirrorUVRow_MIPS_DSPR2;

+    MirrorUVRow = MirrorUVRow_DSPR2;

 #endif

@@ -295,7 +290,7 @@

   dst_b += dst_stride_b * (height - 1);

   for (i = 0; i < height; ++i) {

-    MirrorRowUV(src, dst_a, dst_b, width);

+    MirrorUVRow(src, dst_a, dst_b, width);

     src += src_stride;

     dst_a -= dst_stride_a;

     dst_b -= dst_stride_b;

--- a/third_party/libyuv/source/rotate_any.cc

+++ b/third_party/libyuv/source/rotate_any.cc

@@ -18,7 +18,7 @@

 extern "C" {

 #endif

-#define TANY(NAMEANY, TPOS_SIMD, TPOS_C, MASK)                                 \

+#define TANY(NAMEANY, TPOS_SIMD, MASK)                                         \

     void NAMEANY(const uint8* src, int src_stride,                             \

                  uint8* dst, int dst_stride, int width) {                      \

       int r = width & MASK;                                                    \

@@ -26,23 +26,48 @@

       if (n > 0) {                                                             \

         TPOS_SIMD(src, src_stride, dst, dst_stride, n);                        \

       }                                                                        \

-      TPOS_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r);        \

+      TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r);\

 #ifdef HAS_TRANSPOSEWX8_NEON

-TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, TransposeWx8_C, 7)

+TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7)

 #endif

 #ifdef HAS_TRANSPOSEWX8_SSSE3

-TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, TransposeWx8_C, 7)

+TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, 7)

 #endif

 #ifdef HAS_TRANSPOSEWX8_FAST_SSSE3

-TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, TransposeWx8_C, 15)

+TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, 15)

 #endif

-#ifdef HAS_TRANSPOSEWX8_MIPS_DSPR2

-TANY(TransposeWx8_Any_MIPS_DSPR2, TransposeWx8_MIPS_DSPR2, TransposeWx8_C, 7)

+#ifdef HAS_TRANSPOSEWX8_DSPR2

+TANY(TransposeWx8_Any_DSPR2, TransposeWx8_DSPR2, 7)

 #endif

 #undef TANY

+#define TUVANY(NAMEANY, TPOS_SIMD, MASK)                                       \

+    void NAMEANY(const uint8* src, int src_stride,                             \

+                uint8* dst_a, int dst_stride_a,                                \

+                uint8* dst_b, int dst_stride_b, int width) {                   \

+      int r = width & MASK;                                                    \

+      int n = width - r;                                                       \

+      if (n > 0) {                                                             \

+        TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,   \

+                  n);                                                          \

+      }                                                                        \

+      TransposeUVWx8_C(src + n * 2, src_stride,                                \

+                       dst_a + n * dst_stride_a, dst_stride_a,                 \

+                       dst_b + n * dst_stride_b, dst_stride_b, r);             \

+    }

+#ifdef HAS_TRANSPOSEUVWX8_NEON

+TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7)

+#endif

+#ifdef HAS_TRANSPOSEUVWX8_SSE2

+TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7)

+#endif

+#ifdef HAS_TRANSPOSEUVWX8_DSPR2

+TUVANY(TransposeUVWx8_Any_DSPR2, TransposeUVWx8_DSPR2, 7)

+#endif

+#undef TUVANY

 #ifdef __cplusplus

 }  // extern "C"

--- a/third_party/libyuv/source/rotate_gcc.cc

+++ b/third_party/libyuv/source/rotate_gcc.cc

@@ -17,16 +17,17 @@

 #endif

 // This module is for GCC x86 and x64.

-#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))

 #if !defined(LIBYUV_DISABLE_X86) && \

-    (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))

+    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))

+// Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit.

+#if defined(HAS_TRANSPOSEWX8_SSSE3)

 void TransposeWx8_SSSE3(const uint8* src, int src_stride,

                         uint8* dst, int dst_stride, int width) {

   asm volatile (

     // Read in the data from the source pointer.

     // First round of bit swap.

-    ".p2align  2                                 \n"

+    LABELALIGN

   "1:                                            \n"

     "movq       (%0),%%xmm0                      \n"

     "movq       (%0,%3),%%xmm1                   \n"

@@ -105,386 +106,260 @@

       "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"

);

+#endif  // defined(HAS_TRANSPOSEWX8_SSSE3)

-#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__)  && !defined(__clang__)

-void TransposeUVWx8_SSE2(const uint8* src, int src_stride,

-                         uint8* dst_a, int dst_stride_a,

-                         uint8* dst_b, int dst_stride_b, int width);

-  asm (

-    DECLARE_FUNCTION(TransposeUVWx8_SSE2)

-    "push   %ebx                               \n"

-    "push   %esi                               \n"

-    "push   %edi                               \n"

-    "push   %ebp                               \n"

-    "mov    0x14(%esp),%eax                    \n"

-    "mov    0x18(%esp),%edi                    \n"

-    "mov    0x1c(%esp),%edx                    \n"

-    "mov    0x20(%esp),%esi                    \n"

-    "mov    0x24(%esp),%ebx                    \n"

-    "mov    0x28(%esp),%ebp                    \n"

-    "mov    %esp,%ecx                          \n"

-    "sub    $0x14,%esp                         \n"

-    "and    $0xfffffff0,%esp                   \n"

-    "mov    %ecx,0x10(%esp)                    \n"

-    "mov    0x2c(%ecx),%ecx                    \n"

-"1:                                            \n"

-    "movdqu (%eax),%xmm0                       \n"

-    "movdqu (%eax,%edi,1),%xmm1                \n"

-    "lea    (%eax,%edi,2),%eax                 \n"

-    "movdqa %xmm0,%xmm7                        \n"

-    "punpcklbw %xmm1,%xmm0                     \n"

-    "punpckhbw %xmm1,%xmm7                     \n"

-    "movdqa %xmm7,%xmm1                        \n"

-    "movdqu (%eax),%xmm2                       \n"

-    "movdqu (%eax,%edi,1),%xmm3                \n"

-    "lea    (%eax,%edi,2),%eax                 \n"

-    "movdqa %xmm2,%xmm7                        \n"

-    "punpcklbw %xmm3,%xmm2                     \n"

-    "punpckhbw %xmm3,%xmm7                     \n"

-    "movdqa %xmm7,%xmm3                        \n"

-    "movdqu (%eax),%xmm4                       \n"

-    "movdqu (%eax,%edi,1),%xmm5                \n"

-    "lea    (%eax,%edi,2),%eax                 \n"

-    "movdqa %xmm4,%xmm7                        \n"

-    "punpcklbw %xmm5,%xmm4                     \n"

-    "punpckhbw %xmm5,%xmm7                     \n"

-    "movdqa %xmm7,%xmm5                        \n"

-    "movdqu (%eax),%xmm6                       \n"

-    "movdqu (%eax,%edi,1),%xmm7                \n"

-    "lea    (%eax,%edi,2),%eax                 \n"

-    "movdqu %xmm5,(%esp)                       \n"

-    "neg    %edi                               \n"

-    "movdqa %xmm6,%xmm5                        \n"

-    "punpcklbw %xmm7,%xmm6                     \n"

-    "punpckhbw %xmm7,%xmm5                     \n"

-    "movdqa %xmm5,%xmm7                        \n"

-    "lea    0x10(%eax,%edi,8),%eax             \n"

-    "neg    %edi                               \n"

-    "movdqa %xmm0,%xmm5                        \n"

-    "punpcklwd %xmm2,%xmm0                     \n"

-    "punpckhwd %xmm2,%xmm5                     \n"

-    "movdqa %xmm5,%xmm2                        \n"

-    "movdqa %xmm1,%xmm5                        \n"

-    "punpcklwd %xmm3,%xmm1                     \n"

-    "punpckhwd %xmm3,%xmm5                     \n"

-    "movdqa %xmm5,%xmm3                        \n"

-    "movdqa %xmm4,%xmm5                        \n"

-    "punpcklwd %xmm6,%xmm4                     \n"

-    "punpckhwd %xmm6,%xmm5                     \n"

-    "movdqa %xmm5,%xmm6                        \n"

-    "movdqu (%esp),%xmm5                       \n"

-    "movdqu %xmm6,(%esp)                       \n"

-    "movdqa %xmm5,%xmm6                        \n"

-    "punpcklwd %xmm7,%xmm5                     \n"

-    "punpckhwd %xmm7,%xmm6                     \n"

-    "movdqa %xmm6,%xmm7                        \n"

-    "movdqa %xmm0,%xmm6                        \n"

-    "punpckldq %xmm4,%xmm0                     \n"

-    "punpckhdq %xmm4,%xmm6                     \n"

-    "movdqa %xmm6,%xmm4                        \n"

-    "movdqu (%esp),%xmm6                       \n"

-    "movlpd %xmm0,(%edx)                       \n"

-    "movhpd %xmm0,(%ebx)                       \n"

-    "movlpd %xmm4,(%edx,%esi,1)                \n"

-    "lea    (%edx,%esi,2),%edx                 \n"

-    "movhpd %xmm4,(%ebx,%ebp,1)                \n"

-    "lea    (%ebx,%ebp,2),%ebx                 \n"

-    "movdqa %xmm2,%xmm0                        \n"

-    "punpckldq %xmm6,%xmm2                     \n"

-    "movlpd %xmm2,(%edx)                       \n"

-    "movhpd %xmm2,(%ebx)                       \n"

-    "punpckhdq %xmm6,%xmm0                     \n"

-    "movlpd %xmm0,(%edx,%esi,1)                \n"

-    "lea    (%edx,%esi,2),%edx                 \n"

-    "movhpd %xmm0,(%ebx,%ebp,1)                \n"

-    "lea    (%ebx,%ebp,2),%ebx                 \n"

-    "movdqa %xmm1,%xmm0                        \n"

-    "punpckldq %xmm5,%xmm1                     \n"

-    "movlpd %xmm1,(%edx)                       \n"

-    "movhpd %xmm1,(%ebx)                       \n"

-    "punpckhdq %xmm5,%xmm0                     \n"

-    "movlpd %xmm0,(%edx,%esi,1)                \n"

-    "lea    (%edx,%esi,2),%edx                 \n"

-    "movhpd %xmm0,(%ebx,%ebp,1)                \n"

-    "lea    (%ebx,%ebp,2),%ebx                 \n"

-    "movdqa %xmm3,%xmm0                        \n"

-    "punpckldq %xmm7,%xmm3                     \n"

-    "movlpd %xmm3,(%edx)                       \n"

-    "movhpd %xmm3,(%ebx)                       \n"

-    "punpckhdq %xmm7,%xmm0                     \n"

-    "sub    $0x8,%ecx                          \n"

-    "movlpd %xmm0,(%edx,%esi,1)                \n"

-    "lea    (%edx,%esi,2),%edx                 \n"

-    "movhpd %xmm0,(%ebx,%ebp,1)                \n"

-    "lea    (%ebx,%ebp,2),%ebx                 \n"

-    "jg     1b                                 \n"

-    "mov    0x10(%esp),%esp                    \n"

-    "pop    %ebp                               \n"

-    "pop    %edi                               \n"

-    "pop    %esi                               \n"

-    "pop    %ebx                               \n"

-#if defined(__native_client__)

-    "pop    %ecx                               \n"

-    "and    $0xffffffe0,%ecx                   \n"

-    "jmp    *%ecx                              \n"

-#else

-    "ret                                       \n"

-#endif

-);

-#endif

-#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \

-    defined(__x86_64__)

-// 64 bit version has enough registers to do 16x8 to 8x16 at a time.

+// Transpose 16x8. 64 bit

+#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)

 void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,

                              uint8* dst, int dst_stride, int width) {

   asm volatile (

-  // Read in the data from the source pointer.

-  // First round of bit swap.

-  ".p2align  2                                 \n"

-"1:                                            \n"

-  "movdqu     (%0),%%xmm0                      \n"

-  "movdqu     (%0,%3),%%xmm1                   \n"

-  "lea        (%0,%3,2),%0                     \n"

-  "movdqa     %%xmm0,%%xmm8                    \n"

-  "punpcklbw  %%xmm1,%%xmm0                    \n"

-  "punpckhbw  %%xmm1,%%xmm8                    \n"

-  "movdqu     (%0),%%xmm2                      \n"

-  "movdqa     %%xmm0,%%xmm1                    \n"

-  "movdqa     %%xmm8,%%xmm9                    \n"

-  "palignr    $0x8,%%xmm1,%%xmm1               \n"

-  "palignr    $0x8,%%xmm9,%%xmm9               \n"

-  "movdqu     (%0,%3),%%xmm3                   \n"

-  "lea        (%0,%3,2),%0                     \n"

-  "movdqa     %%xmm2,%%xmm10                   \n"

-  "punpcklbw  %%xmm3,%%xmm2                    \n"

-  "punpckhbw  %%xmm3,%%xmm10                   \n"

-  "movdqa     %%xmm2,%%xmm3                    \n"

-  "movdqa     %%xmm10,%%xmm11                  \n"

-  "movdqu     (%0),%%xmm4                      \n"

-  "palignr    $0x8,%%xmm3,%%xmm3               \n"

-  "palignr    $0x8,%%xmm11,%%xmm11             \n"

-  "movdqu     (%0,%3),%%xmm5                   \n"

-  "lea        (%0,%3,2),%0                     \n"

-  "movdqa     %%xmm4,%%xmm12                   \n"

-  "punpcklbw  %%xmm5,%%xmm4                    \n"

-  "punpckhbw  %%xmm5,%%xmm12                   \n"

-  "movdqa     %%xmm4,%%xmm5                    \n"

-  "movdqa     %%xmm12,%%xmm13                  \n"

-  "movdqu     (%0),%%xmm6                      \n"

-  "palignr    $0x8,%%xmm5,%%xmm5               \n"

-  "palignr    $0x8,%%xmm13,%%xmm13             \n"

-  "movdqu     (%0,%3),%%xmm7                   \n"

-  "lea        (%0,%3,2),%0                     \n"

-  "movdqa     %%xmm6,%%xmm14                   \n"

-  "punpcklbw  %%xmm7,%%xmm6                    \n"

-  "punpckhbw  %%xmm7,%%xmm14                   \n"

-  "neg        %3                               \n"

-  "movdqa     %%xmm6,%%xmm7                    \n"

-  "movdqa     %%xmm14,%%xmm15                  \n"

-  "lea        0x10(%0,%3,8),%0                 \n"

-  "palignr    $0x8,%%xmm7,%%xmm7               \n"

-  "palignr    $0x8,%%xmm15,%%xmm15             \n"

-  "neg        %3                               \n"

-   // Second round of bit swap.

-  "punpcklwd  %%xmm2,%%xmm0                    \n"

-  "punpcklwd  %%xmm3,%%xmm1                    \n"

-  "movdqa     %%xmm0,%%xmm2                    \n"

-  "movdqa     %%xmm1,%%xmm3                    \n"

-  "palignr    $0x8,%%xmm2,%%xmm2               \n"

-  "palignr    $0x8,%%xmm3,%%xmm3               \n"

-  "punpcklwd  %%xmm6,%%xmm4                    \n"

-  "punpcklwd  %%xmm7,%%xmm5                    \n"

-  "movdqa     %%xmm4,%%xmm6                    \n"

-  "movdqa     %%xmm5,%%xmm7                    \n"

-  "palignr    $0x8,%%xmm6,%%xmm6               \n"

-  "palignr    $0x8,%%xmm7,%%xmm7               \n"

-  "punpcklwd  %%xmm10,%%xmm8                   \n"

-  "punpcklwd  %%xmm11,%%xmm9                   \n"

-  "movdqa     %%xmm8,%%xmm10                   \n"

-  "movdqa     %%xmm9,%%xmm11                   \n"

-  "palignr    $0x8,%%xmm10,%%xmm10             \n"

-  "palignr    $0x8,%%xmm11,%%xmm11             \n"

-  "punpcklwd  %%xmm14,%%xmm12                  \n"

-  "punpcklwd  %%xmm15,%%xmm13                  \n"

-  "movdqa     %%xmm12,%%xmm14                  \n"

-  "movdqa     %%xmm13,%%xmm15                  \n"

-  "palignr    $0x8,%%xmm14,%%xmm14             \n"

-  "palignr    $0x8,%%xmm15,%%xmm15             \n"

-  // Third round of bit swap.

-  // Write to the destination pointer.

-  "punpckldq  %%xmm4,%%xmm0                    \n"

-  "movq       %%xmm0,(%1)                      \n"

-  "movdqa     %%xmm0,%%xmm4                    \n"

-  "palignr    $0x8,%%xmm4,%%xmm4               \n"

-  "movq       %%xmm4,(%1,%4)                   \n"

-  "lea        (%1,%4,2),%1                     \n"

-  "punpckldq  %%xmm6,%%xmm2                    \n"

-  "movdqa     %%xmm2,%%xmm6                    \n"

-  "movq       %%xmm2,(%1)                      \n"

-  "palignr    $0x8,%%xmm6,%%xmm6               \n"

-  "punpckldq  %%xmm5,%%xmm1                    \n"

-  "movq       %%xmm6,(%1,%4)                   \n"

-  "lea        (%1,%4,2),%1                     \n"

-  "movdqa     %%xmm1,%%xmm5                    \n"

-  "movq       %%xmm1,(%1)                      \n"

-  "palignr    $0x8,%%xmm5,%%xmm5               \n"

-  "movq       %%xmm5,(%1,%4)                   \n"

-  "lea        (%1,%4,2),%1                     \n"

-  "punpckldq  %%xmm7,%%xmm3                    \n"

-  "movq       %%xmm3,(%1)                      \n"

-  "movdqa     %%xmm3,%%xmm7                    \n"

-  "palignr    $0x8,%%xmm7,%%xmm7               \n"

-  "movq       %%xmm7,(%1,%4)                   \n"

-  "lea        (%1,%4,2),%1                     \n"

-  "punpckldq  %%xmm12,%%xmm8                   \n"

-  "movq       %%xmm8,(%1)                      \n"

-  "movdqa     %%xmm8,%%xmm12                   \n"

-  "palignr    $0x8,%%xmm12,%%xmm12             \n"

-  "movq       %%xmm12,(%1,%4)                  \n"

-  "lea        (%1,%4,2),%1                     \n"

-  "punpckldq  %%xmm14,%%xmm10                  \n"

-  "movdqa     %%xmm10,%%xmm14                  \n"

-  "movq       %%xmm10,(%1)                     \n"

-  "palignr    $0x8,%%xmm14,%%xmm14             \n"

-  "punpckldq  %%xmm13,%%xmm9                   \n"

-  "movq       %%xmm14,(%1,%4)                  \n"

-  "lea        (%1,%4,2),%1                     \n"

-  "movdqa     %%xmm9,%%xmm13                   \n"

-  "movq       %%xmm9,(%1)                      \n"

-  "palignr    $0x8,%%xmm13,%%xmm13             \n"

-  "movq       %%xmm13,(%1,%4)                  \n"

-  "lea        (%1,%4,2),%1                     \n"

-  "punpckldq  %%xmm15,%%xmm11                  \n"

-  "movq       %%xmm11,(%1)                     \n"

-  "movdqa     %%xmm11,%%xmm15                  \n"

-  "palignr    $0x8,%%xmm15,%%xmm15             \n"

-  "sub        $0x10,%2                         \n"

-  "movq       %%xmm15,(%1,%4)                  \n"

-  "lea        (%1,%4,2),%1                     \n"

-  "jg         1b                               \n"

-  : "+r"(src),    // %0

-    "+r"(dst),    // %1

-    "+r"(width)   // %2

-  : "r"((intptr_t)(src_stride)),  // %3

-    "r"((intptr_t)(dst_stride))   // %4

-  : "memory", "cc",

-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",

-    "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",  "xmm14",  "xmm15"

-);

+    // Read in the data from the source pointer.

+    // First round of bit swap.

+    LABELALIGN

+  "1:                                            \n"

+    "movdqu     (%0),%%xmm0                      \n"

+    "movdqu     (%0,%3),%%xmm1                   \n"

+    "lea        (%0,%3,2),%0                     \n"

+    "movdqa     %%xmm0,%%xmm8                    \n"

+    "punpcklbw  %%xmm1,%%xmm0                    \n"

+    "punpckhbw  %%xmm1,%%xmm8                    \n"

+    "movdqu     (%0),%%xmm2                      \n"

+    "movdqa     %%xmm0,%%xmm1                    \n"

+    "movdqa     %%xmm8,%%xmm9                    \n"

+    "palignr    $0x8,%%xmm1,%%xmm1               \n"

+    "palignr    $0x8,%%xmm9,%%xmm9               \n"

+    "movdqu     (%0,%3),%%xmm3                   \n"

+    "lea        (%0,%3,2),%0                     \n"

+    "movdqa     %%xmm2,%%xmm10                   \n"

+    "punpcklbw  %%xmm3,%%xmm2                    \n"

+    "punpckhbw  %%xmm3,%%xmm10                   \n"

+    "movdqa     %%xmm2,%%xmm3                    \n"

+    "movdqa     %%xmm10,%%xmm11                  \n"

+    "movdqu     (%0),%%xmm4                      \n"

+    "palignr    $0x8,%%xmm3,%%xmm3               \n"

+    "palignr    $0x8,%%xmm11,%%xmm11             \n"

+    "movdqu     (%0,%3),%%xmm5                   \n"

+    "lea        (%0,%3,2),%0                     \n"

+    "movdqa     %%xmm4,%%xmm12                   \n"

+    "punpcklbw  %%xmm5,%%xmm4                    \n"

+    "punpckhbw  %%xmm5,%%xmm12                   \n"

+    "movdqa     %%xmm4,%%xmm5                    \n"

+    "movdqa     %%xmm12,%%xmm13                  \n"

+    "movdqu     (%0),%%xmm6                      \n"

+    "palignr    $0x8,%%xmm5,%%xmm5               \n"

+    "palignr    $0x8,%%xmm13,%%xmm13             \n"

+    "movdqu     (%0,%3),%%xmm7                   \n"

+    "lea        (%0,%3,2),%0                     \n"

+    "movdqa     %%xmm6,%%xmm14                   \n"

+    "punpcklbw  %%xmm7,%%xmm6                    \n"

+    "punpckhbw  %%xmm7,%%xmm14                   \n"

+    "neg        %3                               \n"

+    "movdqa     %%xmm6,%%xmm7                    \n"

+    "movdqa     %%xmm14,%%xmm15                  \n"

+    "lea        0x10(%0,%3,8),%0                 \n"

+    "palignr    $0x8,%%xmm7,%%xmm7               \n"

+    "palignr    $0x8,%%xmm15,%%xmm15             \n"

+    "neg        %3                               \n"

+     // Second round of bit swap.

+    "punpcklwd  %%xmm2,%%xmm0                    \n"

+    "punpcklwd  %%xmm3,%%xmm1                    \n"

+    "movdqa     %%xmm0,%%xmm2                    \n"

+    "movdqa     %%xmm1,%%xmm3                    \n"

+    "palignr    $0x8,%%xmm2,%%xmm2               \n"

+    "palignr    $0x8,%%xmm3,%%xmm3               \n"

+    "punpcklwd  %%xmm6,%%xmm4                    \n"

+    "punpcklwd  %%xmm7,%%xmm5                    \n"

+    "movdqa     %%xmm4,%%xmm6                    \n"

+    "movdqa     %%xmm5,%%xmm7                    \n"

+    "palignr    $0x8,%%xmm6,%%xmm6               \n"

+    "palignr    $0x8,%%xmm7,%%xmm7               \n"

+    "punpcklwd  %%xmm10,%%xmm8                   \n"

+    "punpcklwd  %%xmm11,%%xmm9                   \n"

+    "movdqa     %%xmm8,%%xmm10                   \n"

+    "movdqa     %%xmm9,%%xmm11                   \n"

+    "palignr    $0x8,%%xmm10,%%xmm10             \n"

+    "palignr    $0x8,%%xmm11,%%xmm11             \n"

+    "punpcklwd  %%xmm14,%%xmm12                  \n"

+    "punpcklwd  %%xmm15,%%xmm13                  \n"

+    "movdqa     %%xmm12,%%xmm14                  \n"

+    "movdqa     %%xmm13,%%xmm15                  \n"

+    "palignr    $0x8,%%xmm14,%%xmm14             \n"

+    "palignr    $0x8,%%xmm15,%%xmm15             \n"

+    // Third round of bit swap.

+    // Write to the destination pointer.

+    "punpckldq  %%xmm4,%%xmm0                    \n"

+    "movq       %%xmm0,(%1)                      \n"

+    "movdqa     %%xmm0,%%xmm4                    \n"

+    "palignr    $0x8,%%xmm4,%%xmm4               \n"

+    "movq       %%xmm4,(%1,%4)                   \n"

+    "lea        (%1,%4,2),%1                     \n"

+    "punpckldq  %%xmm6,%%xmm2                    \n"

+    "movdqa     %%xmm2,%%xmm6                    \n"

+    "movq       %%xmm2,(%1)                      \n"

+    "palignr    $0x8,%%xmm6,%%xmm6               \n"

+    "punpckldq  %%xmm5,%%xmm1                    \n"

+    "movq       %%xmm6,(%1,%4)                   \n"

+    "lea        (%1,%4,2),%1                     \n"

+    "movdqa     %%xmm1,%%xmm5                    \n"

+    "movq       %%xmm1,(%1)                      \n"

+    "palignr    $0x8,%%xmm5,%%xmm5               \n"

+    "movq       %%xmm5,(%1,%4)                   \n"

+    "lea        (%1,%4,2),%1                     \n"

+    "punpckldq  %%xmm7,%%xmm3                    \n"

+    "movq       %%xmm3,(%1)                      \n"

+    "movdqa     %%xmm3,%%xmm7                    \n"

+    "palignr    $0x8,%%xmm7,%%xmm7               \n"

+    "movq       %%xmm7,(%1,%4)                   \n"

+    "lea        (%1,%4,2),%1                     \n"

+    "punpckldq  %%xmm12,%%xmm8                   \n"

+    "movq       %%xmm8,(%1)                      \n"

+    "movdqa     %%xmm8,%%xmm12                   \n"

+    "palignr    $0x8,%%xmm12,%%xmm12             \n"

+    "movq       %%xmm12,(%1,%4)                  \n"

+    "lea        (%1,%4,2),%1                     \n"

+    "punpckldq  %%xmm14,%%xmm10                  \n"

+    "movdqa     %%xmm10,%%xmm14                  \n"

+    "movq       %%xmm10,(%1)                     \n"

+    "palignr    $0x8,%%xmm14,%%xmm14             \n"

+    "punpckldq  %%xmm13,%%xmm9                   \n"

+    "movq       %%xmm14,(%1,%4)                  \n"

+    "lea        (%1,%4,2),%1                     \n"

+    "movdqa     %%xmm9,%%xmm13                   \n"

+    "movq       %%xmm9,(%1)                      \n"

+    "palignr    $0x8,%%xmm13,%%xmm13             \n"

+    "movq       %%xmm13,(%1,%4)                  \n"

+    "lea        (%1,%4,2),%1                     \n"

+    "punpckldq  %%xmm15,%%xmm11                  \n"

+    "movq       %%xmm11,(%1)                     \n"

+    "movdqa     %%xmm11,%%xmm15                  \n"

+    "palignr    $0x8,%%xmm15,%%xmm15             \n"

+    "sub        $0x10,%2                         \n"

+    "movq       %%xmm15,(%1,%4)                  \n"

+    "lea        (%1,%4,2),%1                     \n"

+    "jg         1b                               \n"

+    : "+r"(src),    // %0

+      "+r"(dst),    // %1

+      "+r"(width)   // %2

+    : "r"((intptr_t)(src_stride)),  // %3

+      "r"((intptr_t)(dst_stride))   // %4

+    : "memory", "cc",

+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",

+      "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",  "xmm14",  "xmm15"

+  );

+#endif  // defined(HAS_TRANSPOSEWX8_FAST_SSSE3)

+// Transpose UV 8x8.  64 bit.

+#if defined(HAS_TRANSPOSEUVWX8_SSE2)

 void TransposeUVWx8_SSE2(const uint8* src, int src_stride,

                          uint8* dst_a, int dst_stride_a,

                          uint8* dst_b, int dst_stride_b, int width) {

   asm volatile (

-  // Read in the data from the source pointer.

-  // First round of bit swap.

-  ".p2align  2                                 \n"

-"1:                                            \n"

-  "movdqu     (%0),%%xmm0                      \n"

-  "movdqu     (%0,%4),%%xmm1                   \n"

-  "lea        (%0,%4,2),%0                     \n"

-  "movdqa     %%xmm0,%%xmm8                    \n"

-  "punpcklbw  %%xmm1,%%xmm0                    \n"

-  "punpckhbw  %%xmm1,%%xmm8                    \n"

-  "movdqa     %%xmm8,%%xmm1                    \n"

-  "movdqu     (%0),%%xmm2                      \n"

-  "movdqu     (%0,%4),%%xmm3                   \n"

-  "lea        (%0,%4,2),%0                     \n"

-  "movdqa     %%xmm2,%%xmm8                    \n"

-  "punpcklbw  %%xmm3,%%xmm2                    \n"

-  "punpckhbw  %%xmm3,%%xmm8                    \n"

-  "movdqa     %%xmm8,%%xmm3                    \n"

-  "movdqu     (%0),%%xmm4                      \n"

-  "movdqu     (%0,%4),%%xmm5                   \n"

-  "lea        (%0,%4,2),%0                     \n"

-  "movdqa     %%xmm4,%%xmm8                    \n"

-  "punpcklbw  %%xmm5,%%xmm4                    \n"

-  "punpckhbw  %%xmm5,%%xmm8                    \n"

-  "movdqa     %%xmm8,%%xmm5                    \n"

-  "movdqu     (%0),%%xmm6                      \n"

-  "movdqu     (%0,%4),%%xmm7                   \n"

-  "lea        (%0,%4,2),%0                     \n"

-  "movdqa     %%xmm6,%%xmm8                    \n"

-  "punpcklbw  %%xmm7,%%xmm6                    \n"

-  "neg        %4                               \n"

-  "lea        0x10(%0,%4,8),%0                 \n"

-  "punpckhbw  %%xmm7,%%xmm8                    \n"

-  "movdqa     %%xmm8,%%xmm7                    \n"

-  "neg        %4                               \n"

-   // Second round of bit swap.

-  "movdqa     %%xmm0,%%xmm8                    \n"

-  "movdqa     %%xmm1,%%xmm9                    \n"

-  "punpckhwd  %%xmm2,%%xmm8                    \n"

-  "punpckhwd  %%xmm3,%%xmm9                    \n"

-  "punpcklwd  %%xmm2,%%xmm0                    \n"

-  "punpcklwd  %%xmm3,%%xmm1                    \n"

-  "movdqa     %%xmm8,%%xmm2                    \n"

-  "movdqa     %%xmm9,%%xmm3                    \n"

-  "movdqa     %%xmm4,%%xmm8                    \n"

-  "movdqa     %%xmm5,%%xmm9                    \n"

-  "punpckhwd  %%xmm6,%%xmm8                    \n"

-  "punpckhwd  %%xmm7,%%xmm9                    \n"

-  "punpcklwd  %%xmm6,%%xmm4                    \n"

-  "punpcklwd  %%xmm7,%%xmm5                    \n"

-  "movdqa     %%xmm8,%%xmm6                    \n"

-  "movdqa     %%xmm9,%%xmm7                    \n"

-  // Third round of bit swap.

-  // Write to the destination pointer.

-  "movdqa     %%xmm0,%%xmm8                    \n"

-  "punpckldq  %%xmm4,%%xmm0                    \n"

-  "movlpd     %%xmm0,(%1)                      \n"  // Write back U channel

-  "movhpd     %%xmm0,(%2)                      \n"  // Write back V channel

-  "punpckhdq  %%xmm4,%%xmm8                    \n"

-  "movlpd     %%xmm8,(%1,%5)                   \n"

-  "lea        (%1,%5,2),%1                     \n"

-  "movhpd     %%xmm8,(%2,%6)                   \n"

-  "lea        (%2,%6,2),%2                     \n"

-  "movdqa     %%xmm2,%%xmm8                    \n"

-  "punpckldq  %%xmm6,%%xmm2                    \n"

-  "movlpd     %%xmm2,(%1)                      \n"

-  "movhpd     %%xmm2,(%2)                      \n"

-  "punpckhdq  %%xmm6,%%xmm8                    \n"

-  "movlpd     %%xmm8,(%1,%5)                   \n"

-  "lea        (%1,%5,2),%1                     \n"

-  "movhpd     %%xmm8,(%2,%6)                   \n"

-  "lea        (%2,%6,2),%2                     \n"

-  "movdqa     %%xmm1,%%xmm8                    \n"

-  "punpckldq  %%xmm5,%%xmm1                    \n"

-  "movlpd     %%xmm1,(%1)                      \n"

-  "movhpd     %%xmm1,(%2)                      \n"

-  "punpckhdq  %%xmm5,%%xmm8                    \n"

-  "movlpd     %%xmm8,(%1,%5)                   \n"

-  "lea        (%1,%5,2),%1                     \n"

-  "movhpd     %%xmm8,(%2,%6)                   \n"

-  "lea        (%2,%6,2),%2                     \n"

-  "movdqa     %%xmm3,%%xmm8                    \n"

-  "punpckldq  %%xmm7,%%xmm3                    \n"

-  "movlpd     %%xmm3,(%1)                      \n"

-  "movhpd     %%xmm3,(%2)                      \n"

-  "punpckhdq  %%xmm7,%%xmm8                    \n"

-  "sub        $0x8,%3                          \n"

-  "movlpd     %%xmm8,(%1,%5)                   \n"

-  "lea        (%1,%5,2),%1                     \n"

-  "movhpd     %%xmm8,(%2,%6)                   \n"

-  "lea        (%2,%6,2),%2                     \n"

-  "jg         1b                               \n"

-  : "+r"(src),    // %0

-    "+r"(dst_a),  // %1

-    "+r"(dst_b),  // %2

-    "+r"(width)   // %3

-  : "r"((intptr_t)(src_stride)),    // %4

-    "r"((intptr_t)(dst_stride_a)),  // %5

-    "r"((intptr_t)(dst_stride_b))   // %6

-  : "memory", "cc",

-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",

-    "xmm8", "xmm9"

-);

+    // Read in the data from the source pointer.

+    // First round of bit swap.

+    LABELALIGN

+  "1:                                            \n"

+    "movdqu     (%0),%%xmm0                      \n"

+    "movdqu     (%0,%4),%%xmm1                   \n"

+    "lea        (%0,%4,2),%0                     \n"

+    "movdqa     %%xmm0,%%xmm8                    \n"

+    "punpcklbw  %%xmm1,%%xmm0                    \n"

+    "punpckhbw  %%xmm1,%%xmm8                    \n"

+    "movdqa     %%xmm8,%%xmm1                    \n"

+    "movdqu     (%0),%%xmm2                      \n"

+    "movdqu     (%0,%4),%%xmm3                   \n"

+    "lea        (%0,%4,2),%0                     \n"

+    "movdqa     %%xmm2,%%xmm8                    \n"

+    "punpcklbw  %%xmm3,%%xmm2                    \n"

+    "punpckhbw  %%xmm3,%%xmm8                    \n"

+    "movdqa     %%xmm8,%%xmm3                    \n"

+    "movdqu     (%0),%%xmm4                      \n"

+    "movdqu     (%0,%4),%%xmm5                   \n"

+    "lea        (%0,%4,2),%0                     \n"

+    "movdqa     %%xmm4,%%xmm8                    \n"

+    "punpcklbw  %%xmm5,%%xmm4                    \n"

+    "punpckhbw  %%xmm5,%%xmm8                    \n"

+    "movdqa     %%xmm8,%%xmm5                    \n"

+    "movdqu     (%0),%%xmm6                      \n"

+    "movdqu     (%0,%4),%%xmm7                   \n"

+    "lea        (%0,%4,2),%0                     \n"

+    "movdqa     %%xmm6,%%xmm8                    \n"

+    "punpcklbw  %%xmm7,%%xmm6                    \n"

+    "neg        %4                               \n"

+    "lea        0x10(%0,%4,8),%0                 \n"

+    "punpckhbw  %%xmm7,%%xmm8                    \n"

+    "movdqa     %%xmm8,%%xmm7                    \n"

+    "neg        %4                               \n"

+     // Second round of bit swap.

+    "movdqa     %%xmm0,%%xmm8                    \n"

+    "movdqa     %%xmm1,%%xmm9                    \n"

+    "punpckhwd  %%xmm2,%%xmm8                    \n"

+    "punpckhwd  %%xmm3,%%xmm9                    \n"

+    "punpcklwd  %%xmm2,%%xmm0                    \n"

+    "punpcklwd  %%xmm3,%%xmm1                    \n"

+    "movdqa     %%xmm8,%%xmm2                    \n"

+    "movdqa     %%xmm9,%%xmm3                    \n"

+    "movdqa     %%xmm4,%%xmm8                    \n"

+    "movdqa     %%xmm5,%%xmm9                    \n"

+    "punpckhwd  %%xmm6,%%xmm8                    \n"

+    "punpckhwd  %%xmm7,%%xmm9                    \n"

+    "punpcklwd  %%xmm6,%%xmm4                    \n"

+    "punpcklwd  %%xmm7,%%xmm5                    \n"

+    "movdqa     %%xmm8,%%xmm6                    \n"

+    "movdqa     %%xmm9,%%xmm7                    \n"

+    // Third round of bit swap.

+    // Write to the destination pointer.

+    "movdqa     %%xmm0,%%xmm8                    \n"

+    "punpckldq  %%xmm4,%%xmm0                    \n"

+    "movlpd     %%xmm0,(%1)                      \n"  // Write back U channel

+    "movhpd     %%xmm0,(%2)                      \n"  // Write back V channel

+    "punpckhdq  %%xmm4,%%xmm8                    \n"

+    "movlpd     %%xmm8,(%1,%5)                   \n"

+    "lea        (%1,%5,2),%1                     \n"

+    "movhpd     %%xmm8,(%2,%6)                   \n"

+    "lea        (%2,%6,2),%2                     \n"

+    "movdqa     %%xmm2,%%xmm8                    \n"

+    "punpckldq  %%xmm6,%%xmm2                    \n"

+    "movlpd     %%xmm2,(%1)                      \n"

+    "movhpd     %%xmm2,(%2)                      \n"

+    "punpckhdq  %%xmm6,%%xmm8                    \n"

+    "movlpd     %%xmm8,(%1,%5)                   \n"

+    "lea        (%1,%5,2),%1                     \n"

+    "movhpd     %%xmm8,(%2,%6)                   \n"

+    "lea        (%2,%6,2),%2                     \n"

+    "movdqa     %%xmm1,%%xmm8                    \n"

+    "punpckldq  %%xmm5,%%xmm1                    \n"

+    "movlpd     %%xmm1,(%1)                      \n"

+    "movhpd     %%xmm1,(%2)                      \n"

+    "punpckhdq  %%xmm5,%%xmm8                    \n"

+    "movlpd     %%xmm8,(%1,%5)                   \n"

+    "lea        (%1,%5,2),%1                     \n"

+    "movhpd     %%xmm8,(%2,%6)                   \n"

+    "lea        (%2,%6,2),%2                     \n"

+    "movdqa     %%xmm3,%%xmm8                    \n"

+    "punpckldq  %%xmm7,%%xmm3                    \n"

+    "movlpd     %%xmm3,(%1)                      \n"

+    "movhpd     %%xmm3,(%2)                      \n"

+    "punpckhdq  %%xmm7,%%xmm8                    \n"

+    "sub        $0x8,%3                          \n"

+    "movlpd     %%xmm8,(%1,%5)                   \n"

+    "lea        (%1,%5,2),%1                     \n"

+    "movhpd     %%xmm8,(%2,%6)                   \n"

+    "lea        (%2,%6,2),%2                     \n"

+    "jg         1b                               \n"

+    : "+r"(src),    // %0

+      "+r"(dst_a),  // %1

+      "+r"(dst_b),  // %2

+      "+r"(width)   // %3

+    : "r"((intptr_t)(src_stride)),    // %4

+      "r"((intptr_t)(dst_stride_a)),  // %5

+      "r"((intptr_t)(dst_stride_b))   // %6

+    : "memory", "cc",

+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",

+      "xmm8", "xmm9"

+  );

-#endif

-#endif

+#endif  // defined(HAS_TRANSPOSEUVWX8_SSE2)

 #endif  // defined(__x86_64__) || defined(__i386__)

 #ifdef __cplusplus

--- a/third_party/libyuv/source/rotate_mips.cc

+++ b/third_party/libyuv/source/rotate_mips.cc

@@ -22,8 +22,8 @@

     defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \

     (_MIPS_SIM == _MIPS_SIM_ABI32)

-void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,

-                             uint8* dst, int dst_stride, int width) {

+void TransposeWx8_DSPR2(const uint8* src, int src_stride,

+                        uint8* dst, int dst_stride, int width) {

    __asm__ __volatile__ (

       ".set push                                         \n"

       ".set noreorder                                    \n"

@@ -106,8 +106,8 @@

);

-void TransposeWx8_Fast_MIPS_DSPR2(const uint8* src, int src_stride,

-                                  uint8* dst, int dst_stride, int width) {

+void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,

+                             uint8* dst, int dst_stride, int width) {

   __asm__ __volatile__ (

       ".set noat                                         \n"

       ".set push                                         \n"

@@ -308,10 +308,10 @@

);

-void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,

-                               uint8* dst_a, int dst_stride_a,

-                               uint8* dst_b, int dst_stride_b,

-                               int width) {

+void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,

+                          uint8* dst_a, int dst_stride_a,

+                          uint8* dst_b, int dst_stride_b,

+                          int width) {

   __asm__ __volatile__ (

       ".set push                                         \n"

       ".set noreorder                                    \n"

--- a/third_party/libyuv/source/rotate_neon.cc

+++ b/third_party/libyuv/source/rotate_neon.cc

@@ -27,7 +27,7 @@

 void TransposeWx8_NEON(const uint8* src, int src_stride,

                        uint8* dst, int dst_stride,

                        int width) {

-  const uint8* src_temp = NULL;

+  const uint8* src_temp;

   asm volatile (

     // loops are on blocks of 8. loop will stop when

     // counter gets to or below 0. starting the counter

@@ -35,7 +35,6 @@

     "sub         %5, #8                        \n"

     // handle 8x8 blocks. this should be the majority of the plane

-    ".p2align  2                               \n"

     "1:                                        \n"

       "mov         %0, %1                      \n"

@@ -230,7 +229,7 @@

     "4:                                        \n"

-    : "+r"(src_temp),          // %0

+    : "=&r"(src_temp),         // %0

       "+r"(src),               // %1

       "+r"(src_stride),        // %2

       "+r"(dst),               // %3

@@ -248,7 +247,7 @@

                          uint8* dst_a, int dst_stride_a,

                          uint8* dst_b, int dst_stride_b,

                          int width) {

-  const uint8* src_temp = NULL;

+  const uint8* src_temp;

   asm volatile (

     // loops are on blocks of 8. loop will stop when

     // counter gets to or below 0. starting the counter

@@ -256,7 +255,6 @@

     "sub         %7, #8                        \n"

     // handle 8x8 blocks. this should be the majority of the plane

-    ".p2align  2                               \n"

     "1:                                        \n"

       "mov         %0, %1                      \n"

@@ -514,7 +512,7 @@

     "4:                                        \n"

-    : "+r"(src_temp),            // %0

+    : "=&r"(src_temp),           // %0

       "+r"(src),                 // %1

       "+r"(src_stride),          // %2

       "+r"(dst_a),               // %3

--- a/third_party/libyuv/source/rotate_neon64.cc

+++ b/third_party/libyuv/source/rotate_neon64.cc

@@ -26,7 +26,7 @@

 void TransposeWx8_NEON(const uint8* src, int src_stride,

                        uint8* dst, int dst_stride, int width) {

-  const uint8* src_temp = NULL;

+  const uint8* src_temp;

   int64 width64 = (int64) width;  // Work around clang 3.4 warning.

   asm volatile (

     // loops are on blocks of 8. loop will stop when

@@ -235,7 +235,7 @@

     "4:                                          \n"

-    : "+r"(src_temp),                             // %0

+    : "=&r"(src_temp),                            // %0

       "+r"(src),                                  // %1

       "+r"(dst),                                  // %2

       "+r"(width64)                               // %3

@@ -255,7 +255,7 @@

                          uint8* dst_a, int dst_stride_a,

                          uint8* dst_b, int dst_stride_b,

                          int width) {

-  const uint8* src_temp = NULL;

+  const uint8* src_temp;

   int64 width64 = (int64) width;  // Work around clang 3.4 warning.

   asm volatile (

     // loops are on blocks of 8. loop will stop when

@@ -520,7 +520,7 @@

     "4:                                        \n"

-    : "+r"(src_temp),                             // %0

+    : "=&r"(src_temp),                            // %0

       "+r"(src),                                  // %1

       "+r"(dst_a),                                // %2

       "+r"(dst_b),                                // %3

--- a/third_party/libyuv/source/rotate_win.cc

+++ b/third_party/libyuv/source/rotate_win.cc

@@ -16,9 +16,8 @@

 extern "C" {

 #endif

-// This module is for Visual C x86.

-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \

-    defined(_MSC_VER) && !defined(__clang__)

+// This module is for 32-bit x86 builds with Visual C or clang-cl.

+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

 __declspec(naked)

 void TransposeWx8_SSSE3(const uint8* src, int src_stride,

--- a/third_party/libyuv/source/row_any.cc

+++ b/third_party/libyuv/source/row_any.cc

@@ -22,6 +22,39 @@

 // Subsampled source needs to be increase by 1 of not even.

 #define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))

+// Any 4 planes to 1 with yuvconstants.  Defines NAMEANY, which runs ANY_SIMD on whole groups of MASK + 1 pixels and handles the leftover pixels through a scratch buffer.

+#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK)                \

+    void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf,   \

+                 const uint8* a_buf, uint8* dst_ptr,                           \

+                 const struct YuvConstants* yuvconstants,  int width) {        \

+      SIMD_ALIGNED(uint8 temp[64 * 5]);  /* 64-byte slots: y, u, v, a, out */  \

+      memset(temp, 0, 64 * 4);  /* zero the 4 input slots; for msan */         \

+      int r = width & MASK;  /* leftover pixels (MASK is 7 or 15 in use) */    \

+      int n = width & ~MASK;  /* width rounded down to whole groups */         \

+      if (n > 0) {                                                             \

+        ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n);        \

+      }                                                                        \

+      memcpy(temp, y_buf + n, r);  /* stage the leftover y bytes */            \

+      memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT));               \

+      memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT));              \

+      memcpy(temp + 192, a_buf + n, r);  /* stage the leftover a bytes */      \

+      ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256,            \

+               yuvconstants, MASK + 1);  /* one full group on the copy */      \

+      memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256,                      \

+             SS(r, DUVSHIFT) * BPP);  /* copy out only the leftover part */    \

+    }

+#ifdef HAS_I422ALPHATOARGBROW_SSSE3

+ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7)

+#endif

+#ifdef HAS_I422ALPHATOARGBROW_AVX2

+ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 15)

+#endif

+#ifdef HAS_I422ALPHATOARGBROW_NEON

+ANY41C(I422AlphaToARGBRow_Any_NEON, I422AlphaToARGBRow_NEON, 1, 0, 4, 7)

+#endif

+#undef ANY41C

 // Any 3 planes to 1.

 #define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK)                 \

     void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf,   \

@@ -40,83 +73,100 @@

       memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192,                      \

              SS(r, DUVSHIFT) * BPP);                                           \

+#ifdef HAS_I422TOYUY2ROW_SSE2

+ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15)

+ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)

+#endif

+#ifdef HAS_I422TOYUY2ROW_NEON

+ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15)

+#endif

+#ifdef HAS_I422TOUYVYROW_NEON

+ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15)

+#endif

+#ifdef HAS_BLENDPLANEROW_AVX2

+ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31)

+#endif

+#ifdef HAS_BLENDPLANEROW_SSSE3

+ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7)

+#endif

+#undef ANY31

+// Note that odd-width replication is also applied to 444, because the ARM

+// implementation subsamples 444 to 422 internally.

+// Any 3 planes to 1 with yuvconstants.  Defines NAMEANY, which runs ANY_SIMD on whole groups of MASK + 1 pixels and handles the leftover pixels through a scratch buffer.

+#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK)                \

+    void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf,   \

+                 uint8* dst_ptr, const struct YuvConstants* yuvconstants,      \

+                 int width) {                                                  \

+      SIMD_ALIGNED(uint8 temp[64 * 4]);  /* 64-byte slots: y, u, v, out */     \

+      memset(temp, 0, 64 * 3);  /* zero the 3 input slots; YUY2 and msan */    \

+      int r = width & MASK;  /* leftover pixels (MASK is 7 or 15 in use) */    \

+      int n = width & ~MASK;  /* width rounded down to whole groups */         \

+      if (n > 0) {                                                             \

+        ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n);               \

+      }                                                                        \

+      memcpy(temp, y_buf + n, r);  /* stage the leftover y bytes */            \

+      memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT));               \

+      memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT));              \

+      if (width & 1) {  /* odd width: repeat the last staged u and v byte */   \

+        temp[64 + SS(r, UVSHIFT)] = temp[64 + SS(r, UVSHIFT) - 1];             \

+        temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1];           \

+      }                                                                        \

+      ANY_SIMD(temp, temp + 64, temp + 128, temp + 192,                        \

+               yuvconstants, MASK + 1);  /* one full group on the copy */      \

+      memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192,                      \

+             SS(r, DUVSHIFT) * BPP);  /* copy out only the leftover part */    \

+    }

 #ifdef HAS_I422TOARGBROW_SSSE3

-ANY31(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7)

+ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7)

 #endif

+#ifdef HAS_I411TOARGBROW_SSSE3

+ANY31C(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_SSSE3, 2, 0, 4, 7)

+#endif

 #ifdef HAS_I444TOARGBROW_SSSE3

-ANY31(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7)

-ANY31(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_SSSE3, 2, 0, 4, 7)

-ANY31(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_SSSE3, 1, 0, 4, 7)

-ANY31(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_SSSE3, 1, 0, 4, 7)

-ANY31(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7)

-ANY31(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7)

-ANY31(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7)

-ANY31(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7)

-ANY31(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 7)

-ANY31(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, 1, 0, 3, 7)

-ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15)

-ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)

+ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7)

+ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7)

+ANY31C(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7)

+ANY31C(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7)

+ANY31C(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7)

+ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 7)

 #endif  // HAS_I444TOARGBROW_SSSE3

 #ifdef HAS_I422TORGB24ROW_AVX2

-ANY31(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 15)

+ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 15)

 #endif

-#ifdef HAS_I422TORAWROW_AVX2

-ANY31(I422ToRAWRow_Any_AVX2, I422ToRAWRow_AVX2, 1, 0, 3, 15)

-#endif

-#ifdef HAS_J422TOARGBROW_SSSE3

-ANY31(J422ToARGBRow_Any_SSSE3, J422ToARGBRow_SSSE3, 1, 0, 4, 7)

-#endif

-#ifdef HAS_J422TOARGBROW_AVX2

-ANY31(J422ToARGBRow_Any_AVX2, J422ToARGBRow_AVX2, 1, 0, 4, 15)

-#endif

 #ifdef HAS_I422TOARGBROW_AVX2

-ANY31(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15)

+ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15)

 #endif

-#ifdef HAS_I422TOBGRAROW_AVX2

-ANY31(I422ToBGRARow_Any_AVX2, I422ToBGRARow_AVX2, 1, 0, 4, 15)

-#endif

 #ifdef HAS_I422TORGBAROW_AVX2

-ANY31(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15)

+ANY31C(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15)

 #endif

-#ifdef HAS_I422TOABGRROW_AVX2

-ANY31(I422ToABGRRow_Any_AVX2, I422ToABGRRow_AVX2, 1, 0, 4, 15)

-#endif

 #ifdef HAS_I444TOARGBROW_AVX2

-ANY31(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15)

+ANY31C(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15)

 #endif

 #ifdef HAS_I411TOARGBROW_AVX2

-ANY31(I411ToARGBRow_Any_AVX2, I411ToARGBRow_AVX2, 2, 0, 4, 15)

+ANY31C(I411ToARGBRow_Any_AVX2, I411ToARGBRow_AVX2, 2, 0, 4, 15)

 #endif

 #ifdef HAS_I422TOARGB4444ROW_AVX2

-ANY31(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 7)

+ANY31C(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 7)

 #endif

 #ifdef HAS_I422TOARGB1555ROW_AVX2

-ANY31(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 7)

+ANY31C(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 7)

 #endif

 #ifdef HAS_I422TORGB565ROW_AVX2

-ANY31(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 7)

+ANY31C(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 7)

 #endif

 #ifdef HAS_I422TOARGBROW_NEON

-ANY31(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7)

-ANY31(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7)

-ANY31(I411ToARGBRow_Any_NEON, I411ToARGBRow_NEON, 2, 0, 4, 7)

-ANY31(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, 1, 0, 4, 7)

-ANY31(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, 1, 0, 4, 7)

-ANY31(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7)

-ANY31(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7)

-ANY31(I422ToRAWRow_Any_NEON, I422ToRAWRow_NEON, 1, 0, 3, 7)

-ANY31(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7)

-ANY31(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7)

-ANY31(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7)

+ANY31C(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7)

+ANY31C(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7)

+ANY31C(I411ToARGBRow_Any_NEON, I411ToARGBRow_NEON, 2, 0, 4, 7)

+ANY31C(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7)

+ANY31C(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7)

+ANY31C(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7)

+ANY31C(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7)

+ANY31C(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7)

 #endif

-#ifdef HAS_I422TOYUY2ROW_NEON

-ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15)

-#endif

-#ifdef HAS_I422TOUYVYROW_NEON

-ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15)

-#endif

-#undef ANY31

+#undef ANY31C

 // Any 2 planes to 1.

 #define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK)              \

@@ -136,32 +186,6 @@

       memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                          \

-// Biplanar to RGB.

-#ifdef HAS_NV12TOARGBROW_SSSE3

-ANY21(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, 1, 1, 2, 4, 7)

-ANY21(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7)

-#endif

-#ifdef HAS_NV12TOARGBROW_AVX2

-ANY21(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15)

-ANY21(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15)

-#endif

-#ifdef HAS_NV12TOARGBROW_NEON

-ANY21(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7)

-ANY21(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7)

-#endif

-#ifdef HAS_NV12TORGB565ROW_SSSE3

-ANY21(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)

-ANY21(NV21ToRGB565Row_Any_SSSE3, NV21ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)

-#endif

-#ifdef HAS_NV12TORGB565ROW_AVX2

-ANY21(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15)

-ANY21(NV21ToRGB565Row_Any_AVX2, NV21ToRGB565Row_AVX2, 1, 1, 2, 2, 15)

-#endif

-#ifdef HAS_NV12TORGB565ROW_NEON

-ANY21(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7)

-ANY21(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, 1, 1, 2, 2, 7)

-#endif

 // Merge functions.

 #ifdef HAS_MERGEUVROW_SSE2

 ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15)

@@ -221,6 +245,55 @@

 #endif

 #undef ANY21

+// Any 2 planes to 1 with yuvconstants.  Defines NAMEANY, which runs ANY_SIMD on whole groups of MASK + 1 pixels and handles the leftover pixels through a scratch buffer.

+#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK)             \

+    void NAMEANY(const uint8* y_buf, const uint8* uv_buf,                      \

+                 uint8* dst_ptr, const struct YuvConstants* yuvconstants,      \

+                 int width) {                                                  \

+      SIMD_ALIGNED(uint8 temp[64 * 3]);  /* 64-byte slots: y, uv, out */       \

+      memset(temp, 0, 64 * 2);  /* zero the 2 input slots; for msan */         \

+      int r = width & MASK;  /* leftover pixels (MASK is 7 or 15 in use) */    \

+      int n = width & ~MASK;  /* width rounded down to whole groups */         \

+      if (n > 0) {                                                             \

+        ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n);                     \

+      }                                                                        \

+      memcpy(temp, y_buf + n * SBPP, r * SBPP);  /* SBPP bytes per y step */   \

+      memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2,                       \

+             SS(r, UVSHIFT) * SBPP2);  /* SBPP2 bytes per uv step */           \

+      ANY_SIMD(temp, temp + 64, temp + 128, yuvconstants, MASK + 1);           \

+      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);  /* leftover only */     \

+    }

+// Biplanar to RGB.

+#ifdef HAS_NV12TOARGBROW_SSSE3

+ANY21C(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, 1, 1, 2, 4, 7)

+#endif

+#ifdef HAS_NV12TOARGBROW_AVX2

+ANY21C(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15)

+#endif

+#ifdef HAS_NV12TOARGBROW_NEON

+ANY21C(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7)

+#endif

+#ifdef HAS_NV21TOARGBROW_SSSE3

+ANY21C(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7)

+#endif

+#ifdef HAS_NV21TOARGBROW_AVX2

+ANY21C(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15)

+#endif

+#ifdef HAS_NV21TOARGBROW_NEON

+ANY21C(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7)

+#endif

+#ifdef HAS_NV12TORGB565ROW_SSSE3

+ANY21C(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)

+#endif

+#ifdef HAS_NV12TORGB565ROW_AVX2

+ANY21C(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15)

+#endif

+#ifdef HAS_NV12TORGB565ROW_NEON

+ANY21C(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7)

+#endif

+#undef ANY21C

 // Any 1 to 1.

 #define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK)                     \

     void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) {            \

@@ -252,8 +325,10 @@

 ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3)

 ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3)

 #endif

-#if defined(HAS_ARGBTOARGB4444ROW_AVX2)

+#if defined(HAS_ARGBTORGB565ROW_AVX2)

 ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7)

+#endif

+#if defined(HAS_ARGBTOARGB4444ROW_AVX2)

 ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7)

 ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7)

 #endif

@@ -269,9 +344,7 @@

 #if defined(HAS_I400TOARGBROW_AVX2)

 ANY11(I400ToARGBRow_Any_AVX2, I400ToARGBRow_AVX2, 0, 1, 4, 15)

 #endif

-#if defined(HAS_YUY2TOARGBROW_SSSE3)

-ANY11(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15)

-ANY11(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15)

+#if defined(HAS_RGB24TOARGBROW_SSSE3)

 ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15)

 ANY11(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 0, 3, 4, 15)

 ANY11(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 0, 2, 4, 7)

@@ -278,6 +351,9 @@

 ANY11(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 0, 2, 4, 7)

 ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7)

 #endif

+#if defined(HAS_RAWTORGB24ROW_SSSE3)

+ANY11(RAWToRGB24Row_Any_SSSE3, RAWToRGB24Row_SSSE3, 0, 3, 3, 7)

+#endif

 #if defined(HAS_RGB565TOARGBROW_AVX2)

 ANY11(RGB565ToARGBRow_Any_AVX2, RGB565ToARGBRow_AVX2, 0, 2, 4, 15)

 #endif

@@ -287,10 +363,6 @@

 #if defined(HAS_ARGB4444TOARGBROW_AVX2)

 ANY11(ARGB4444ToARGBRow_Any_AVX2, ARGB4444ToARGBRow_AVX2, 0, 2, 4, 15)

 #endif

-#if defined(HAS_YUY2TOARGBROW_AVX2)

-ANY11(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, 1, 4, 4, 31)

-ANY11(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31)

-#endif

 #if defined(HAS_ARGBTORGB24ROW_NEON)

 ANY11(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 0, 4, 3, 7)

 ANY11(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, 0, 4, 3, 7)

@@ -299,9 +371,10 @@

 ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7)

 ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7)

 ANY11(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, 0, 1, 4, 7)

-ANY11(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7)

-ANY11(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7)

 #endif

+#if defined(HAS_RAWTORGB24ROW_NEON)

+ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7)

+#endif

 #ifdef HAS_ARGBTOYROW_AVX2

 ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31)

 #endif

@@ -381,9 +454,6 @@

 #ifdef HAS_ARGBATTENUATEROW_SSSE3

 ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3)

 #endif

-#ifdef HAS_ARGBATTENUATEROW_SSE2

-ANY11(ARGBAttenuateRow_Any_SSE2, ARGBAttenuateRow_SSE2, 0, 4, 4, 3)

-#endif

 #ifdef HAS_ARGBUNATTENUATEROW_SSE2

 ANY11(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, 0, 4, 4, 3)

 #endif

@@ -396,8 +466,44 @@

 #ifdef HAS_ARGBATTENUATEROW_NEON

 ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7)

 #endif

+#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2

+ANY11(ARGBExtractAlphaRow_Any_SSE2, ARGBExtractAlphaRow_SSE2, 0, 4, 1, 7)

+#endif

+#ifdef HAS_ARGBEXTRACTALPHAROW_NEON

+ANY11(ARGBExtractAlphaRow_Any_NEON, ARGBExtractAlphaRow_NEON, 0, 4, 1, 15)

+#endif

 #undef ANY11

+// Any 1 to 1 blended.  Destination is read, modify, write, so the leftover destination pixels are staged into the scratch buffer before ANY_SIMD runs on it.

+#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK)                    \

+    void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) {            \

+      SIMD_ALIGNED(uint8 temp[128 * 2]);  /* 128-byte slots: src, dst */       \

+      memset(temp, 0, 128 * 2);  /* zero both slots; for YUY2 and msan */      \

+      int r = width & MASK;  /* leftover pixels (MASK is 7 or 15 in use) */    \

+      int n = width & ~MASK;  /* width rounded down to whole groups */         \

+      if (n > 0) {                                                             \

+        ANY_SIMD(src_ptr, dst_ptr, n);                                         \

+      }                                                                        \

+      memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP);    \

+      memcpy(temp + 128, dst_ptr + n * BPP, r * BPP);  /* current dst tail */  \

+      ANY_SIMD(temp, temp + 128, MASK + 1);  /* one full group on the copy */  \

+      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);  /* write tail back */   \

+    }

+#ifdef HAS_ARGBCOPYALPHAROW_AVX2

+ANY11B(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 4, 4, 15)

+#endif

+#ifdef HAS_ARGBCOPYALPHAROW_SSE2

+ANY11B(ARGBCopyAlphaRow_Any_SSE2, ARGBCopyAlphaRow_SSE2, 0, 4, 4, 7)

+#endif

+#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2

+ANY11B(ARGBCopyYToAlphaRow_Any_AVX2, ARGBCopyYToAlphaRow_AVX2, 0, 1, 4, 15)

+#endif

+#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2

+ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7)

+#endif

+#undef ANY11B

 // Any 1 to 1 with parameter.

 #define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK)                          \

     void NAMEANY(const uint8* src_ptr, uint8* dst_ptr,                         \

@@ -440,6 +546,35 @@

 #endif

 #undef ANY11P

+// Any 1 to 1 with yuvconstants.  Defines NAMEANY, which runs ANY_SIMD on whole groups of MASK + 1 pixels and handles the leftover pixels through a scratch buffer.

+#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK)                    \

+    void NAMEANY(const uint8* src_ptr, uint8* dst_ptr,                         \

+                 const struct YuvConstants* yuvconstants, int width) {         \

+      SIMD_ALIGNED(uint8 temp[128 * 2]);  /* 128-byte slots: src, out */       \

+      memset(temp, 0, 128);  /* zero the src slot; for YUY2 and msan */        \

+      int r = width & MASK;  /* leftover pixels (MASK is 7, 15 or 31) */       \

+      int n = width & ~MASK;  /* width rounded down to whole groups */         \

+      if (n > 0) {                                                             \

+        ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n);                           \

+      }                                                                        \

+      memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP);    \

+      ANY_SIMD(temp, temp + 128, yuvconstants, MASK + 1);  /* on the copy */   \

+      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);  /* leftover only */     \

+    }

+#if defined(HAS_YUY2TOARGBROW_SSSE3)

+ANY11C(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15)

+ANY11C(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15)

+#endif

+#if defined(HAS_YUY2TOARGBROW_AVX2)

+ANY11C(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, 1, 4, 4, 31)

+ANY11C(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31)

+#endif

+#if defined(HAS_YUY2TOARGBROW_NEON)

+ANY11C(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7)

+ANY11C(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7)

+#endif

+#undef ANY11C

 // Any 1 to 1 interpolate.  Takes 2 rows of source via stride.

 #define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK)                             \

     void NAMEANY(uint8* dst_ptr, const uint8* src_ptr,                         \

@@ -464,14 +599,11 @@

 #ifdef HAS_INTERPOLATEROW_SSSE3

 ANY11T(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15)

 #endif

-#ifdef HAS_INTERPOLATEROW_SSE2

-ANY11T(InterpolateRow_Any_SSE2, InterpolateRow_SSE2, 1, 1, 15)

-#endif

 #ifdef HAS_INTERPOLATEROW_NEON

 ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15)

 #endif

-#ifdef HAS_INTERPOLATEROW_MIPS_DSPR2

-ANY11T(InterpolateRow_Any_MIPS_DSPR2, InterpolateRow_MIPS_DSPR2, 1, 1, 3)

+#ifdef HAS_INTERPOLATEROW_DSPR2

+ANY11T(InterpolateRow_Any_DSPR2, InterpolateRow_DSPR2, 1, 1, 3)

 #endif

 #undef ANY11T

@@ -496,9 +628,6 @@

 #ifdef HAS_MIRRORROW_SSSE3

 ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15)

 #endif

-#ifdef HAS_MIRRORROW_SSE2

-ANY11M(MirrorRow_Any_SSE2, MirrorRow_SSE2, 1, 15)

-#endif

 #ifdef HAS_MIRRORROW_NEON

 ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15)

 #endif

@@ -548,10 +677,26 @@

         ANY_SIMD(src_ptr, dst_u, dst_v, n);                                    \

       }                                                                        \

       memcpy(temp, src_ptr  + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP);     \

-      if ((width & 1) && BPP == 4) {  /* repeat last 4 bytes for subsampler */ \

+      /* repeat last 4 bytes for 422 subsampler */                             \

+      if ((width & 1) && BPP == 4 && DUVSHIFT == 1) {                          \

         memcpy(temp + SS(r, UVSHIFT) * BPP,                                    \

-               temp + SS(r, UVSHIFT) * BPP - BPP, 4);                          \

+               temp + SS(r, UVSHIFT) * BPP - BPP, BPP);                        \

       }                                                                        \

+      /* repeat last 4 - 12 bytes for 411 subsampler */                        \

+      if (((width & 3) == 1) && BPP == 4 && DUVSHIFT == 2) {                   \

+        memcpy(temp + SS(r, UVSHIFT) * BPP,                                    \

+               temp + SS(r, UVSHIFT) * BPP - BPP, BPP);                        \

+        memcpy(temp + SS(r, UVSHIFT) * BPP + BPP,                              \

+               temp + SS(r, UVSHIFT) * BPP - BPP, BPP * 2);                    \

+      }                                                                        \

+      if (((width & 3) == 2) && BPP == 4 && DUVSHIFT == 2) {                   \

+        memcpy(temp + SS(r, UVSHIFT) * BPP,                                    \

+               temp + SS(r, UVSHIFT) * BPP - BPP * 2, BPP * 2);                \

+      }                                                                        \

+      if (((width & 3) == 3) && BPP == 4 && DUVSHIFT == 2) {                   \

+        memcpy(temp + SS(r, UVSHIFT) * BPP,                                    \

+               temp + SS(r, UVSHIFT) * BPP - BPP, BPP);                        \

+      }                                                                        \

       ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1);                        \

       memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT));            \

       memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT));            \

@@ -566,8 +711,8 @@

 #ifdef HAS_SPLITUVROW_NEON

 ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15)

 #endif

-#ifdef HAS_SPLITUVROW_MIPS_DSPR2

-ANY12(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_MIPS_DSPR2, 0, 2, 0, 15)

+#ifdef HAS_SPLITUVROW_DSPR2

+ANY12(SplitUVRow_Any_DSPR2, SplitUVRow_DSPR2, 0, 2, 0, 15)

 #endif

 #ifdef HAS_ARGBTOUV444ROW_SSSE3

 ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15)

@@ -576,9 +721,6 @@

 ANY12(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, 1, 4, 1, 31)

 ANY12(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, 1, 4, 1, 31)

 #endif

-#ifdef HAS_ARGBTOUV422ROW_SSSE3

-ANY12(ARGBToUV422Row_Any_SSSE3, ARGBToUV422Row_SSSE3, 0, 4, 1, 15)

-#endif

 #ifdef HAS_YUY2TOUV422ROW_SSE2

 ANY12(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_SSE2, 1, 4, 1, 15)

 ANY12(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, 1, 4, 1, 15)

@@ -585,7 +727,6 @@

 #endif

 #ifdef HAS_YUY2TOUV422ROW_NEON

 ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7)

-ANY12(ARGBToUV422Row_Any_NEON, ARGBToUV422Row_NEON, 0, 4, 1, 15)

 ANY12(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON, 0, 4, 2, 31)

 ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15)

 ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15)

@@ -607,11 +748,11 @@

       memcpy(temp, src_ptr  + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP);     \

       memcpy(temp + 128, src_ptr  + src_stride_ptr + (n >> UVSHIFT) * BPP,     \

              SS(r, UVSHIFT) * BPP);                                            \

-      if ((width & 1) && BPP == 4) {  /* repeat last 4 bytes for subsampler */ \

+      if ((width & 1) && UVSHIFT == 0) {  /* repeat last pixel for subsample */\

         memcpy(temp + SS(r, UVSHIFT) * BPP,                                    \

-               temp + SS(r, UVSHIFT) * BPP - BPP, 4);                          \

+               temp + SS(r, UVSHIFT) * BPP - BPP, BPP);                        \

         memcpy(temp + 128 + SS(r, UVSHIFT) * BPP,                              \

-               temp + 128 + SS(r, UVSHIFT) * BPP - BPP, 4);                    \

+               temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP);                  \

       }                                                                        \

       ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1);                   \

       memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1));                          \

@@ -620,6 +761,9 @@

 #ifdef HAS_ARGBTOUVROW_AVX2

 ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31)

+#endif

+#ifdef HAS_ARGBTOUVJROW_AVX2

+ANY12S(ARGBToUVJRow_Any_AVX2, ARGBToUVJRow_AVX2, 0, 4, 31)

 #endif

 #ifdef HAS_ARGBTOUVROW_SSSE3

 ANY12S(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, 0, 4, 15)

--- a/third_party/libyuv/source/row_common.cc

+++ b/third_party/libyuv/source/row_common.cc

@@ -100,6 +100,20 @@

+void RAWToRGB24Row_C(const uint8* src_raw, uint8* dst_rgb24, int width) {

+  int x;

+  for (x = 0; x < width; ++x) {

+    uint8 r = src_raw[0];

+    uint8 g = src_raw[1];

+    uint8 b = src_raw[2];

+    dst_rgb24[0] = b;

+    dst_rgb24[1] = g;

+    dst_rgb24[2] = r;

+    dst_rgb24 += 3;

+    src_raw += 3;

+  }

+}

 void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) {

   int x;

   for (x = 0; x < width; ++x) {

@@ -419,28 +433,6 @@

 MAKEROWYJ(ARGB, 2, 1, 0, 4)

 #undef MAKEROWYJ

-void ARGBToUVJ422Row_C(const uint8* src_argb,

-                       uint8* dst_u, uint8* dst_v, int width) {

-  int x;

-  for (x = 0; x < width - 1; x += 2) {

-    uint8 ab = (src_argb[0] + src_argb[4]) >> 1;

-    uint8 ag = (src_argb[1] + src_argb[5]) >> 1;

-    uint8 ar = (src_argb[2] + src_argb[6]) >> 1;

-    dst_u[0] = RGBToUJ(ar, ag, ab);

-    dst_v[0] = RGBToVJ(ar, ag, ab);

-    src_argb += 8;

-    dst_u += 1;

-    dst_v += 1;

-  }

-  if (width & 1) {

-    uint8 ab = src_argb[0];

-    uint8 ag = src_argb[1];

-    uint8 ar = src_argb[2];

-    dst_u[0] = RGBToUJ(ar, ag, ab);

-    dst_v[0] = RGBToVJ(ar, ag, ab);

-  }

-}

 void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) {

   int x;

   for (x = 0; x < width; ++x) {

@@ -644,28 +636,6 @@

-void ARGBToUV422Row_C(const uint8* src_argb,

-                      uint8* dst_u, uint8* dst_v, int width) {

-  int x;

-  for (x = 0; x < width - 1; x += 2) {

-    uint8 ab = (src_argb[0] + src_argb[4]) >> 1;

-    uint8 ag = (src_argb[1] + src_argb[5]) >> 1;

-    uint8 ar = (src_argb[2] + src_argb[6]) >> 1;

-    dst_u[0] = RGBToU(ar, ag, ab);

-    dst_v[0] = RGBToV(ar, ag, ab);

-    src_argb += 8;

-    dst_u += 1;

-    dst_v += 1;

-  }

-  if (width & 1) {

-    uint8 ab = src_argb[0];

-    uint8 ag = src_argb[1];

-    uint8 ar = src_argb[2];

-    dst_u[0] = RGBToU(ar, ag, ab);

-    dst_v[0] = RGBToV(ar, ag, ab);

-  }

-}

 void ARGBToUV411Row_C(const uint8* src_argb,

                       uint8* dst_u, uint8* dst_v, int width) {

   int x;

@@ -679,10 +649,11 @@

     dst_u += 1;

     dst_v += 1;

+  // Odd width handling mimics 'any' function which replicates last pixel.

   if ((width & 3) == 3) {

-    uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8]) / 3;

-    uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9]) / 3;

-    uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10]) / 3;

+    uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[8]) >> 2;

+    uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[9]) >> 2;

+    uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[10]) >> 2;

     dst_u[0] = RGBToU(ar, ag, ab);

     dst_v[0] = RGBToV(ar, ag, ab);

   } else if ((width & 3) == 2) {

@@ -994,6 +965,9 @@

+// TODO(fbarchard): Unify these structures to be platform independent.

+// TODO(fbarchard): Generate SIMD structures from float matrix.

 // BT.601 YUV to RGB reference

 //  R = (Y - 16) * 1.164              - V * -1.596

 //  G = (Y - 16) * 1.164 - U *  0.391 - V *  0.813

@@ -1000,7 +974,6 @@

 //  B = (Y - 16) * 1.164 - U * -2.018

 // Y contribution to R,G,B.  Scale and bias.

-// TODO(fbarchard): Consider moving constants into a common header.

 #define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */

 #define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */

@@ -1011,36 +984,76 @@

 #define VR -102 /* round(-1.596 * 64) */

 // Bias values to subtract 16 from Y and 128 from U and V.

-#define BB (UB * 128 + YGB)

+#define BB (UB * 128            + YGB)

 #define BG (UG * 128 + VG * 128 + YGB)

-#define BR (VR * 128 + YGB)

+#define BR            (VR * 128 + YGB)

-// C reference code that mimics the YUV assembly.

-static __inline void YuvPixel(uint8 y, uint8 u, uint8 v,

-                              uint8* b, uint8* g, uint8* r) {

-  uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;

-  *b = Clamp((int32)(-(u * UB) + y1 + BB) >> 6);

-  *g = Clamp((int32)(-(v * VG + u * UG) + y1 + BG) >> 6);

-  *r = Clamp((int32)(-(v * VR)+ y1 + BR) >> 6);

-}

+#if defined(__aarch64__)

+const YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {

+  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },

+  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },

+  { UG, VG, UG, VG, UG, VG, UG, VG },

+  { UG, VG, UG, VG, UG, VG, UG, VG },

+  { BB, BG, BR, 0, 0, 0, 0, 0 },

+  { 0x0101 * YG, 0, 0, 0 }

+};

+const YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {

+  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },

+  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },

+  { VG, UG, VG, UG, VG, UG, VG, UG },

+  { VG, UG, VG, UG, VG, UG, VG, UG },

+  { BR, BG, BB, 0, 0, 0, 0, 0 },

+  { 0x0101 * YG, 0, 0, 0 }

+};

+#elif defined(__arm__)

+const YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {

+  { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 },

+  { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 },

+  { BB, BG, BR, 0, 0, 0, 0, 0 },

+  { 0x0101 * YG, 0, 0, 0 }

+};

+const YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {

+  { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 },

+  { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 },

+  { BR, BG, BB, 0, 0, 0, 0, 0 },

+  { 0x0101 * YG, 0, 0, 0 }

+};

+#else

+const YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {

+  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,

+    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },

+  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,

+    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },

+  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,

+    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },

+  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },

+  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },

+  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },

+  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }

+};

+const YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {

+  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,

+    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },

+  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,

+    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },

+  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,

+    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },

+  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },

+  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },

+  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },

+  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }

+};

+#endif

-// C reference code that mimics the YUV assembly.

-static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) {

-  uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;

-  *b = Clamp((int32)(y1 + YGB) >> 6);

-  *g = Clamp((int32)(y1 + YGB) >> 6);

-  *r = Clamp((int32)(y1 + YGB) >> 6);

-}

-#undef YG

+#undef BB

+#undef BG

+#undef BR

 #undef YGB

 #undef UB

 #undef UG

 #undef VG

 #undef VR

-#undef BB

-#undef BG

-#undef BR

+#undef YG

 // JPEG YUV to RGB reference

 // *  R = Y                - V * -1.40200

@@ -1048,40 +1061,229 @@

 // *  B = Y - U * -1.77200

 // Y contribution to R,G,B.  Scale and bias.

-// TODO(fbarchard): Consider moving constants into a common header.

-#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */

-#define YGBJ 32  /* 64 / 2 */

+#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */

+#define YGB 32  /* 64 / 2 */

 // U and V contributions to R,G,B.

-#define UBJ -113 /* round(-1.77200 * 64) */

-#define UGJ 22 /* round(0.34414 * 64) */

-#define VGJ 46 /* round(0.71414  * 64) */

-#define VRJ -90 /* round(-1.40200 * 64) */

+#define UB -113 /* round(-1.77200 * 64) */

+#define UG 22 /* round(0.34414 * 64) */

+#define VG 46 /* round(0.71414  * 64) */

+#define VR -90 /* round(-1.40200 * 64) */

-// Bias values to subtract 16 from Y and 128 from U and V.

-#define BBJ (UBJ * 128 + YGBJ)

-#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)

-#define BRJ (VRJ * 128 + YGBJ)

+// Bias values to round, and subtract 128 from U and V.

+#define BB (UB * 128            + YGB)

+#define BG (UG * 128 + VG * 128 + YGB)

+#define BR            (VR * 128 + YGB)

+#if defined(__aarch64__)

+const YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {

+  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },

+  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },

+  { UG, VG, UG, VG, UG, VG, UG, VG },

+  { UG, VG, UG, VG, UG, VG, UG, VG },

+  { BB, BG, BR, 0, 0, 0, 0, 0 },

+  { 0x0101 * YG, 0, 0, 0 }

+};

+const YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {

+  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },

+  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },

+  { VG, UG, VG, UG, VG, UG, VG, UG },

+  { VG, UG, VG, UG, VG, UG, VG, UG },

+  { BR, BG, BB, 0, 0, 0, 0, 0 },

+  { 0x0101 * YG, 0, 0, 0 }

+};

+#elif defined(__arm__)

+const YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {

+  { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 },

+  { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 },

+  { BB, BG, BR, 0, 0, 0, 0, 0 },

+  { 0x0101 * YG, 0, 0, 0 }

+};

+const YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {

+  { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 },

+  { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 },

+  { BR, BG, BB, 0, 0, 0, 0, 0 },

+  { 0x0101 * YG, 0, 0, 0 }

+};

+#else

+const YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {

+  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,

+    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },

+  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,

+    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },

+  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,

+    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },

+  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },

+  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },

+  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },

+  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }

+};

+const YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {

+  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,

+    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },

+  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,

+    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },

+  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,

+    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },

+  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },

+  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },

+  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },

+  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }

+};

+#endif

+#undef BB

+#undef BG

+#undef BR

+#undef YGB

+#undef UB

+#undef UG

+#undef VG

+#undef VR

+#undef YG

+// BT.709 YUV to RGB reference

+// *  R = Y                - V * -1.28033

+// *  G = Y - U *  0.21482 - V *  0.38059

+// *  B = Y - U * -2.12798

+// Y contribution to R,G,B.  Scale and bias.

+#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */

+#define YGB 32  /* 64 / 2 */

+// TODO(fbarchard): Find way to express 2.12 instead of 2.0.

+// U and V contributions to R,G,B.

+#define UB -128 /* max(-128, round(-2.12798 * 64)) */

+#define UG 14 /* round(0.21482 * 64) */

+#define VG 24 /* round(0.38059  * 64) */

+#define VR -82 /* round(-1.28033 * 64) */

+// Bias values to round, and subtract 128 from U and V.

+#define BB (UB * 128            + YGB)

+#define BG (UG * 128 + VG * 128 + YGB)

+#define BR            (VR * 128 + YGB)

+#if defined(__aarch64__)

+const YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {

+  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },

+  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },

+  { UG, VG, UG, VG, UG, VG, UG, VG },

+  { UG, VG, UG, VG, UG, VG, UG, VG },

+  { BB, BG, BR, 0, 0, 0, 0, 0 },

+  { 0x0101 * YG, 0, 0, 0 }

+};

+const YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {

+  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },

+  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },

+  { VG, UG, VG, UG, VG, UG, VG, UG },

+  { VG, UG, VG, UG, VG, UG, VG, UG },

+  { BR, BG, BB, 0, 0, 0, 0, 0 },

+  { 0x0101 * YG, 0, 0, 0 }

+};

+#elif defined(__arm__)

+const YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {

+  { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 },

+  { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 },

+  { BB, BG, BR, 0, 0, 0, 0, 0 },

+  { 0x0101 * YG, 0, 0, 0 }

+};

+const YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {

+  { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 },

+  { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 },

+  { BR, BG, BB, 0, 0, 0, 0, 0 },

+  { 0x0101 * YG, 0, 0, 0 }

+};

+#else

+const YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {

+  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,

+    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },

+  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,

+    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },

+  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,

+    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },

+  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },

+  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },

+  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },

+  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }

+};

+const YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {

+  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,

+    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },

+  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,

+    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },

+  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,

+    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },

+  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },

+  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },

+  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },

+  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }

+};

+#endif

+#undef BB

+#undef BG

+#undef BR

+#undef YGB

+#undef UB

+#undef UG

+#undef VG

+#undef VR

+#undef YG

 // C reference code that mimics the YUV assembly.

-static __inline void YuvJPixel(uint8 y, uint8 u, uint8 v,

-                               uint8* b, uint8* g, uint8* r) {

-  uint32 y1 = (uint32)(y * 0x0101 * YGJ) >> 16;

-  *b = Clamp((int32)(-(u * UBJ) + y1 + BBJ) >> 6);

-  *g = Clamp((int32)(-(v * VGJ + u * UGJ) + y1 + BGJ) >> 6);

-  *r = Clamp((int32)(-(v * VRJ) + y1 + BRJ) >> 6);

+static __inline void YuvPixel(uint8 y, uint8 u, uint8 v,

+                              uint8* b, uint8* g, uint8* r,

+                              const struct YuvConstants* yuvconstants) {

+#if defined(__aarch64__)

+  int ub = -yuvconstants->kUVToRB[0];

+  int ug = yuvconstants->kUVToG[0];

+  int vg = yuvconstants->kUVToG[1];

+  int vr = -yuvconstants->kUVToRB[1];

+  int bb = yuvconstants->kUVBiasBGR[0];

+  int bg = yuvconstants->kUVBiasBGR[1];

+  int br = yuvconstants->kUVBiasBGR[2];

+  int yg = yuvconstants->kYToRgb[0] / 0x0101;

+#elif defined(__arm__)

+  int ub = -yuvconstants->kUVToRB[0];

+  int ug = yuvconstants->kUVToG[0];

+  int vg = yuvconstants->kUVToG[4];

+  int vr = -yuvconstants->kUVToRB[4];

+  int bb = yuvconstants->kUVBiasBGR[0];

+  int bg = yuvconstants->kUVBiasBGR[1];

+  int br = yuvconstants->kUVBiasBGR[2];

+  int yg = yuvconstants->kYToRgb[0] / 0x0101;

+#else

+  int ub = yuvconstants->kUVToB[0];

+  int ug = yuvconstants->kUVToG[0];

+  int vg = yuvconstants->kUVToG[1];

+  int vr = yuvconstants->kUVToR[1];

+  int bb = yuvconstants->kUVBiasB[0];

+  int bg = yuvconstants->kUVBiasG[0];

+  int br = yuvconstants->kUVBiasR[0];

+  int yg = yuvconstants->kYToRgb[0];

+#endif

+  uint32 y1 = (uint32)(y * 0x0101 * yg) >> 16;

+  *b = Clamp((int32)(-(u * ub         ) + y1 + bb) >> 6);

+  *g = Clamp((int32)(-(u * ug + v * vg) + y1 + bg) >> 6);

+  *r = Clamp((int32)(-(         v * vr) + y1 + br) >> 6);

-#undef YGJ

-#undef YGBJ

-#undef UBJ

-#undef UGJ

-#undef VGJ

-#undef VRJ

-#undef BBJ

-#undef BGJ

-#undef BRJ

+// Y contribution to R,G,B.  Scale and bias.

+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */

+#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */

+// C reference code that mimics the YUV assembly.

+static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) {

+  uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;

+  *b = Clamp((int32)(y1 + YGB) >> 6);

+  *g = Clamp((int32)(y1 + YGB) >> 6);

+  *r = Clamp((int32)(y1 + YGB) >> 6);

+}

+#undef YG

+#undef YGB

 #if !defined(LIBYUV_DISABLE_NEON) && \

     (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON))

 // C mimic assembly.

@@ -1090,14 +1292,17 @@

                      const uint8* src_u,

                      const uint8* src_v,

                      uint8* rgb_buf,

+                     const struct YuvConstants* yuvconstants,

                      int width) {

   int x;

   for (x = 0; x < width - 1; x += 2) {

     uint8 u = (src_u[0] + src_u[1] + 1) >> 1;

     uint8 v = (src_v[0] + src_v[1] + 1) >> 1;

-    YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);

+    YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2,

+             yuvconstants);

     rgb_buf[3] = 255;

-    YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);

+    YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6,

+             yuvconstants);

     rgb_buf[7] = 255;

     src_y += 2;

     src_u += 2;

@@ -1106,7 +1311,8 @@

   if (width & 1) {

     YuvPixel(src_y[0], src_u[0], src_v[0],

-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);

+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);

+    rgb_buf[3] = 255;

 #else

@@ -1114,11 +1320,12 @@

                      const uint8* src_u,

                      const uint8* src_v,

                      uint8* rgb_buf,

+                     const struct YuvConstants* yuvconstants,

                      int width) {

   int x;

   for (x = 0; x < width; ++x) {

     YuvPixel(src_y[0], src_u[0], src_v[0],

-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);

+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);

     rgb_buf[3] = 255;

     src_y += 1;

     src_u += 1;

@@ -1133,14 +1340,15 @@

                      const uint8* src_u,

                      const uint8* src_v,

                      uint8* rgb_buf,

+                     const struct YuvConstants* yuvconstants,

                      int width) {

   int x;

   for (x = 0; x < width - 1; x += 2) {

     YuvPixel(src_y[0], src_u[0], src_v[0],

-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);

+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);

     rgb_buf[3] = 255;

     YuvPixel(src_y[1], src_u[0], src_v[0],

-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);

+             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);

     rgb_buf[7] = 255;

     src_y += 2;

     src_u += 1;

@@ -1149,33 +1357,36 @@

   if (width & 1) {

     YuvPixel(src_y[0], src_u[0], src_v[0],

-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);

+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);

     rgb_buf[3] = 255;

-void J422ToARGBRow_C(const uint8* src_y,

-                     const uint8* src_u,

-                     const uint8* src_v,

-                     uint8* rgb_buf,

-                     int width) {

+void I422AlphaToARGBRow_C(const uint8* src_y,

+                          const uint8* src_u,

+                          const uint8* src_v,

+                          const uint8* src_a,

+                          uint8* rgb_buf,

+                          const struct YuvConstants* yuvconstants,

+                          int width) {

   int x;

   for (x = 0; x < width - 1; x += 2) {

-    YuvJPixel(src_y[0], src_u[0], src_v[0],

-              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);

-    rgb_buf[3] = 255;

-    YuvJPixel(src_y[1], src_u[0], src_v[0],

-              rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);

-    rgb_buf[7] = 255;

+    YuvPixel(src_y[0], src_u[0], src_v[0],

+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);

+    rgb_buf[3] = src_a[0];

+    YuvPixel(src_y[1], src_u[0], src_v[0],

+             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);

+    rgb_buf[7] = src_a[1];

     src_y += 2;

     src_u += 1;

     src_v += 1;

+    src_a += 2;

     rgb_buf += 8;  // Advance 2 pixels.

   if (width & 1) {

-    YuvJPixel(src_y[0], src_u[0], src_v[0],

-              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);

-    rgb_buf[3] = 255;

+    YuvPixel(src_y[0], src_u[0], src_v[0],

+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);

+    rgb_buf[3] = src_a[0];

@@ -1183,13 +1394,14 @@

                       const uint8* src_u,

                       const uint8* src_v,

                       uint8* rgb_buf,

+                      const struct YuvConstants* yuvconstants,

                       int width) {

   int x;

   for (x = 0; x < width - 1; x += 2) {

     YuvPixel(src_y[0], src_u[0], src_v[0],

-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);

+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);

     YuvPixel(src_y[1], src_u[0], src_v[0],

-             rgb_buf + 3, rgb_buf + 4, rgb_buf + 5);

+             rgb_buf + 3, rgb_buf + 4, rgb_buf + 5, yuvconstants);

     src_y += 2;

     src_u += 1;

     src_v += 1;

@@ -1197,36 +1409,15 @@

   if (width & 1) {

     YuvPixel(src_y[0], src_u[0], src_v[0],

-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);

+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);

-void I422ToRAWRow_C(const uint8* src_y,

-                    const uint8* src_u,

-                    const uint8* src_v,

-                    uint8* rgb_buf,

-                    int width) {

-  int x;

-  for (x = 0; x < width - 1; x += 2) {

-    YuvPixel(src_y[0], src_u[0], src_v[0],

-             rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);

-    YuvPixel(src_y[1], src_u[0], src_v[0],

-             rgb_buf + 5, rgb_buf + 4, rgb_buf + 3);

-    src_y += 2;

-    src_u += 1;

-    src_v += 1;

-    rgb_buf += 6;  // Advance 2 pixels.

-  }

-  if (width & 1) {

-    YuvPixel(src_y[0], src_u[0], src_v[0],

-             rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);

-  }

-}

 void I422ToARGB4444Row_C(const uint8* src_y,

                          const uint8* src_u,

                          const uint8* src_v,

                          uint8* dst_argb4444,

+                         const struct YuvConstants* yuvconstants,

                          int width) {

   uint8 b0;

   uint8 g0;

@@ -1236,8 +1427,8 @@

   uint8 r1;

   int x;

   for (x = 0; x < width - 1; x += 2) {

-    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);

-    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);

+    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);

+    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);

     b0 = b0 >> 4;

     g0 = g0 >> 4;

     r0 = r0 >> 4;

@@ -1252,7 +1443,7 @@

     dst_argb4444 += 4;  // Advance 2 pixels.

   if (width & 1) {

-    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);

+    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);

     b0 = b0 >> 4;

     g0 = g0 >> 4;

     r0 = r0 >> 4;

@@ -1265,6 +1456,7 @@

                          const uint8* src_u,

                          const uint8* src_v,

                          uint8* dst_argb1555,

+                         const struct YuvConstants* yuvconstants,

                          int width) {

   uint8 b0;

   uint8 g0;

@@ -1274,8 +1466,8 @@

   uint8 r1;

   int x;

   for (x = 0; x < width - 1; x += 2) {

-    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);

-    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);

+    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);

+    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);

     b0 = b0 >> 3;

     g0 = g0 >> 3;

     r0 = r0 >> 3;

@@ -1290,7 +1482,7 @@

     dst_argb1555 += 4;  // Advance 2 pixels.

   if (width & 1) {

-    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);

+    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);

     b0 = b0 >> 3;

     g0 = g0 >> 3;

     r0 = r0 >> 3;

@@ -1303,6 +1495,7 @@

                        const uint8* src_u,

                        const uint8* src_v,

                        uint8* dst_rgb565,

+                       const struct YuvConstants* yuvconstants,

                        int width) {

   uint8 b0;

   uint8 g0;

@@ -1312,8 +1505,8 @@

   uint8 r1;

   int x;

   for (x = 0; x < width - 1; x += 2) {

-    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);

-    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);

+    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);

+    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);

     b0 = b0 >> 3;

     g0 = g0 >> 2;

     r0 = r0 >> 3;

@@ -1328,7 +1521,7 @@

     dst_rgb565 += 4;  // Advance 2 pixels.

   if (width & 1) {

-    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);

+    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);

     b0 = b0 >> 3;

     g0 = g0 >> 2;

     r0 = r0 >> 3;

@@ -1340,20 +1533,21 @@

                      const uint8* src_u,

                      const uint8* src_v,

                      uint8* rgb_buf,

+                     const struct YuvConstants* yuvconstants,

                      int width) {

   int x;

   for (x = 0; x < width - 3; x += 4) {

     YuvPixel(src_y[0], src_u[0], src_v[0],

-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);

+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);

     rgb_buf[3] = 255;

     YuvPixel(src_y[1], src_u[0], src_v[0],

-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);

+             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);

     rgb_buf[7] = 255;

     YuvPixel(src_y[2], src_u[0], src_v[0],

-             rgb_buf + 8, rgb_buf + 9, rgb_buf + 10);

+             rgb_buf + 8, rgb_buf + 9, rgb_buf + 10, yuvconstants);

     rgb_buf[11] = 255;

     YuvPixel(src_y[3], src_u[0], src_v[0],

-             rgb_buf + 12, rgb_buf + 13, rgb_buf + 14);

+             rgb_buf + 12, rgb_buf + 13, rgb_buf + 14, yuvconstants);

     rgb_buf[15] = 255;

     src_y += 4;

     src_u += 1;

@@ -1362,10 +1556,10 @@

   if (width & 2) {

     YuvPixel(src_y[0], src_u[0], src_v[0],

-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);

+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);

     rgb_buf[3] = 255;

     YuvPixel(src_y[1], src_u[0], src_v[0],

-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);

+             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);

     rgb_buf[7] = 255;

     src_y += 2;

     rgb_buf += 8;  // Advance 2 pixels.

@@ -1372,7 +1566,7 @@

   if (width & 1) {

     YuvPixel(src_y[0], src_u[0], src_v[0],

-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);

+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);

     rgb_buf[3] = 255;

@@ -1380,14 +1574,15 @@

 void NV12ToARGBRow_C(const uint8* src_y,

                      const uint8* src_uv,

                      uint8* rgb_buf,

+                     const struct YuvConstants* yuvconstants,

                      int width) {

   int x;

   for (x = 0; x < width - 1; x += 2) {

     YuvPixel(src_y[0], src_uv[0], src_uv[1],

-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);

+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);

     rgb_buf[3] = 255;

     YuvPixel(src_y[1], src_uv[0], src_uv[1],

-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);

+             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);

     rgb_buf[7] = 255;

     src_y += 2;

     src_uv += 2;

@@ -1395,7 +1590,7 @@

   if (width & 1) {

     YuvPixel(src_y[0], src_uv[0], src_uv[1],

-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);

+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);

     rgb_buf[3] = 255;

@@ -1403,17 +1598,16 @@

 void NV21ToARGBRow_C(const uint8* src_y,

                      const uint8* src_vu,

                      uint8* rgb_buf,

+                     const struct YuvConstants* yuvconstants,

                      int width) {

   int x;

   for (x = 0; x < width - 1; x += 2) {

     YuvPixel(src_y[0], src_vu[1], src_vu[0],

-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);

+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);

     rgb_buf[3] = 255;

     YuvPixel(src_y[1], src_vu[1], src_vu[0],

-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);

+             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);

     rgb_buf[7] = 255;

     src_y += 2;

     src_vu += 2;

     rgb_buf += 8;  // Advance 2 pixels.

@@ -1420,7 +1614,7 @@

   if (width & 1) {

     YuvPixel(src_y[0], src_vu[1], src_vu[0],

-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);

+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);

     rgb_buf[3] = 255;

@@ -1428,6 +1622,7 @@

 void NV12ToRGB565Row_C(const uint8* src_y,

                        const uint8* src_uv,

                        uint8* dst_rgb565,

+                       const struct YuvConstants* yuvconstants,

                        int width) {

   uint8 b0;

   uint8 g0;

@@ -1437,8 +1632,8 @@

   uint8 r1;

   int x;

   for (x = 0; x < width - 1; x += 2) {

-    YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0);

-    YuvPixel(src_y[1], src_uv[0], src_uv[1], &b1, &g1, &r1);

+    YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants);

+    YuvPixel(src_y[1], src_uv[0], src_uv[1], &b1, &g1, &r1, yuvconstants);

     b0 = b0 >> 3;

     g0 = g0 >> 2;

     r0 = r0 >> 3;

@@ -1452,7 +1647,7 @@

     dst_rgb565 += 4;  // Advance 2 pixels.

   if (width & 1) {

-    YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0);

+    YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants);

     b0 = b0 >> 3;

     g0 = g0 >> 2;

     r0 = r0 >> 3;

@@ -1460,51 +1655,17 @@

-void NV21ToRGB565Row_C(const uint8* src_y,

-                       const uint8* vsrc_u,

-                       uint8* dst_rgb565,

-                       int width) {

-  uint8 b0;

-  uint8 g0;

-  uint8 r0;

-  uint8 b1;

-  uint8 g1;

-  uint8 r1;

-  int x;

-  for (x = 0; x < width - 1; x += 2) {

-    YuvPixel(src_y[0], vsrc_u[1], vsrc_u[0], &b0, &g0, &r0);

-    YuvPixel(src_y[1], vsrc_u[1], vsrc_u[0], &b1, &g1, &r1);

-    b0 = b0 >> 3;

-    g0 = g0 >> 2;

-    r0 = r0 >> 3;

-    b1 = b1 >> 3;

-    g1 = g1 >> 2;

-    r1 = r1 >> 3;

-    *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |

-        (b1 << 16) | (g1 << 21) | (r1 << 27);

-    src_y += 2;

-    vsrc_u += 2;

-    dst_rgb565 += 4;  // Advance 2 pixels.

-  }

-  if (width & 1) {

-    YuvPixel(src_y[0], vsrc_u[1], vsrc_u[0], &b0, &g0, &r0);

-    b0 = b0 >> 3;

-    g0 = g0 >> 2;

-    r0 = r0 >> 3;

-    *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);

-  }

-}

 void YUY2ToARGBRow_C(const uint8* src_yuy2,

                      uint8* rgb_buf,

+                     const struct YuvConstants* yuvconstants,

                      int width) {

   int x;

   for (x = 0; x < width - 1; x += 2) {

     YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],

-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);

+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);

     rgb_buf[3] = 255;

     YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3],

-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);

+             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);

     rgb_buf[7] = 255;

     src_yuy2 += 4;

     rgb_buf += 8;  // Advance 2 pixels.

@@ -1511,7 +1672,7 @@

   if (width & 1) {

     YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],

-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);

+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);

     rgb_buf[3] = 255;

@@ -1518,14 +1679,15 @@

 void UYVYToARGBRow_C(const uint8* src_uyvy,

                      uint8* rgb_buf,

+                     const struct YuvConstants* yuvconstants,

                      int width) {

   int x;

   for (x = 0; x < width - 1; x += 2) {

     YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],

-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);

+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);

     rgb_buf[3] = 255;

     YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2],

-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);

+             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);

     rgb_buf[7] = 255;

     src_uyvy += 4;

     rgb_buf += 8;  // Advance 2 pixels.

@@ -1532,73 +1694,24 @@

   if (width & 1) {

     YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],

-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);

+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);

     rgb_buf[3] = 255;

-void I422ToBGRARow_C(const uint8* src_y,

-                     const uint8* src_u,

-                     const uint8* src_v,

-                     uint8* rgb_buf,

-                     int width) {

-  int x;

-  for (x = 0; x < width - 1; x += 2) {

-    YuvPixel(src_y[0], src_u[0], src_v[0],

-             rgb_buf + 3, rgb_buf + 2, rgb_buf + 1);

-    rgb_buf[0] = 255;

-    YuvPixel(src_y[1], src_u[0], src_v[0],

-             rgb_buf + 7, rgb_buf + 6, rgb_buf + 5);

-    rgb_buf[4] = 255;

-    src_y += 2;

-    src_u += 1;

-    src_v += 1;

-    rgb_buf += 8;  // Advance 2 pixels.

-  }

-  if (width & 1) {

-    YuvPixel(src_y[0], src_u[0], src_v[0],

-             rgb_buf + 3, rgb_buf + 2, rgb_buf + 1);

-    rgb_buf[0] = 255;

-  }

-}

-void I422ToABGRRow_C(const uint8* src_y,

-                     const uint8* src_u,

-                     const uint8* src_v,

-                     uint8* rgb_buf,

-                     int width) {

-  int x;

-  for (x = 0; x < width - 1; x += 2) {

-    YuvPixel(src_y[0], src_u[0], src_v[0],

-             rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);

-    rgb_buf[3] = 255;

-    YuvPixel(src_y[1], src_u[0], src_v[0],

-             rgb_buf + 6, rgb_buf + 5, rgb_buf + 4);

-    rgb_buf[7] = 255;

-    src_y += 2;

-    src_u += 1;

-    src_v += 1;

-    rgb_buf += 8;  // Advance 2 pixels.

-  }

-  if (width & 1) {

-    YuvPixel(src_y[0], src_u[0], src_v[0],

-             rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);

-    rgb_buf[3] = 255;

-  }

-}

 void I422ToRGBARow_C(const uint8* src_y,

                      const uint8* src_u,

                      const uint8* src_v,

                      uint8* rgb_buf,

+                     const struct YuvConstants* yuvconstants,

                      int width) {

   int x;

   for (x = 0; x < width - 1; x += 2) {

     YuvPixel(src_y[0], src_u[0], src_v[0],

-             rgb_buf + 1, rgb_buf + 2, rgb_buf + 3);

+             rgb_buf + 1, rgb_buf + 2, rgb_buf + 3, yuvconstants);

     rgb_buf[0] = 255;

     YuvPixel(src_y[1], src_u[0], src_v[0],

-             rgb_buf + 5, rgb_buf + 6, rgb_buf + 7);

+             rgb_buf + 5, rgb_buf + 6, rgb_buf + 7, yuvconstants);

     rgb_buf[4] = 255;

     src_y += 2;

     src_u += 1;

@@ -1607,7 +1720,7 @@

   if (width & 1) {

     YuvPixel(src_y[0], src_u[0], src_v[0],

-             rgb_buf + 1, rgb_buf + 2, rgb_buf + 3);

+             rgb_buf + 1, rgb_buf + 2, rgb_buf + 3, yuvconstants);

     rgb_buf[0] = 255;

@@ -1859,6 +1972,25 @@

 #undef BLEND

+// Blends one 8-bit sample: weights f by a and b by (255 - a), adds 255 and
+// shifts right by 8. Every argument and the whole expansion are parenthesized
+// so the macro also expands correctly inside a larger expression.
+#define UBLEND(f, b, a) ((((a) * (f)) + ((255 - (a)) * (b)) + 255) >> 8)
+// Blends a row of two planes into dst, weighting src0 by the per-sample
+// alpha plane and src1 by (255 - alpha). Two samples are handled per loop
+// pass; an odd width is finished with a single-sample tail.
+void BlendPlaneRow_C(const uint8* src0, const uint8* src1,
+                     const uint8* alpha, uint8* dst, int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    dst[0] = UBLEND(src0[0], src1[0], alpha[0]);
+    dst[1] = UBLEND(src0[1], src1[1], alpha[1]);
+    src0 += 2;
+    src1 += 2;
+    alpha += 2;
+    dst += 2;
+  }
+  // Odd width: one sample is left over.
+  if (width & 1) {
+    dst[0] = UBLEND(src0[0], src1[0], alpha[0]);
+  }
+}
+#undef UBLEND

 #define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24

 // Multiply source RGB by alpha and store to destination.

@@ -2015,18 +2147,18 @@

 // Blend 2 rows into 1.

-static void HalfRow_C(const uint8* src_uv, int src_uv_stride,

-                      uint8* dst_uv, int pix) {

+static void HalfRow_C(const uint8* src_uv, ptrdiff_t src_uv_stride,

+                      uint8* dst_uv, int width) {

   int x;

-  for (x = 0; x < pix; ++x) {

+  for (x = 0; x < width; ++x) {

     dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;

-static void HalfRow_16_C(const uint16* src_uv, int src_uv_stride,

-                         uint16* dst_uv, int pix) {

+static void HalfRow_16_C(const uint16* src_uv, ptrdiff_t src_uv_stride,

+                         uint16* dst_uv, int width) {

   int x;

-  for (x = 0; x < pix; ++x) {

+  for (x = 0; x < width; ++x) {

     dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;

@@ -2035,27 +2167,30 @@

 void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,

                       ptrdiff_t src_stride,

                       int width, int source_y_fraction) {

-  int y1_fraction = source_y_fraction;

+  int y1_fraction = source_y_fraction ;

   int y0_fraction = 256 - y1_fraction;

   const uint8* src_ptr1 = src_ptr + src_stride;

   int x;

-  if (source_y_fraction == 0) {

+  if (y1_fraction == 0) {

     memcpy(dst_ptr, src_ptr, width);

     return;

-  if (source_y_fraction == 128) {

-    HalfRow_C(src_ptr, (int)(src_stride), dst_ptr, width);

+  if (y1_fraction == 128) {

+    HalfRow_C(src_ptr, src_stride, dst_ptr, width);

     return;

   for (x = 0; x < width - 1; x += 2) {

-    dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;

-    dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;

+    dst_ptr[0] =

+        (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8;

+    dst_ptr[1] =

+        (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction + 128) >> 8;

     src_ptr += 2;

     src_ptr1 += 2;

     dst_ptr += 2;

   if (width & 1) {

-    dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;

+    dst_ptr[0] =

+        (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8;

@@ -2071,7 +2206,7 @@

     return;

   if (source_y_fraction == 128) {

-    HalfRow_16_C(src_ptr, (int)(src_stride), dst_ptr, width);

+    HalfRow_16_C(src_ptr, src_stride, dst_ptr, width);

     return;

   for (x = 0; x < width - 1; x += 2) {

@@ -2088,7 +2223,7 @@

 // Use first 4 shuffler values to reorder ARGB channels.

 void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,

-                      const uint8* shuffler, int pix) {

+                      const uint8* shuffler, int width) {

   int index0 = shuffler[0];

   int index1 = shuffler[1];

   int index2 = shuffler[2];

@@ -2095,7 +2230,7 @@

   int index3 = shuffler[3];

   // Shuffle a row of ARGB.

   int x;

-  for (x = 0; x < pix; ++x) {

+  for (x = 0; x < width; ++x) {

     // To support in-place conversion.

     uint8 b = src_argb[index0];

     uint8 g = src_argb[index1];

@@ -2156,10 +2291,126 @@

+// Applies a cubic polynomial to every channel of each 4-byte pixel.
+// poly holds 16 floats: entries 0-3 are the constant terms for the four
+// channels, 4-7 the linear, 8-11 the quadratic and 12-15 the cubic
+// coefficients. Each result is converted to int32 and passed through Clamp.
+void ARGBPolynomialRow_C(const uint8* src_argb,
+                         uint8* dst_argb,
+                         const float* poly,
+                         int width) {
+  int pixel;
+  for (pixel = 0; pixel < width; ++pixel) {
+    float value[4];
+    float result[4];
+    int c;
+    // Load all four channels before storing any result.
+    for (c = 0; c < 4; ++c) {
+      value[c] = (float)(src_argb[c]);
+    }
+    // Accumulate constant + linear, then quadratic, then cubic term.
+    for (c = 0; c < 4; ++c) {
+      const float v = value[c];
+      const float v2 = v * v;
+      const float v3 = v2 * v;
+      float sum = poly[c] + poly[c + 4] * v;
+      sum += poly[c + 8] * v2;
+      sum += poly[c + 12] * v3;
+      result[c] = sum;
+    }
+    for (c = 0; c < 4; ++c) {
+      dst_argb[c] = Clamp((int32)(result[c]));
+    }
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}

+// Remaps the first three bytes of every 4-byte pixel through a 256-entry
+// table chosen by that pixel's weighted sum; the fourth byte is copied as is.
+// lumacoeff packs the three 8-bit weights in bits 0-7, 8-15 and 16-23.
+void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
+                             const uint8* luma, uint32 lumacoeff) {
+  const uint32 weight_b = lumacoeff & 0xff;
+  const uint32 weight_g = (lumacoeff >> 8) & 0xff;
+  const uint32 weight_r = (lumacoeff >> 16) & 0xff;
+  int remaining;
+  // Two pixels per pass, processed one after the other.
+  for (remaining = width; remaining > 1; remaining -= 2) {
+    int p;
+    for (p = 0; p < 8; p += 4) {
+      // The masked weighted sum is a multiple of 256: it selects a table row.
+      const uint8* table = luma + ((src_argb[p + 0] * weight_b +
+                                    src_argb[p + 1] * weight_g +
+                                    src_argb[p + 2] * weight_r) & 0x7F00u);
+      dst_argb[p + 0] = table[src_argb[p + 0]];
+      dst_argb[p + 1] = table[src_argb[p + 1]];
+      dst_argb[p + 2] = table[src_argb[p + 2]];
+      dst_argb[p + 3] = src_argb[p + 3];
+    }
+    src_argb += 8;
+    dst_argb += 8;
+  }
+  // Odd width: one pixel is left over.
+  if (width & 1) {
+    const uint8* table = luma + ((src_argb[0] * weight_b +
+                                  src_argb[1] * weight_g +
+                                  src_argb[2] * weight_r) & 0x7F00u);
+    dst_argb[0] = table[src_argb[0]];
+    dst_argb[1] = table[src_argb[1]];
+    dst_argb[2] = table[src_argb[2]];
+    dst_argb[3] = src_argb[3];
+  }
+}

+// Copies byte 3 of every 4-byte source pixel into byte 3 of the matching
+// destination pixel; the other destination bytes are left untouched.
+void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width) {
+  int remaining;
+  // Two pixels (8 bytes) per pass.
+  for (remaining = width; remaining > 1; remaining -= 2) {
+    dst[3] = src[3];
+    dst[7] = src[7];
+    src += 8;
+    dst += 8;
+  }
+  // Odd width: one pixel is left over.
+  if (width & 1) {
+    dst[3] = src[3];
+  }
+}

+// Gathers byte 3 of every 4-byte source pixel into a packed one-byte-per-pixel
+// destination row.
+void ARGBExtractAlphaRow_C(const uint8* src_argb, uint8* dst_a, int width) {
+  int remaining;
+  // Two pixels per pass: 8 source bytes in, 2 destination bytes out.
+  for (remaining = width; remaining > 1; remaining -= 2) {
+    dst_a[0] = src_argb[3];
+    dst_a[1] = src_argb[7];
+    src_argb += 8;
+    dst_a += 2;
+  }
+  // Odd width: one pixel is left over.
+  if (width & 1) {
+    dst_a[0] = src_argb[3];
+  }
+}

+// Writes each byte of a packed one-byte-per-pixel source row into byte 3 of
+// the matching 4-byte destination pixel; other destination bytes are kept.
+void ARGBCopyYToAlphaRow_C(const uint8* src, uint8* dst, int width) {
+  int remaining;
+  // Two pixels per pass: 2 source bytes in, 8 destination bytes advanced.
+  for (remaining = width; remaining > 1; remaining -= 2) {
+    dst[3] = src[0];
+    dst[7] = src[1];
+    src += 2;
+    dst += 8;
+  }
+  // Odd width: one pixel is left over.
+  if (width & 1) {
+    dst[3] = src[0];
+  }
+}

 // Maximum temporary width for wrappers to process at a time, in pixels.

 #define MAXTWIDTH 2048

-#if !(defined(_MSC_VER) && !defined(__clang__)) && \

+#if !(defined(_MSC_VER) && defined(_M_IX86)) && \

     defined(HAS_I422TORGB565ROW_SSSE3)

 // row_win.cc has asm version, but GCC uses 2 step wrapper.

 void I422ToRGB565Row_SSSE3(const uint8* src_y,

@@ -2166,11 +2417,12 @@

                            const uint8* src_u,

                            const uint8* src_v,

                            uint8* dst_rgb565,

+                           const struct YuvConstants* yuvconstants,

                            int width) {

   SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);

   while (width > 0) {

     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;

-    I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth);

+    I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth);

     ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);

     src_y += twidth;

     src_u += twidth / 2;

@@ -2186,12 +2438,13 @@

                              const uint8* src_u,

                              const uint8* src_v,

                              uint8* dst_argb1555,

+                             const struct YuvConstants* yuvconstants,

                              int width) {

   // Row buffer for intermediate ARGB pixels.

   SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);

   while (width > 0) {

     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;

-    I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth);

+    I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth);

     ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth);

     src_y += twidth;

     src_u += twidth / 2;

@@ -2207,12 +2460,13 @@

                              const uint8* src_u,

                              const uint8* src_v,

                              uint8* dst_argb4444,

+                             const struct YuvConstants* yuvconstants,

                              int width) {

   // Row buffer for intermediate ARGB pixels.

   SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);

   while (width > 0) {

     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;

-    I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth);

+    I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth);

     ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth);

     src_y += twidth;

     src_u += twidth / 2;

@@ -2224,13 +2478,16 @@

 #endif

 #if defined(HAS_NV12TORGB565ROW_SSSE3)

-void NV12ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_uv,

-                           uint8* dst_rgb565, int width) {

+void NV12ToRGB565Row_SSSE3(const uint8* src_y,

+                           const uint8* src_uv,

+                           uint8* dst_rgb565,

+                           const struct YuvConstants* yuvconstants,

+                           int width) {

   // Row buffer for intermediate ARGB pixels.

   SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);

   while (width > 0) {

     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;

-    NV12ToARGBRow_SSSE3(src_y, src_uv, row, twidth);

+    NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, twidth);

     ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);

     src_y += twidth;

     src_uv += twidth;

@@ -2240,70 +2497,22 @@

 #endif

-#if defined(HAS_NV21TORGB565ROW_SSSE3)

-void NV21ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_vu,

-                           uint8* dst_rgb565, int width) {

-  // Row buffer for intermediate ARGB pixels.

-  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);

-  while (width > 0) {

-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;

-    NV21ToARGBRow_SSSE3(src_y, src_vu, row, twidth);

-    ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);

-    src_y += twidth;

-    src_vu += twidth;

-    dst_rgb565 += twidth * 2;

-    width -= twidth;

-  }

-}

-#endif

-#if defined(HAS_YUY2TOARGBROW_SSSE3)

-void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, uint8* dst_argb, int width) {

-  // Row buffers for intermediate YUV pixels.

-  SIMD_ALIGNED(uint8 row_y[MAXTWIDTH]);

-  SIMD_ALIGNED(uint8 row_u[MAXTWIDTH / 2]);

-  SIMD_ALIGNED(uint8 row_v[MAXTWIDTH / 2]);

-  while (width > 0) {

-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;

-    YUY2ToUV422Row_SSE2(src_yuy2, row_u, row_v, twidth);

-    YUY2ToYRow_SSE2(src_yuy2, row_y, twidth);

-    I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, twidth);

-    src_yuy2 += twidth * 2;

-    dst_argb += twidth * 4;

-    width -= twidth;

-  }

-}

-#endif

-#if defined(HAS_UYVYTOARGBROW_SSSE3)

-void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, uint8* dst_argb, int width) {

-  // Row buffers for intermediate YUV pixels.

-  SIMD_ALIGNED(uint8 row_y[MAXTWIDTH]);

-  SIMD_ALIGNED(uint8 row_u[MAXTWIDTH / 2]);

-  SIMD_ALIGNED(uint8 row_v[MAXTWIDTH / 2]);

-  while (width > 0) {

-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;

-    UYVYToUV422Row_SSE2(src_uyvy, row_u, row_v, twidth);

-    UYVYToYRow_SSE2(src_uyvy, row_y, twidth);

-    I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, twidth);

-    src_uyvy += twidth * 2;

-    dst_argb += twidth * 4;

-    width -= twidth;

-  }

-}

-#endif  // !defined(LIBYUV_DISABLE_X86)

 #if defined(HAS_I422TORGB565ROW_AVX2)

 void I422ToRGB565Row_AVX2(const uint8* src_y,

                           const uint8* src_u,

                           const uint8* src_v,

                           uint8* dst_rgb565,

+                          const struct YuvConstants* yuvconstants,

                           int width) {

   SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);

   while (width > 0) {

     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;

-    I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth);

+    I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);

+#if defined(HAS_ARGBTORGB565ROW_AVX2)

     ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth);

+#else

+    ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);

+#endif

     src_y += twidth;

     src_u += twidth / 2;

     src_v += twidth / 2;

@@ -2318,13 +2527,18 @@

                             const uint8* src_u,

                             const uint8* src_v,

                             uint8* dst_argb1555,

+                            const struct YuvConstants* yuvconstants,

                             int width) {

   // Row buffer for intermediate ARGB pixels.

   SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);

   while (width > 0) {

     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;

-    I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth);

+    I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);

+#if defined(HAS_ARGBTOARGB1555ROW_AVX2)

     ARGBToARGB1555Row_AVX2(row, dst_argb1555, twidth);

+#else

+    ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth);

+#endif

     src_y += twidth;

     src_u += twidth / 2;

     src_v += twidth / 2;

@@ -2339,13 +2553,18 @@

                             const uint8* src_u,

                             const uint8* src_v,

                             uint8* dst_argb4444,

+                            const struct YuvConstants* yuvconstants,

                             int width) {

   // Row buffer for intermediate ARGB pixels.

   SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);

   while (width > 0) {

     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;

-    I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth);

+    I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);

+#if defined(HAS_ARGBTOARGB4444ROW_AVX2)

     ARGBToARGB4444Row_AVX2(row, dst_argb4444, twidth);

+#else

+    ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth);

+#endif

     src_y += twidth;

     src_u += twidth / 2;

     src_v += twidth / 2;

@@ -2360,12 +2579,13 @@

                             const uint8* src_u,

                             const uint8* src_v,

                             uint8* dst_rgb24,

+                            const struct YuvConstants* yuvconstants,

                             int width) {

   // Row buffer for intermediate ARGB pixels.

   SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);

   while (width > 0) {

     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;

-    I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth);

+    I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);

     // TODO(fbarchard): ARGBToRGB24Row_AVX2

     ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);

     src_y += twidth;

@@ -2377,37 +2597,22 @@

 #endif

-#if defined(HAS_I422TORAWROW_AVX2)

-void I422ToRAWRow_AVX2(const uint8* src_y,

-                            const uint8* src_u,

-                            const uint8* src_v,

-                            uint8* dst_raw,

-                            int width) {

-  // Row buffer for intermediate ARGB pixels.

-  SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);

-  while (width > 0) {

-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;

-    I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth);

-    // TODO(fbarchard): ARGBToRAWRow_AVX2

-    ARGBToRAWRow_SSSE3(row, dst_raw, twidth);

-    src_y += twidth;

-    src_u += twidth / 2;

-    src_v += twidth / 2;

-    dst_raw += twidth * 3;

-    width -= twidth;

-  }

-}

-#endif

 #if defined(HAS_NV12TORGB565ROW_AVX2)

-void NV12ToRGB565Row_AVX2(const uint8* src_y, const uint8* src_uv,

-                          uint8* dst_rgb565, int width) {

+void NV12ToRGB565Row_AVX2(const uint8* src_y,

+                          const uint8* src_uv,

+                          uint8* dst_rgb565,

+                          const struct YuvConstants* yuvconstants,

+                          int width) {

   // Row buffer for intermediate ARGB pixels.

   SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);

   while (width > 0) {

     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;

-    NV12ToARGBRow_AVX2(src_y, src_uv, row, twidth);

+    NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth);

+#if defined(HAS_ARGBTORGB565ROW_AVX2)

     ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth);

+#else

+    ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);

+#endif

     src_y += twidth;

     src_uv += twidth;

     dst_rgb565 += twidth * 2;

@@ -2415,160 +2620,6 @@

 #endif

-#if defined(HAS_NV21TORGB565ROW_AVX2)

-void NV21ToRGB565Row_AVX2(const uint8* src_y, const uint8* src_vu,

-                          uint8* dst_rgb565, int width) {

-  // Row buffer for intermediate ARGB pixels.

-  SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);

-  while (width > 0) {

-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;

-    NV21ToARGBRow_AVX2(src_y, src_vu, row, twidth);

-    ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth);

-    src_y += twidth;

-    src_vu += twidth;

-    dst_rgb565 += twidth * 2;

-    width -= twidth;

-  }

-}

-#endif

-#if defined(HAS_YUY2TOARGBROW_AVX2)

-void YUY2ToARGBRow_AVX2(const uint8* src_yuy2, uint8* dst_argb, int width) {

-  // Row buffers for intermediate YUV pixels.

-  SIMD_ALIGNED32(uint8 row_y[MAXTWIDTH]);

-  SIMD_ALIGNED32(uint8 row_u[MAXTWIDTH / 2]);

-  SIMD_ALIGNED32(uint8 row_v[MAXTWIDTH / 2]);

-  while (width > 0) {

-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;

-    YUY2ToUV422Row_AVX2(src_yuy2, row_u, row_v, twidth);

-    YUY2ToYRow_AVX2(src_yuy2, row_y, twidth);

-    I422ToARGBRow_AVX2(row_y, row_u, row_v, dst_argb, twidth);

-    src_yuy2 += twidth * 2;

-    dst_argb += twidth * 4;

-    width -= twidth;

-  }

-}

-#endif

-#if defined(HAS_UYVYTOARGBROW_AVX2)

-void UYVYToARGBRow_AVX2(const uint8* src_uyvy, uint8* dst_argb, int width) {

-  // Row buffers for intermediate YUV pixels.

-  SIMD_ALIGNED32(uint8 row_y[MAXTWIDTH]);

-  SIMD_ALIGNED32(uint8 row_u[MAXTWIDTH / 2]);

-  SIMD_ALIGNED32(uint8 row_v[MAXTWIDTH / 2]);

-  while (width > 0) {

-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;

-    UYVYToUV422Row_AVX2(src_uyvy, row_u, row_v, twidth);

-    UYVYToYRow_AVX2(src_uyvy, row_y, twidth);

-    I422ToARGBRow_AVX2(row_y, row_u, row_v, dst_argb, twidth);

-    src_uyvy += twidth * 2;

-    dst_argb += twidth * 4;

-    width -= twidth;

-  }

-}

-#endif  // !defined(LIBYUV_DISABLE_X86)

-void ARGBPolynomialRow_C(const uint8* src_argb,

-                         uint8* dst_argb, const float* poly,

-                         int width) {

-  int i;

-  for (i = 0; i < width; ++i) {

-    float b = (float)(src_argb[0]);

-    float g = (float)(src_argb[1]);

-    float r = (float)(src_argb[2]);

-    float a = (float)(src_argb[3]);

-    float b2 = b * b;

-    float g2 = g * g;

-    float r2 = r * r;

-    float a2 = a * a;

-    float db = poly[0] + poly[4] * b;

-    float dg = poly[1] + poly[5] * g;

-    float dr = poly[2] + poly[6] * r;

-    float da = poly[3] + poly[7] * a;

-    float b3 = b2 * b;

-    float g3 = g2 * g;

-    float r3 = r2 * r;

-    float a3 = a2 * a;

-    db += poly[8] * b2;

-    dg += poly[9] * g2;

-    dr += poly[10] * r2;

-    da += poly[11] * a2;

-    db += poly[12] * b3;

-    dg += poly[13] * g3;

-    dr += poly[14] * r3;

-    da += poly[15] * a3;

-    dst_argb[0] = Clamp((int32)(db));

-    dst_argb[1] = Clamp((int32)(dg));

-    dst_argb[2] = Clamp((int32)(dr));

-    dst_argb[3] = Clamp((int32)(da));

-    src_argb += 4;

-    dst_argb += 4;

-  }

-}

-void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,

-                             const uint8* luma, uint32 lumacoeff) {

-  uint32 bc = lumacoeff & 0xff;

-  uint32 gc = (lumacoeff >> 8) & 0xff;

-  uint32 rc = (lumacoeff >> 16) & 0xff;

-  int i;

-  for (i = 0; i < width - 1; i += 2) {

-    // Luminance in rows, color values in columns.

-    const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +

-                           src_argb[2] * rc) & 0x7F00u) + luma;

-    const uint8* luma1;

-    dst_argb[0] = luma0[src_argb[0]];

-    dst_argb[1] = luma0[src_argb[1]];

-    dst_argb[2] = luma0[src_argb[2]];

-    dst_argb[3] = src_argb[3];

-    luma1 = ((src_argb[4] * bc + src_argb[5] * gc +

-              src_argb[6] * rc) & 0x7F00u) + luma;

-    dst_argb[4] = luma1[src_argb[4]];

-    dst_argb[5] = luma1[src_argb[5]];

-    dst_argb[6] = luma1[src_argb[6]];

-    dst_argb[7] = src_argb[7];

-    src_argb += 8;

-    dst_argb += 8;

-  }

-  if (width & 1) {

-    // Luminance in rows, color values in columns.

-    const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +

-                           src_argb[2] * rc) & 0x7F00u) + luma;

-    dst_argb[0] = luma0[src_argb[0]];

-    dst_argb[1] = luma0[src_argb[1]];

-    dst_argb[2] = luma0[src_argb[2]];

-    dst_argb[3] = src_argb[3];

-  }

-}

-void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width) {

-  int i;

-  for (i = 0; i < width - 1; i += 2) {

-    dst[3] = src[3];

-    dst[7] = src[7];

-    dst += 8;

-    src += 8;

-  }

-  if (width & 1) {

-    dst[3] = src[3];

-  }

-}

-void ARGBCopyYToAlphaRow_C(const uint8* src, uint8* dst, int width) {

-  int i;

-  for (i = 0; i < width - 1; i += 2) {

-    dst[3] = src[0];

-    dst[7] = src[1];

-    dst += 8;

-    src += 2;

-  }

-  if (width & 1) {

-    dst[3] = src[0];

-  }

-}

 #ifdef __cplusplus

 }  // extern "C"

--- a/third_party/libyuv/source/row_gcc.cc

+++ b/third_party/libyuv/source/row_gcc.cc

@@ -17,7 +17,8 @@

 #endif

 // This module is for GCC x86 and x64.

-#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))

+#if !defined(LIBYUV_DISABLE_X86) && \

+    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))

 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)

@@ -120,6 +121,24 @@

   2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u

};

+// Shuffle table for converting RAW to RGB24: first 8 output bytes.

+static const uvec8 kShuffleMaskRAWToRGB24_0 = {

+  2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,

+  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u

+};

+// Shuffle table for converting RAW to RGB24: middle 8 output bytes.

+static const uvec8 kShuffleMaskRAWToRGB24_1 = {

+  2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,

+  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u

+};

+// Shuffle table for converting RAW to RGB24: last 8 output bytes.

+static const uvec8 kShuffleMaskRAWToRGB24_2 = {

+  8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,

+  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u

+};

 // Shuffle table for converting ARGB to RGB24.

 static uvec8 kShuffleMaskARGBToRGB24 = {

   0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u

@@ -135,109 +154,39 @@

   0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u

};

-// Shuffle table for converting ARGB to RAW.

-static uvec8 kShuffleMaskARGBToRAW_0 = {

-  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u

+// YUY2 shuf 16 Y to 32 Y.

+static const lvec8 kShuffleYUY2Y = {

+  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,

+  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14

};

-#endif  // HAS_RGB24TOARGBROW_SSSE3

-#if defined(TESTING) && defined(__x86_64__)

-void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {

-  asm volatile (

-    ".p2align  5                               \n"

-    "mov       %%eax,%%eax                     \n"

-    "mov       %%ebx,%%ebx                     \n"

-    "mov       %%ecx,%%ecx                     \n"

-    "mov       %%edx,%%edx                     \n"

-    "mov       %%esi,%%esi                     \n"

-    "mov       %%edi,%%edi                     \n"

-    "mov       %%ebp,%%ebp                     \n"

-    "mov       %%esp,%%esp                     \n"

-    ".p2align  5                               \n"

-    "mov       %%r8d,%%r8d                     \n"

-    "mov       %%r9d,%%r9d                     \n"

-    "mov       %%r10d,%%r10d                   \n"

-    "mov       %%r11d,%%r11d                   \n"

-    "mov       %%r12d,%%r12d                   \n"

-    "mov       %%r13d,%%r13d                   \n"

-    "mov       %%r14d,%%r14d                   \n"

-    "mov       %%r15d,%%r15d                   \n"

-    ".p2align  5                               \n"

-    "lea       (%%rax),%%eax                   \n"

-    "lea       (%%rbx),%%ebx                   \n"

-    "lea       (%%rcx),%%ecx                   \n"

-    "lea       (%%rdx),%%edx                   \n"

-    "lea       (%%rsi),%%esi                   \n"

-    "lea       (%%rdi),%%edi                   \n"

-    "lea       (%%rbp),%%ebp                   \n"

-    "lea       (%%rsp),%%esp                   \n"

-    ".p2align  5                               \n"

-    "lea       (%%r8),%%r8d                    \n"

-    "lea       (%%r9),%%r9d                    \n"

-    "lea       (%%r10),%%r10d                  \n"

-    "lea       (%%r11),%%r11d                  \n"

-    "lea       (%%r12),%%r12d                  \n"

-    "lea       (%%r13),%%r13d                  \n"

-    "lea       (%%r14),%%r14d                  \n"

-    "lea       (%%r15),%%r15d                  \n"

+// YUY2 shuf 8 UV to 16 UV.

+static const lvec8 kShuffleYUY2UV = {

+  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15,

+  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15

+};

-    ".p2align  5                               \n"

-    "lea       0x10(%%rax),%%eax               \n"

-    "lea       0x10(%%rbx),%%ebx               \n"

-    "lea       0x10(%%rcx),%%ecx               \n"

-    "lea       0x10(%%rdx),%%edx               \n"

-    "lea       0x10(%%rsi),%%esi               \n"

-    "lea       0x10(%%rdi),%%edi               \n"

-    "lea       0x10(%%rbp),%%ebp               \n"

-    "lea       0x10(%%rsp),%%esp               \n"

-    ".p2align  5                               \n"

-    "lea       0x10(%%r8),%%r8d                \n"

-    "lea       0x10(%%r9),%%r9d                \n"

-    "lea       0x10(%%r10),%%r10d              \n"

-    "lea       0x10(%%r11),%%r11d              \n"

-    "lea       0x10(%%r12),%%r12d              \n"

-    "lea       0x10(%%r13),%%r13d              \n"

-    "lea       0x10(%%r14),%%r14d              \n"

-    "lea       0x10(%%r15),%%r15d              \n"

+// UYVY shuf 16 Y to 32 Y.

+static const lvec8 kShuffleUYVYY = {

+  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15,

+  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15

+};

-    ".p2align  5                               \n"

-    "add       0x10,%%eax                      \n"

-    "add       0x10,%%ebx                      \n"

-    "add       0x10,%%ecx                      \n"

-    "add       0x10,%%edx                      \n"

-    "add       0x10,%%esi                      \n"

-    "add       0x10,%%edi                      \n"

-    "add       0x10,%%ebp                      \n"

-    "add       0x10,%%esp                      \n"

-    ".p2align  5                               \n"

-    "add       0x10,%%r8d                      \n"

-    "add       0x10,%%r9d                      \n"

-    "add       0x10,%%r10d                     \n"

-    "add       0x10,%%r11d                     \n"

-    "add       0x10,%%r12d                     \n"

-    "add       0x10,%%r13d                     \n"

-    "add       0x10,%%r14d                     \n"

-    "add       0x10,%%r15d                     \n"

+// UYVY shuf 8 UV to 16 UV.

+static const lvec8 kShuffleUYVYUV = {

+  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,

+  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14

+};

-    ".p2align  2                               \n"

-  "1:                                          \n"

-    "movq      " MEMACCESS(0) ",%%xmm0         \n"

-    "lea       " MEMLEA(0x8,0) ",%0            \n"

-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"

-    "lea       " MEMLEA(0x20,1) ",%1           \n"

-    "sub       $0x8,%2                         \n"

-    "jg        1b                              \n"

-  : "+r"(src_y),     // %0

-    "+r"(dst_argb),  // %1

-    "+r"(pix)        // %2

-  :

-  : "memory", "cc", "xmm0", "xmm1", "xmm5"

-  );

-}

-#endif  // TESTING

+// NV21 shuf 8 VU to 16 UV.

+static const lvec8 kShuffleNV21 = {

+  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,

+  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,

+};

+#endif  // HAS_RGB24TOARGBROW_SSSE3

 #ifdef HAS_J400TOARGBROW_SSE2

-void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {

+void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {

   asm volatile (

     "pcmpeqb   %%xmm5,%%xmm5                   \n"

     "pslld     $0x18,%%xmm5                    \n"

@@ -258,7 +207,7 @@

     "jg        1b                              \n"

   : "+r"(src_y),     // %0

     "+r"(dst_argb),  // %1

-    "+r"(pix)        // %2

+    "+r"(width)        // %2

   :: "memory", "cc", "xmm0", "xmm1", "xmm5"

);

@@ -265,7 +214,7 @@

 #endif  // HAS_J400TOARGBROW_SSE2

 #ifdef HAS_RGB24TOARGBROW_SSSE3

-void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {

+void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {

   asm volatile (

     "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000

     "pslld     $0x18,%%xmm5                    \n"

@@ -297,13 +246,13 @@

     "jg        1b                              \n"

   : "+r"(src_rgb24),  // %0

     "+r"(dst_argb),  // %1

-    "+r"(pix)        // %2

+    "+r"(width)        // %2

   : "m"(kShuffleMaskRGB24ToARGB)  // %3

   : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

);

-void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {

+void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width) {

   asm volatile (

     "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000

     "pslld     $0x18,%%xmm5                    \n"

@@ -335,14 +284,44 @@

     "jg        1b                              \n"

   : "+r"(src_raw),   // %0

     "+r"(dst_argb),  // %1

-    "+r"(pix)        // %2

+    "+r"(width)        // %2

   : "m"(kShuffleMaskRAWToARGB)  // %3

   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

);

-void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {

+void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {  // Reverses the byte order inside each 3-byte pixel, 8 pixels (24 bytes) per pass.

   asm volatile (

+   "movdqa     %3,%%xmm3                       \n"  // shuffle mask producing output bytes 0-7

+   "movdqa     %4,%%xmm4                       \n"  // shuffle mask producing output bytes 8-15

+   "movdqa     %5,%%xmm5                       \n"  // shuffle mask producing output bytes 16-23

+    LABELALIGN

+  "1:                                          \n"

+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // three overlapping unaligned 16-byte loads from src_raw;

+    "movdqu    " MEMACCESS2(0x4,0) ",%%xmm1    \n"  // NOTE(review): MEMACCESS2(d,0) presumably expands to d(%0) - macro is defined elsewhere,

+    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm2    \n"  // in which case the loads cover source bytes 0-23 and nothing beyond

+    "lea       " MEMLEA(0x18,0) ",%0           \n"  // src_raw += 24, assuming MEMLEA(d,r) is d(%r)

+    "pshufb    %%xmm3,%%xmm0                   \n"

+    "pshufb    %%xmm4,%%xmm1                   \n"

+    "pshufb    %%xmm5,%%xmm2                   \n"

+    "movq      %%xmm0," MEMACCESS(1) "         \n"  // only the low 8 bytes of each shuffled register are stored

+    "movq      %%xmm1," MEMACCESS2(0x8,1) "    \n"

+    "movq      %%xmm2," MEMACCESS2(0x10,1) "   \n"

+    "lea       " MEMLEA(0x18,1) ",%1           \n"  // dst_rgb24 += 24, same assumption as above

+    "sub       $0x8,%2                         \n"  // width -= 8

+    "jg        1b                              \n"  // the check follows the body: one pass always runs, and a width that is not a multiple of 8 is rounded up

+  : "+r"(src_raw),    // %0

+    "+r"(dst_rgb24),  // %1

+    "+r"(width)       // %2

+  : "m"(kShuffleMaskRAWToRGB24_0),  // %3

+    "m"(kShuffleMaskRAWToRGB24_1),  // %4

+    "m"(kShuffleMaskRAWToRGB24_2)   // %5

+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

+  );

+}

+void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {

+  asm volatile (

     "mov       $0x1080108,%%eax                \n"

     "movd      %%eax,%%xmm5                    \n"

     "pshufd    $0x0,%%xmm5,%%xmm5              \n"

@@ -382,7 +361,7 @@

     "jg        1b                              \n"

   : "+r"(src),  // %0

     "+r"(dst),  // %1

-    "+r"(pix)   // %2

+    "+r"(width)   // %2

   : "memory", "cc", "eax", NACL_R14

     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"

@@ -389,7 +368,7 @@

);

-void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {

+void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {

   asm volatile (

     "mov       $0x1080108,%%eax                \n"

     "movd      %%eax,%%xmm5                    \n"

@@ -433,7 +412,7 @@

     "jg        1b                              \n"

   : "+r"(src),  // %0

     "+r"(dst),  // %1

-    "+r"(pix)   // %2

+    "+r"(width)   // %2

   : "memory", "cc", "eax", NACL_R14

     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"

@@ -440,7 +419,7 @@

);

-void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {

+void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {

   asm volatile (

     "mov       $0xf0f0f0f,%%eax                \n"

     "movd      %%eax,%%xmm4                    \n"

@@ -471,7 +450,7 @@

     "jg        1b                              \n"

   : "+r"(src),  // %0

     "+r"(dst),  // %1

-    "+r"(pix)   // %2

+    "+r"(width)   // %2

   : "memory", "cc", "eax", NACL_R14

     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

@@ -478,7 +457,7 @@

);

-void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {

+void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int width) {

   asm volatile (

     "movdqa    %3,%%xmm6                       \n"

     LABELALIGN

@@ -510,13 +489,13 @@

     "jg        1b                              \n"

   : "+r"(src),  // %0

     "+r"(dst),  // %1

-    "+r"(pix)   // %2

+    "+r"(width)   // %2

   : "m"(kShuffleMaskARGBToRGB24)  // %3

   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"

);

-void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {

+void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int width) {

   asm volatile (

     "movdqa    %3,%%xmm6                       \n"

     LABELALIGN

@@ -548,13 +527,13 @@

     "jg        1b                              \n"

   : "+r"(src),  // %0

     "+r"(dst),  // %1

-    "+r"(pix)   // %2

+    "+r"(width)   // %2

   : "m"(kShuffleMaskARGBToRAW)  // %3

   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"

);

-void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {

+void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int width) {

   asm volatile (

     "pcmpeqb   %%xmm3,%%xmm3                   \n"

     "psrld     $0x1b,%%xmm3                    \n"

@@ -585,13 +564,105 @@

     "jg        1b                              \n"

   : "+r"(src),  // %0

     "+r"(dst),  // %1

-    "+r"(pix)   // %2

+    "+r"(width)   // %2

   :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

);

-void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {

+void ARGBToRGB565DitherRow_SSE2(const uint8* src, uint8* dst,  // Packs 4-byte pixels into 16-bit 5:6:5 values, 4 pixels per pass,

+                                const uint32 dither4, int width) {  // after a saturating add of one dither4 byte per pixel.

   asm volatile (

+    "movd       %3,%%xmm6                      \n"  // the 4 dither bytes d0..d3

+    "punpcklbw  %%xmm6,%%xmm6                  \n"  // d0 d0 d1 d1 d2 d2 d3 d3

+    "movdqa     %%xmm6,%%xmm7                  \n"

+    "punpcklwd  %%xmm6,%%xmm6                  \n"  // d0 x4, d1 x4, d2 x4, d3 x4: pixel i gets d(i) on all four of its bytes

+    "punpckhwd  %%xmm7,%%xmm7                  \n"  // xmm7 is not read again in this function

+    "pcmpeqb    %%xmm3,%%xmm3                  \n"

+    "psrld      $0x1b,%%xmm3                   \n"  // xmm3 = 0x0000001f per dword (low 5-bit field)

+    "pcmpeqb    %%xmm4,%%xmm4                  \n"

+    "psrld      $0x1a,%%xmm4                   \n"

+    "pslld      $0x5,%%xmm4                    \n"  // xmm4 = 0x000007e0 per dword (middle 6-bit field)

+    "pcmpeqb    %%xmm5,%%xmm5                  \n"

+    "pslld      $0xb,%%xmm5                    \n"  // xmm5 = 0xfffff800 per dword (top 5-bit field plus the sign-extension bits)

+    LABELALIGN

+  "1:                                          \n"

+    "movdqu     (%0),%%xmm0                    \n"  // 4 source pixels (16 bytes), unaligned load

+    "paddusb    %%xmm6,%%xmm0                  \n"  // add dither with unsigned saturation, so bytes clamp at 255

+    "movdqa     %%xmm0,%%xmm1                  \n"

+    "movdqa     %%xmm0,%%xmm2                  \n"

+    "pslld      $0x8,%%xmm0                    \n"  // discard byte 3 of each pixel

+    "psrld      $0x3,%%xmm1                    \n"  // top 5 bits of byte 0 -> bits 0-4

+    "psrld      $0x5,%%xmm2                    \n"  // top 6 bits of byte 1 -> bits 5-10

+    "psrad      $0x10,%%xmm0                   \n"  // arithmetic shift: byte 2 lands in bits 8-15 and its top bit is copied upward

+    "pand       %%xmm3,%%xmm1                  \n"

+    "pand       %%xmm4,%%xmm2                  \n"

+    "pand       %%xmm5,%%xmm0                  \n"  // keep top 5 bits of byte 2 in bits 11-15, plus the sign extension

+    "por        %%xmm2,%%xmm1                  \n"

+    "por        %%xmm1,%%xmm0                  \n"

+    "packssdw   %%xmm0,%%xmm0                  \n"  // each dword is a sign-extended 16-bit value, so the signed pack does not saturate

+    "lea        0x10(%0),%0                    \n"  // src += 16

+    "movq       %%xmm0,(%1)                    \n"  // store 4 packed pixels (8 bytes)

+    "lea        0x8(%1),%1                     \n"  // dst += 8

+    "sub        $0x4,%2                        \n"  // width -= 4

+    "jg        1b                              \n"  // the check follows the body: one pass always runs, and width is rounded up to a multiple of 4

+  : "+r"(src),  // %0

+    "+r"(dst),  // %1

+    "+r"(width)   // %2

+  : "m"(dither4) // %3

+  : "memory", "cc",

+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"

+  );

+}

+#ifdef HAS_ARGBTORGB565DITHERROW_AVX2

+void ARGBToRGB565DitherRow_AVX2(const uint8* src, uint8* dst,  // Packs 4-byte pixels into 16-bit 5:6:5 values, 8 pixels per pass,

+                                const uint32 dither4, int width) {  // after a saturating add of one dither4 byte per pixel.

+  asm volatile (

+    "vbroadcastss %3,%%xmm6                    \n"  // d0 d1 d2 d3 repeated in all four dwords

+    "vpunpcklbw %%xmm6,%%xmm6,%%xmm6           \n"  // d0 d0 d1 d1 d2 d2 d3 d3, twice

+    "vpermq     $0xd8,%%ymm6,%%ymm6            \n"  // move the second copy into the low half of the upper 128-bit lane

+    "vpunpcklwd %%ymm6,%%ymm6,%%ymm6           \n"  // per lane: d0 x4, d1 x4, d2 x4, d3 x4 - pixels i and i + 4 both use d(i)

+    "vpcmpeqb   %%ymm3,%%ymm3,%%ymm3           \n"

+    "vpsrld     $0x1b,%%ymm3,%%ymm3            \n"  // ymm3 = 0x0000001f per dword (low 5-bit field)

+    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"

+    "vpsrld     $0x1a,%%ymm4,%%ymm4            \n"

+    "vpslld     $0x5,%%ymm4,%%ymm4             \n"  // ymm4 = 0x000007e0 per dword (middle 6-bit field)

+    "vpslld     $0xb,%%ymm3,%%ymm5             \n"  // ymm5 = 0x0000f800 per dword (top 5-bit field)

+    LABELALIGN

+  "1:                                          \n"

+    "vmovdqu    (%0),%%ymm0                    \n"  // 8 source pixels (32 bytes), unaligned load

+    "vpaddusb   %%ymm6,%%ymm0,%%ymm0           \n"  // add dither with unsigned saturation, so bytes clamp at 255

+    "vpsrld     $0x5,%%ymm0,%%ymm2             \n"  // top 6 bits of byte 1 -> bits 5-10

+    "vpsrld     $0x3,%%ymm0,%%ymm1             \n"  // top 5 bits of byte 0 -> bits 0-4

+    "vpsrld     $0x8,%%ymm0,%%ymm0             \n"  // top 5 bits of byte 2 -> bits 11-15

+    "vpand      %%ymm4,%%ymm2,%%ymm2           \n"

+    "vpand      %%ymm3,%%ymm1,%%ymm1           \n"

+    "vpand      %%ymm5,%%ymm0,%%ymm0           \n"

+    "vpor       %%ymm2,%%ymm1,%%ymm1           \n"

+    "vpor       %%ymm1,%%ymm0,%%ymm0           \n"

+    "vpackusdw  %%ymm0,%%ymm0,%%ymm0           \n"  // every dword is <= 0xffff after masking, so the unsigned pack does not saturate

+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"  // gather both lanes' packed results into the low 128 bits

+    "lea        0x20(%0),%0                    \n"  // src += 32

+    "vmovdqu    %%xmm0,(%1)                    \n"  // store 8 packed pixels (16 bytes)

+    "lea        0x10(%1),%1                    \n"  // dst += 16

+    "sub        $0x8,%2                        \n"  // width -= 8

+    "jg         1b                             \n"  // the check follows the body: one pass always runs, and width is rounded up to a multiple of 8

+    "vzeroupper                                \n"

+  : "+r"(src),  // %0

+    "+r"(dst),  // %1

+    "+r"(width)   // %2

+  : "m"(dither4) // %3

+  : "memory", "cc",

+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"  // xmm7 is listed although the asm above never names it

+  );

+}

+#endif  // HAS_ARGBTORGB565DITHERROW_AVX2

+void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int width) {

+  asm volatile (

     "pcmpeqb   %%xmm4,%%xmm4                   \n"

     "psrld     $0x1b,%%xmm4                    \n"

     "movdqa    %%xmm4,%%xmm5                   \n"

@@ -625,13 +696,13 @@

     "jg        1b                              \n"

   : "+r"(src),  // %0

     "+r"(dst),  // %1

-    "+r"(pix)   // %2

+    "+r"(width)   // %2

   :: "memory", "cc",

     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"

);

-void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {

+void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int width) {

   asm volatile (

     "pcmpeqb   %%xmm4,%%xmm4                   \n"

     "psllw     $0xc,%%xmm4                     \n"

@@ -654,7 +725,7 @@

     "jg        1b                              \n"

   : "+r"(src),  // %0

     "+r"(dst),  // %1

-    "+r"(pix)   // %2

+    "+r"(width)   // %2

   :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"

);

@@ -662,7 +733,7 @@

 #ifdef HAS_ARGBTOYROW_SSSE3

 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.

-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {

+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {

   asm volatile (

     "movdqa    %3,%%xmm4                       \n"

     "movdqa    %4,%%xmm5                       \n"

@@ -689,7 +760,7 @@

     "jg        1b                              \n"

   : "+r"(src_argb),  // %0

     "+r"(dst_y),     // %1

-    "+r"(pix)        // %2

+    "+r"(width)        // %2

   : "m"(kARGBToY),   // %3

     "m"(kAddY16)     // %4

   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

@@ -700,7 +771,7 @@

 #ifdef HAS_ARGBTOYJROW_SSSE3

 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.

 // Same as ARGBToYRow but different coefficients, no add 16, but do rounding.

-void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {

+void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {

   asm volatile (

     "movdqa    %3,%%xmm4                       \n"

     "movdqa    %4,%%xmm5                       \n"

@@ -728,7 +799,7 @@

     "jg        1b                              \n"

   : "+r"(src_argb),  // %0

     "+r"(dst_y),     // %1

-    "+r"(pix)        // %2

+    "+r"(width)        // %2

   : "m"(kARGBToYJ),  // %3

     "m"(kAddYJ64)    // %4

   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

@@ -743,7 +814,7 @@

};

 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.

-void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {

+void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {

   asm volatile (

     "vbroadcastf128 %3,%%ymm4                  \n"

     "vbroadcastf128 %4,%%ymm5                  \n"

@@ -773,7 +844,7 @@

     "vzeroupper                                \n"

   : "+r"(src_argb),  // %0

     "+r"(dst_y),     // %1

-    "+r"(pix)        // %2

+    "+r"(width)        // %2

   : "m"(kARGBToY),   // %3

     "m"(kAddY16),    // %4

     "m"(kPermdARGBToY_AVX)  // %5

@@ -784,7 +855,7 @@

 #ifdef HAS_ARGBTOYJROW_AVX2

 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.

-void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {

+void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {

   asm volatile (

     "vbroadcastf128 %3,%%ymm4                  \n"

     "vbroadcastf128 %4,%%ymm5                  \n"

@@ -815,7 +886,7 @@

     "vzeroupper                                \n"

   : "+r"(src_argb),  // %0

     "+r"(dst_y),     // %1

-    "+r"(pix)        // %2

+    "+r"(width)        // %2

   : "m"(kARGBToYJ),   // %3

     "m"(kAddYJ64),    // %4

     "m"(kPermdARGBToY_AVX)  // %5

@@ -952,6 +1023,67 @@

 #endif  // HAS_ARGBTOUVROW_AVX2

+#ifdef HAS_ARGBTOUVJROW_AVX2

+void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,  // Reads 32 pixels (128 bytes) from each of two rows per pass

+                       uint8* dst_u, uint8* dst_v, int width) {  // and writes 16 bytes to dst_u and 16 bytes to dst_v.

+  asm volatile (

+    "vbroadcastf128 %5,%%ymm5                  \n"  // kAddUVJ128: added to the 16-bit sums before the shift

+    "vbroadcastf128 %6,%%ymm6                  \n"  // kARGBToVJ coefficients (values defined elsewhere)

+    "vbroadcastf128 %7,%%ymm7                  \n"  // kARGBToUJ coefficients (values defined elsewhere)

+    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u, so V is addressed as (%1,%2,1)

+    LABELALIGN

+  "1:                                          \n"

+    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"  // 4 x 32 bytes of the first row

+    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"

+    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"

+    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"

+    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0

+    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)     // same at the other offsets: byte-average with the row src_stride_argb bytes further on

+    VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)

+    VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)

+    "lea       " MEMLEA(0x80,0) ",%0           \n"  // src_argb0 += 128, assuming MEMLEA(d,r) is d(%r)

+    "vshufps    $0x88,%%ymm1,%%ymm0,%%ymm4     \n"  // even-numbered dwords (pixels) of each lane

+    "vshufps    $0xdd,%%ymm1,%%ymm0,%%ymm0     \n"  // odd-numbered dwords (pixels) of each lane

+    "vpavgb     %%ymm4,%%ymm0,%%ymm0           \n"  // average horizontally adjacent pixels

+    "vshufps    $0x88,%%ymm3,%%ymm2,%%ymm4     \n"

+    "vshufps    $0xdd,%%ymm3,%%ymm2,%%ymm2     \n"

+    "vpavgb     %%ymm4,%%ymm2,%%ymm2           \n"

+    "vpmaddubsw %%ymm7,%%ymm0,%%ymm1           \n"  // ymm1/ymm3: products with kARGBToUJ

+    "vpmaddubsw %%ymm7,%%ymm2,%%ymm3           \n"

+    "vpmaddubsw %%ymm6,%%ymm0,%%ymm0           \n"  // ymm0/ymm2: products with kARGBToVJ

+    "vpmaddubsw %%ymm6,%%ymm2,%%ymm2           \n"

+    "vphaddw    %%ymm3,%%ymm1,%%ymm1           \n"  // finish the per-pixel sums

+    "vphaddw    %%ymm2,%%ymm0,%%ymm0           \n"

+    "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"  // the bias is added before the shift

+    "vpaddw     %%ymm5,%%ymm1,%%ymm1           \n"

+    "vpsraw     $0x8,%%ymm1,%%ymm1             \n"

+    "vpsraw     $0x8,%%ymm0,%%ymm0             \n"

+    "vpacksswb  %%ymm0,%%ymm1,%%ymm0           \n"  // narrow both 16-bit results to bytes with signed saturation

+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"

+    "vpshufb    %8,%%ymm0,%%ymm0               \n"  // NOTE(review): kShufARGBToUV_AVX is defined elsewhere; presumably restores pixel order - confirm

+    "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"  // low 128 bits -> dst_u

+    VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)

+    "lea       " MEMLEA(0x10,1) ",%1           \n"  // dst_u += 16; dst_v follows through the %2 offset

+    "sub       $0x20,%3                        \n"  // width -= 32

+    "jg        1b                              \n"  // the check follows the body: one pass always runs, and width is rounded up to a multiple of 32

+    "vzeroupper                                \n"

+  : "+r"(src_argb0),       // %0

+    "+r"(dst_u),           // %1

+    "+r"(dst_v),           // %2

+    "+rm"(width)           // %3

+  : "r"((intptr_t)(src_stride_argb)), // %4

+    "m"(kAddUVJ128),  // %5

+    "m"(kARGBToVJ),  // %6

+    "m"(kARGBToUJ),  // %7

+    "m"(kShufARGBToUV_AVX)  // %8

+  : "memory", "cc", NACL_R14

+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"

+  );

+}

+#endif  // HAS_ARGBTOUVJROW_AVX2

 #ifdef HAS_ARGBTOUVJROW_SSSE3

 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,

                         uint8* dst_u, uint8* dst_v, int width) {

@@ -1073,61 +1205,8 @@

 #endif  // HAS_ARGBTOUV444ROW_SSSE3

-#ifdef HAS_ARGBTOUV422ROW_SSSE3

-void ARGBToUV422Row_SSSE3(const uint8* src_argb0,

-                          uint8* dst_u, uint8* dst_v, int width) {

+void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width) {

   asm volatile (

-    "movdqa    %4,%%xmm3                       \n"

-    "movdqa    %5,%%xmm4                       \n"

-    "movdqa    %6,%%xmm5                       \n"

-    "sub       %1,%2                           \n"

-    LABELALIGN

-  "1:                                          \n"

-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"

-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"

-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"

-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"

-    "lea       " MEMLEA(0x40,0) ",%0           \n"

-    "movdqa    %%xmm0,%%xmm7                   \n"

-    "shufps    $0x88,%%xmm1,%%xmm0             \n"

-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"

-    "pavgb     %%xmm7,%%xmm0                   \n"

-    "movdqa    %%xmm2,%%xmm7                   \n"

-    "shufps    $0x88,%%xmm6,%%xmm2             \n"

-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"

-    "pavgb     %%xmm7,%%xmm2                   \n"

-    "movdqa    %%xmm0,%%xmm1                   \n"

-    "movdqa    %%xmm2,%%xmm6                   \n"

-    "pmaddubsw %%xmm4,%%xmm0                   \n"

-    "pmaddubsw %%xmm4,%%xmm2                   \n"

-    "pmaddubsw %%xmm3,%%xmm1                   \n"

-    "pmaddubsw %%xmm3,%%xmm6                   \n"

-    "phaddw    %%xmm2,%%xmm0                   \n"

-    "phaddw    %%xmm6,%%xmm1                   \n"

-    "psraw     $0x8,%%xmm0                     \n"

-    "psraw     $0x8,%%xmm1                     \n"

-    "packsswb  %%xmm1,%%xmm0                   \n"

-    "paddb     %%xmm5,%%xmm0                   \n"

-    "movlps    %%xmm0," MEMACCESS(1) "         \n"

-    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)

-    "lea       " MEMLEA(0x8,1) ",%1            \n"

-    "sub       $0x10,%3                        \n"

-    "jg        1b                              \n"

-  : "+r"(src_argb0),       // %0

-    "+r"(dst_u),           // %1

-    "+r"(dst_v),           // %2

-    "+rm"(width)           // %3

-  : "m"(kARGBToV),  // %4

-    "m"(kARGBToU),  // %5

-    "m"(kAddUV128)  // %6

-  : "memory", "cc", NACL_R14

-    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"

-  );

-}

-#endif  // HAS_ARGBTOUV422ROW_SSSE3

-void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {

-  asm volatile (

     "movdqa    %4,%%xmm5                       \n"

     "movdqa    %3,%%xmm4                       \n"

     LABELALIGN

@@ -1153,7 +1232,7 @@

     "jg        1b                              \n"

   : "+r"(src_bgra),  // %0

     "+r"(dst_y),     // %1

-    "+r"(pix)        // %2

+    "+r"(width)        // %2

   : "m"(kBGRAToY),   // %3

     "m"(kAddY16)     // %4

   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

@@ -1221,7 +1300,7 @@

);

-void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {

+void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width) {

   asm volatile (

     "movdqa    %4,%%xmm5                       \n"

     "movdqa    %3,%%xmm4                       \n"

@@ -1248,7 +1327,7 @@

     "jg        1b                              \n"

   : "+r"(src_abgr),  // %0

     "+r"(dst_y),     // %1

-    "+r"(pix)        // %2

+    "+r"(width)        // %2

   : "m"(kABGRToY),   // %3

     "m"(kAddY16)     // %4

   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

@@ -1255,7 +1334,7 @@

);

-void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {

+void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width) {

   asm volatile (

     "movdqa    %4,%%xmm5                       \n"

     "movdqa    %3,%%xmm4                       \n"

@@ -1282,7 +1361,7 @@

     "jg        1b                              \n"

   : "+r"(src_rgba),  // %0

     "+r"(dst_y),     // %1

-    "+r"(pix)        // %2

+    "+r"(width)        // %2

   : "m"(kRGBAToY),   // %3

     "m"(kAddY16)     // %4

   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

@@ -1413,132 +1492,15 @@

 #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)

-struct YuvConstants {

-  lvec8 kUVToB;     // 0

-  lvec8 kUVToG;     // 32

-  lvec8 kUVToR;     // 64

-  lvec16 kUVBiasB;  // 96

-  lvec16 kUVBiasG;  // 128

-  lvec16 kUVBiasR;  // 160

-  lvec16 kYToRgb;   // 192

-};

-// BT.601 YUV to RGB reference

-//  R = (Y - 16) * 1.164              - V * -1.596

-//  G = (Y - 16) * 1.164 - U *  0.391 - V *  0.813

-//  B = (Y - 16) * 1.164 - U * -2.018

-// Y contribution to R,G,B.  Scale and bias.

-// TODO(fbarchard): Consider moving constants into a common header.

-#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */

-#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */

-// U and V contributions to R,G,B.

-#define UB -128 /* max(-128, round(-2.018 * 64)) */

-#define UG 25 /* round(0.391 * 64) */

-#define VG 52 /* round(0.813 * 64) */

-#define VR -102 /* round(-1.596 * 64) */

-// Bias values to subtract 16 from Y and 128 from U and V.

-#define BB (UB * 128            + YGB)

-#define BG (UG * 128 + VG * 128 + YGB)

-#define BR            (VR * 128 + YGB)

-// BT601 constants for YUV to RGB.

-static YuvConstants SIMD_ALIGNED(kYuvConstants) = {

-  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,

-    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },

-  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,

-    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },

-  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,

-    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },

-  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },

-  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },

-  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },

-  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }

-};

-// BT601 constants for NV21 where chroma plane is VU instead of UV.

-static YuvConstants SIMD_ALIGNED(kYvuConstants) = {

-  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,

-    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },

-  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,

-    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },

-  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,

-    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },

-  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },

-  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },

-  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },

-  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }

-};

-#undef YG

-#undef YGB

-#undef UB

-#undef UG

-#undef VG

-#undef VR

-#undef BB

-#undef BG

-#undef BR

-// JPEG YUV to RGB reference

-// *  R = Y                - V * -1.40200

-// *  G = Y - U *  0.34414 - V *  0.71414

-// *  B = Y - U * -1.77200

-// Y contribution to R,G,B.  Scale and bias.

-// TODO(fbarchard): Consider moving constants into a common header.

-#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */

-#define YGBJ 32  /* 64 / 2 */

-// U and V contributions to R,G,B.

-#define UBJ -113 /* round(-1.77200 * 64) */

-#define UGJ 22 /* round(0.34414 * 64) */

-#define VGJ 46 /* round(0.71414  * 64) */

-#define VRJ -90 /* round(-1.40200 * 64) */

-// Bias values to subtract 16 from Y and 128 from U and V.

-#define BBJ (UBJ * 128             + YGBJ)

-#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)

-#define BRJ             (VRJ * 128 + YGBJ)

-// JPEG constants for YUV to RGB.

-YuvConstants SIMD_ALIGNED(kYuvJConstants) = {

-  { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0,

-    UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 },

-  { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,

-    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,

-    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,

-    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ },

-  { 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ,

-    0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ },

-  { BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ,

-    BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ },

-  { BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ,

-    BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ },

-  { BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ,

-    BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ },

-  { YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ,

-    YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ }

-};

-#undef YGJ

-#undef YGBJ

-#undef UBJ

-#undef UGJ

-#undef VGJ

-#undef VRJ

-#undef BBJ

-#undef BGJ

-#undef BRJ

-// Read 8 UV from 411

+// Read 8 UV from 444

 #define READYUV444                                                             \

     "movq       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \

     MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1)                            \

     "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]               \n"            \

-    "punpcklbw  %%xmm1,%%xmm0                                   \n"

+    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \

+    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \

+    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \

+    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"

 // Read 4 UV from 422, upsample to 8 UV

 #define READYUV422                                                             \

@@ -1546,44 +1508,105 @@

     MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \

     "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]               \n"            \

     "punpcklbw  %%xmm1,%%xmm0                                   \n"            \

-    "punpcklwd  %%xmm0,%%xmm0                                   \n"

+    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \

+    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \

+    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \

+    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"

-// Read 2 UV from 411, upsample to 8 UV

-#define READYUV411                                                             \

+// Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.

+#define READYUVA422                                                            \

     "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \

     MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \

+    "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]               \n"            \

+    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \

+    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \

+    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \

+    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \

+    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"            \

+    "movq       " MEMACCESS([a_buf]) ",%%xmm5                   \n"            \

+    "lea        " MEMLEA(0x8, [a_buf]) ",%[a_buf]               \n"

+// Read 2 UV from 411, upsample to 8 UV.

+// Reading 4 bytes with movd (commented out below) is an msan violation.

+//    "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"

+//    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)

+// pinsrw fails with drmemory

+//  __asm pinsrw     xmm0, [esi], 0        /* U */

+//  __asm pinsrw     xmm1, [esi + edi], 0  /* V */

+#define READYUV411_TEMP                                                        \

+    "movzwl     " MEMACCESS([u_buf]) ",%[temp]                  \n"            \

+    "movd       %[temp],%%xmm0                                  \n"            \

+    MEMOPARG(movzwl, 0x00, [u_buf], [v_buf], 1, [temp]) "       \n"            \

+    "movd       %[temp],%%xmm1                                  \n"            \

     "lea        " MEMLEA(0x2, [u_buf]) ",%[u_buf]               \n"            \

     "punpcklbw  %%xmm1,%%xmm0                                   \n"            \

     "punpcklwd  %%xmm0,%%xmm0                                   \n"            \

-    "punpckldq  %%xmm0,%%xmm0                                   \n"

+    "punpckldq  %%xmm0,%%xmm0                                   \n"            \

+    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \

+    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \

+    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"

 // Read 4 UV from NV12, upsample to 8 UV

 #define READNV12                                                               \

     "movq       " MEMACCESS([uv_buf]) ",%%xmm0                  \n"            \

     "lea        " MEMLEA(0x8, [uv_buf]) ",%[uv_buf]             \n"            \

-    "punpcklwd  %%xmm0,%%xmm0                                   \n"

+    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \

+    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \

+    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \

+    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"

+// Read 4 VU from NV21, upsample to 8 UV

+#define READNV21                                                               \

+    "movq       " MEMACCESS([vu_buf]) ",%%xmm0                  \n"            \

+    "lea        " MEMLEA(0x8, [vu_buf]) ",%[vu_buf]             \n"            \

+    "pshufb     %[kShuffleNV21], %%xmm0                         \n"            \

+    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \

+    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \

+    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"

+// Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.

+#define READYUY2                                                               \

+    "movdqu     " MEMACCESS([yuy2_buf]) ",%%xmm4                \n"            \

+    "pshufb     %[kShuffleYUY2Y], %%xmm4                        \n"            \

+    "movdqu     " MEMACCESS([yuy2_buf]) ",%%xmm0                \n"            \

+    "pshufb     %[kShuffleYUY2UV], %%xmm0                       \n"            \

+    "lea        " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf]        \n"

+// Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.

+#define READUYVY                                                               \

+    "movdqu     " MEMACCESS([uyvy_buf]) ",%%xmm4                \n"            \

+    "pshufb     %[kShuffleUYVYY], %%xmm4                        \n"            \

+    "movdqu     " MEMACCESS([uyvy_buf]) ",%%xmm0                \n"            \

+    "pshufb     %[kShuffleUYVYUV], %%xmm0                       \n"            \

+    "lea        " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf]        \n"

+#if defined(__x86_64__)

+#define YUVTORGB_SETUP(yuvconstants)                                           \

+    "movdqa     " MEMACCESS([yuvconstants]) ",%%xmm8            \n"            \

+    "movdqa     " MEMACCESS2(32, [yuvconstants]) ",%%xmm9       \n"            \

+    "movdqa     " MEMACCESS2(64, [yuvconstants]) ",%%xmm10      \n"            \

+    "movdqa     " MEMACCESS2(96, [yuvconstants]) ",%%xmm11      \n"            \

+    "movdqa     " MEMACCESS2(128, [yuvconstants]) ",%%xmm12     \n"            \

+    "movdqa     " MEMACCESS2(160, [yuvconstants]) ",%%xmm13     \n"            \

+    "movdqa     " MEMACCESS2(192, [yuvconstants]) ",%%xmm14     \n"

 // Convert 8 pixels: 8 UV and 8 Y

-#define YUVTORGB(YuvConstants)                                                 \

+#define YUVTORGB(yuvconstants)                                                 \

     "movdqa     %%xmm0,%%xmm1                                   \n"            \

     "movdqa     %%xmm0,%%xmm2                                   \n"            \

     "movdqa     %%xmm0,%%xmm3                                   \n"            \

-    "movdqa     " MEMACCESS2(96, [YuvConstants]) ",%%xmm0       \n"            \

-    "pmaddubsw  " MEMACCESS([YuvConstants]) ",%%xmm1            \n"            \

+    "movdqa     %%xmm11,%%xmm0                                  \n"            \

+    "pmaddubsw  %%xmm8,%%xmm1                                   \n"            \

     "psubw      %%xmm1,%%xmm0                                   \n"            \

-    "movdqa     " MEMACCESS2(128, [YuvConstants]) ",%%xmm1      \n"            \

-    "pmaddubsw  " MEMACCESS2(32, [YuvConstants]) ",%%xmm2       \n"            \

+    "movdqa     %%xmm12,%%xmm1                                  \n"            \

+    "pmaddubsw  %%xmm9,%%xmm2                                   \n"            \

     "psubw      %%xmm2,%%xmm1                                   \n"            \

-    "movdqa     " MEMACCESS2(160, [YuvConstants]) ",%%xmm2      \n"            \

-    "pmaddubsw  " MEMACCESS2(64, [YuvConstants]) ",%%xmm3       \n"            \

+    "movdqa     %%xmm13,%%xmm2                                  \n"            \

+    "pmaddubsw  %%xmm10,%%xmm3                                  \n"            \

     "psubw      %%xmm3,%%xmm2                                   \n"            \

-    "movq       " MEMACCESS([y_buf]) ",%%xmm3                   \n"            \

-    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"            \

-    "punpcklbw  %%xmm3,%%xmm3                                   \n"            \

-    "pmulhuw    " MEMACCESS2(192, [YuvConstants]) ",%%xmm3      \n"            \

-    "paddsw     %%xmm3,%%xmm0                                   \n"            \

-    "paddsw     %%xmm3,%%xmm1                                   \n"            \

-    "paddsw     %%xmm3,%%xmm2                                   \n"            \

+    "pmulhuw    %%xmm14,%%xmm4                                  \n"            \

+    "paddsw     %%xmm4,%%xmm0                                   \n"            \

+    "paddsw     %%xmm4,%%xmm1                                   \n"            \

+    "paddsw     %%xmm4,%%xmm2                                   \n"            \

     "psraw      $0x6,%%xmm0                                     \n"            \

     "psraw      $0x6,%%xmm1                                     \n"            \

     "psraw      $0x6,%%xmm2                                     \n"            \

@@ -1590,8 +1613,39 @@

     "packuswb   %%xmm0,%%xmm0                                   \n"            \

     "packuswb   %%xmm1,%%xmm1                                   \n"            \

     "packuswb   %%xmm2,%%xmm2                                   \n"

+#define YUVTORGB_REGS \

+    "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",

-// Store 8 ARGB values. Assumes XMM5 is zero.

+#else

+#define YUVTORGB_SETUP(yuvconstants)

+// Convert 8 pixels: 8 UV and 8 Y

+#define YUVTORGB(yuvconstants)                                                 \

+    "movdqa     %%xmm0,%%xmm1                                   \n"            \

+    "movdqa     %%xmm0,%%xmm2                                   \n"            \

+    "movdqa     %%xmm0,%%xmm3                                   \n"            \

+    "movdqa     " MEMACCESS2(96, [yuvconstants]) ",%%xmm0       \n"            \

+    "pmaddubsw  " MEMACCESS([yuvconstants]) ",%%xmm1            \n"            \

+    "psubw      %%xmm1,%%xmm0                                   \n"            \

+    "movdqa     " MEMACCESS2(128, [yuvconstants]) ",%%xmm1      \n"            \

+    "pmaddubsw  " MEMACCESS2(32, [yuvconstants]) ",%%xmm2       \n"            \

+    "psubw      %%xmm2,%%xmm1                                   \n"            \

+    "movdqa     " MEMACCESS2(160, [yuvconstants]) ",%%xmm2      \n"            \

+    "pmaddubsw  " MEMACCESS2(64, [yuvconstants]) ",%%xmm3       \n"            \

+    "psubw      %%xmm3,%%xmm2                                   \n"            \

+    "pmulhuw    " MEMACCESS2(192, [yuvconstants]) ",%%xmm4      \n"            \

+    "paddsw     %%xmm4,%%xmm0                                   \n"            \

+    "paddsw     %%xmm4,%%xmm1                                   \n"            \

+    "paddsw     %%xmm4,%%xmm2                                   \n"            \

+    "psraw      $0x6,%%xmm0                                     \n"            \

+    "psraw      $0x6,%%xmm1                                     \n"            \

+    "psraw      $0x6,%%xmm2                                     \n"            \

+    "packuswb   %%xmm0,%%xmm0                                   \n"            \

+    "packuswb   %%xmm1,%%xmm1                                   \n"            \

+    "packuswb   %%xmm2,%%xmm2                                   \n"

+#define YUVTORGB_REGS

+#endif

+// Store 8 ARGB values.

 #define STOREARGB                                                              \

     "punpcklbw  %%xmm1,%%xmm0                                    \n"           \

     "punpcklbw  %%xmm5,%%xmm2                                    \n"           \

@@ -1602,30 +1656,7 @@

     "movdqu     %%xmm1," MEMACCESS2(0x10, [dst_argb]) "          \n"           \

     "lea        " MEMLEA(0x20, [dst_argb]) ", %[dst_argb]        \n"

-// Store 8 BGRA values. Assumes XMM5 is zero.

-#define STOREBGRA                                                              \

-    "pcmpeqb   %%xmm5,%%xmm5                                     \n"           \

-    "punpcklbw %%xmm0,%%xmm1                                     \n"           \

-    "punpcklbw %%xmm2,%%xmm5                                     \n"           \

-    "movdqa    %%xmm5,%%xmm0                                     \n"           \

-    "punpcklwd %%xmm1,%%xmm5                                     \n"           \

-    "punpckhwd %%xmm1,%%xmm0                                     \n"           \

-    "movdqu    %%xmm5," MEMACCESS([dst_bgra]) "                  \n"           \

-    "movdqu    %%xmm0," MEMACCESS2(0x10, [dst_bgra]) "           \n"           \

-    "lea       " MEMLEA(0x20, [dst_bgra]) ", %[dst_bgra]         \n"

-// Store 8 ABGR values. Assumes XMM5 is zero.

-#define STOREABGR                                                              \

-    "punpcklbw %%xmm1,%%xmm2                                     \n"           \

-    "punpcklbw %%xmm5,%%xmm0                                     \n"           \

-    "movdqa    %%xmm2,%%xmm1                                     \n"           \

-    "punpcklwd %%xmm0,%%xmm2                                     \n"           \

-    "punpckhwd %%xmm0,%%xmm1                                     \n"           \

-    "movdqu    %%xmm2," MEMACCESS([dst_abgr]) "                  \n"           \

-    "movdqu    %%xmm1," MEMACCESS2(0x10, [dst_abgr]) "           \n"           \

-    "lea       " MEMLEA(0x20, [dst_abgr]) ", %[dst_abgr]         \n"

-// Store 8 RGBA values. Assumes XMM5 is zero.

+// Store 8 RGBA values.

 #define STORERGBA                                                              \

     "pcmpeqb   %%xmm5,%%xmm5                                     \n"           \

     "punpcklbw %%xmm2,%%xmm1                                     \n"           \

@@ -1641,14 +1672,16 @@

                                 const uint8* u_buf,

                                 const uint8* v_buf,

                                 uint8* dst_argb,

+                                const struct YuvConstants* yuvconstants,

                                 int width) {

   asm volatile (

+    YUVTORGB_SETUP(yuvconstants)

     "sub       %[u_buf],%[v_buf]               \n"

     "pcmpeqb   %%xmm5,%%xmm5                   \n"

     LABELALIGN

   "1:                                          \n"

     READYUV444

-    YUVTORGB(kYuvConstants)

+    YUVTORGB(yuvconstants)

     STOREARGB

     "sub       $0x8,%[width]                   \n"

     "jg        1b                              \n"

@@ -1657,19 +1690,20 @@

     [v_buf]"+r"(v_buf),    // %[v_buf]

     [dst_argb]"+r"(dst_argb),  // %[dst_argb]

     [width]"+rm"(width)    // %[width]

-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]

-  : "memory", "cc", NACL_R14

-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"

+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]

+  : "memory", "cc", NACL_R14 YUVTORGB_REGS

+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

);

-// TODO(fbarchard): Consider putting masks into constants.

 void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,

                                  const uint8* u_buf,

                                  const uint8* v_buf,

                                  uint8* dst_rgb24,

+                                 const struct YuvConstants* yuvconstants,

                                  int width) {

   asm volatile (

+    YUVTORGB_SETUP(yuvconstants)

     "movdqa    %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"

     "movdqa    %[kShuffleMaskARGBToRGB24],%%xmm6   \n"

     "sub       %[u_buf],%[v_buf]               \n"

@@ -1676,7 +1710,7 @@

     LABELALIGN

   "1:                                          \n"

     READYUV422

-    YUVTORGB(kYuvConstants)

+    YUVTORGB(yuvconstants)

     "punpcklbw %%xmm1,%%xmm0                   \n"

     "punpcklbw %%xmm2,%%xmm2                   \n"

     "movdqa    %%xmm0,%%xmm1                   \n"

@@ -1694,76 +1728,33 @@

     [u_buf]"+r"(u_buf),    // %[u_buf]

     [v_buf]"+r"(v_buf),    // %[v_buf]

     [dst_rgb24]"+r"(dst_rgb24),  // %[dst_rgb24]

-// TODO(fbarchard): Make width a register for 32 bit.

 #if defined(__i386__) && defined(__pic__)

     [width]"+m"(width)     // %[width]

 #else

     [width]"+rm"(width)    // %[width]

 #endif

-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB),

+  : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]

     [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),

     [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)

-  : "memory", "cc", NACL_R14

-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6"

+  : "memory", "cc", NACL_R14 YUVTORGB_REGS

+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"

);

-void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,

-                               const uint8* u_buf,

-                               const uint8* v_buf,

-                               uint8* dst_raw,

-                               int width) {

-  asm volatile (

-    "movdqa    %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"

-    "movdqa    %[kShuffleMaskARGBToRAW],%%xmm6   \n"

-    "sub       %[u_buf],%[v_buf]               \n"

-    LABELALIGN

-  "1:                                          \n"

-    READYUV422

-    YUVTORGB(kYuvConstants)

-    "punpcklbw %%xmm1,%%xmm0                   \n"

-    "punpcklbw %%xmm2,%%xmm2                   \n"

-    "movdqa    %%xmm0,%%xmm1                   \n"

-    "punpcklwd %%xmm2,%%xmm0                   \n"

-    "punpckhwd %%xmm2,%%xmm1                   \n"

-    "pshufb    %%xmm5,%%xmm0                   \n"

-    "pshufb    %%xmm6,%%xmm1                   \n"

-    "palignr   $0xc,%%xmm0,%%xmm1              \n"

-    "movq      %%xmm0," MEMACCESS([dst_raw]) " \n"

-    "movdqu    %%xmm1," MEMACCESS2(0x8,[dst_raw]) "\n"

-    "lea       " MEMLEA(0x18,[dst_raw]) ",%[dst_raw] \n"

-    "subl      $0x8,%[width]                   \n"

-    "jg        1b                              \n"

-  : [y_buf]"+r"(y_buf),    // %[y_buf]

-    [u_buf]"+r"(u_buf),    // %[u_buf]

-    [v_buf]"+r"(v_buf),    // %[v_buf]

-    [dst_raw]"+r"(dst_raw),  // %[dst_raw]

-// TODO(fbarchard): Make width a register for 32 bit.

-#if defined(__i386__) && defined(__pic__)

-    [width]"+m"(width)    // %[width]

-#else

-    [width]"+rm"(width)    // %[width]

-#endif

-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB),

-    [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),

-    [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)

-  : "memory", "cc", NACL_R14

-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6"

-  );

-}

 void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,

                                 const uint8* u_buf,

                                 const uint8* v_buf,

                                 uint8* dst_argb,

+                                const struct YuvConstants* yuvconstants,

                                 int width) {

   asm volatile (

+    YUVTORGB_SETUP(yuvconstants)

     "sub       %[u_buf],%[v_buf]               \n"

     "pcmpeqb   %%xmm5,%%xmm5                   \n"

     LABELALIGN

   "1:                                          \n"

     READYUV422

-    YUVTORGB(kYuvConstants)

+    YUVTORGB(yuvconstants)

     STOREARGB

     "sub       $0x8,%[width]                   \n"

     "jg        1b                              \n"

@@ -1772,74 +1763,95 @@

     [v_buf]"+r"(v_buf),    // %[v_buf]

     [dst_argb]"+r"(dst_argb),  // %[dst_argb]

     [width]"+rm"(width)    // %[width]

-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]

-  : "memory", "cc", NACL_R14

-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"

+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]

+  : "memory", "cc", NACL_R14 YUVTORGB_REGS

+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

);

-void OMITFP J422ToARGBRow_SSSE3(const uint8* y_buf,

-                                const uint8* u_buf,

-                                const uint8* v_buf,

-                                uint8* dst_argb,

-                                int width) {

+#ifdef HAS_I422ALPHATOARGBROW_SSSE3

+void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf,

+                                     const uint8* u_buf,

+                                     const uint8* v_buf,

+                                     const uint8* a_buf,

+                                     uint8* dst_argb,

+                                     const struct YuvConstants* yuvconstants,

+                                     int width) {

   asm volatile (

+    YUVTORGB_SETUP(yuvconstants)

     "sub       %[u_buf],%[v_buf]               \n"

-    "pcmpeqb   %%xmm5,%%xmm5                   \n"

     LABELALIGN

   "1:                                          \n"

-    READYUV422

-    YUVTORGB(kYuvConstants)

+    READYUVA422

+    YUVTORGB(yuvconstants)

     STOREARGB

-    "sub       $0x8,%[width]                   \n"

+    "subl      $0x8,%[width]                   \n"

     "jg        1b                              \n"

   : [y_buf]"+r"(y_buf),    // %[y_buf]

     [u_buf]"+r"(u_buf),    // %[u_buf]

     [v_buf]"+r"(v_buf),    // %[v_buf]

+    [a_buf]"+r"(a_buf),    // %[a_buf]

     [dst_argb]"+r"(dst_argb),  // %[dst_argb]

+#if defined(__i386__) && defined(__pic__)

+    [width]"+m"(width)     // %[width]

+#else

     [width]"+rm"(width)    // %[width]

-  : [kYuvConstants]"r"(&kYuvJConstants.kUVToB) // %[kYuvConstants]

-  : "memory", "cc", NACL_R14

-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"

+#endif

+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]

+  : "memory", "cc", NACL_R14 YUVTORGB_REGS

+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

);

+#endif  // HAS_I422ALPHATOARGBROW_SSSE3

+#ifdef HAS_I411TOARGBROW_SSSE3

 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,

                                 const uint8* u_buf,

                                 const uint8* v_buf,

                                 uint8* dst_argb,

+                                const struct YuvConstants* yuvconstants,

                                 int width) {

+  int temp;

   asm volatile (

+    YUVTORGB_SETUP(yuvconstants)

     "sub       %[u_buf],%[v_buf]               \n"

     "pcmpeqb   %%xmm5,%%xmm5                   \n"

     LABELALIGN

   "1:                                          \n"

-    READYUV411

-    YUVTORGB(kYuvConstants)

+    READYUV411_TEMP

+    YUVTORGB(yuvconstants)

     STOREARGB

-    "sub       $0x8,%[width]                   \n"

+    "subl      $0x8,%[width]                   \n"

     "jg        1b                              \n"

-  : [y_buf]"+r"(y_buf),    // %[y_buf]

-    [u_buf]"+r"(u_buf),    // %[u_buf]

-    [v_buf]"+r"(v_buf),    // %[v_buf]

+  : [y_buf]"+r"(y_buf),        // %[y_buf]

+    [u_buf]"+r"(u_buf),        // %[u_buf]

+    [v_buf]"+r"(v_buf),        // %[v_buf]

     [dst_argb]"+r"(dst_argb),  // %[dst_argb]

-    [width]"+rm"(width)    // %[width]

-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]

-  : "memory", "cc", NACL_R14

-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"

+    [temp]"=&r"(temp),         // %[temp]

+#if defined(__i386__) && defined(__pic__)

+    [width]"+m"(width)         // %[width]

+#else

+    [width]"+rm"(width)        // %[width]

+#endif

+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]

+  : "memory", "cc", NACL_R14 YUVTORGB_REGS

+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

);

+#endif

 void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,

                                 const uint8* uv_buf,

                                 uint8* dst_argb,

+                                const struct YuvConstants* yuvconstants,

                                 int width) {

   asm volatile (

+    YUVTORGB_SETUP(yuvconstants)

     "pcmpeqb   %%xmm5,%%xmm5                   \n"

     LABELALIGN

   "1:                                          \n"

     READNV12

-    YUVTORGB(kYuvConstants)

+    YUVTORGB(yuvconstants)

     STOREARGB

     "sub       $0x8,%[width]                   \n"

     "jg        1b                              \n"

@@ -1847,84 +1859,85 @@

     [uv_buf]"+r"(uv_buf),    // %[uv_buf]

     [dst_argb]"+r"(dst_argb),  // %[dst_argb]

     [width]"+rm"(width)    // %[width]

-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]

-  // Does not use r14.

-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"

+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]

+    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.

+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

);

 void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,

-                                const uint8* uv_buf,

+                                const uint8* vu_buf,

                                 uint8* dst_argb,

+                                const struct YuvConstants* yuvconstants,

                                 int width) {

   asm volatile (

+    YUVTORGB_SETUP(yuvconstants)

     "pcmpeqb   %%xmm5,%%xmm5                   \n"

     LABELALIGN

   "1:                                          \n"

-    READNV12

-    YUVTORGB(kYuvConstants)

+    READNV21

+    YUVTORGB(yuvconstants)

     STOREARGB

     "sub       $0x8,%[width]                   \n"

     "jg        1b                              \n"

   : [y_buf]"+r"(y_buf),    // %[y_buf]

-    [uv_buf]"+r"(uv_buf),    // %[uv_buf]

+    [vu_buf]"+r"(vu_buf),    // %[vu_buf]

     [dst_argb]"+r"(dst_argb),  // %[dst_argb]

     [width]"+rm"(width)    // %[width]

-  : [kYuvConstants]"r"(&kYvuConstants.kUVToB) // %[kYuvConstants]

-  // Does not use r14.

-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"

+  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]

+    [kShuffleNV21]"m"(kShuffleNV21)

+    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.

+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

);

-void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,

-                                const uint8* u_buf,

-                                const uint8* v_buf,

-                                uint8* dst_bgra,

+void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf,

+                                uint8* dst_argb,

+                                const struct YuvConstants* yuvconstants,

                                 int width) {

   asm volatile (

-    "sub       %[u_buf],%[v_buf]               \n"

+    YUVTORGB_SETUP(yuvconstants)

     "pcmpeqb   %%xmm5,%%xmm5                   \n"

     LABELALIGN

   "1:                                          \n"

-    READYUV422

-    YUVTORGB(kYuvConstants)

-    STOREBGRA

+    READYUY2

+    YUVTORGB(yuvconstants)

+    STOREARGB

     "sub       $0x8,%[width]                   \n"

     "jg        1b                              \n"

-  : [y_buf]"+r"(y_buf),    // %[y_buf]

-    [u_buf]"+r"(u_buf),    // %[u_buf]

-    [v_buf]"+r"(v_buf),    // %[v_buf]

-    [dst_bgra]"+r"(dst_bgra),  // %[dst_bgra]

+  : [yuy2_buf]"+r"(yuy2_buf),    // %[yuy2_buf]

+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]

     [width]"+rm"(width)    // %[width]

-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]

-  : "memory", "cc", NACL_R14

-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"

+  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]

+    [kShuffleYUY2Y]"m"(kShuffleYUY2Y),

+    [kShuffleYUY2UV]"m"(kShuffleYUY2UV)

+    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.

+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

);

-void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,

-                                const uint8* u_buf,

-                                const uint8* v_buf,

-                                uint8* dst_abgr,

+void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf,

+                                uint8* dst_argb,

+                                const struct YuvConstants* yuvconstants,

                                 int width) {

   asm volatile (

-    "sub       %[u_buf],%[v_buf]               \n"

+    YUVTORGB_SETUP(yuvconstants)

     "pcmpeqb   %%xmm5,%%xmm5                   \n"

     LABELALIGN

   "1:                                          \n"

-    READYUV422

-    YUVTORGB(kYuvConstants)

-    STOREABGR

+    READUYVY

+    YUVTORGB(yuvconstants)

+    STOREARGB

     "sub       $0x8,%[width]                   \n"

     "jg        1b                              \n"

-  : [y_buf]"+r"(y_buf),    // %[y_buf]

-    [u_buf]"+r"(u_buf),    // %[u_buf]

-    [v_buf]"+r"(v_buf),    // %[v_buf]

-    [dst_abgr]"+r"(dst_abgr),  // %[dst_abgr]

+  : [uyvy_buf]"+r"(uyvy_buf),    // %[uyvy_buf]

+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]

     [width]"+rm"(width)    // %[width]

-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]

-  : "memory", "cc", NACL_R14

-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"

+  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]

+    [kShuffleUYVYY]"m"(kShuffleUYVYY),

+    [kShuffleUYVYUV]"m"(kShuffleUYVYUV)

+    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.

+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

);

@@ -1932,14 +1945,16 @@

                                 const uint8* u_buf,

                                 const uint8* v_buf,

                                 uint8* dst_rgba,

+                                const struct YuvConstants* yuvconstants,

                                 int width) {

   asm volatile (

+    YUVTORGB_SETUP(yuvconstants)

     "sub       %[u_buf],%[v_buf]               \n"

     "pcmpeqb   %%xmm5,%%xmm5                   \n"

     LABELALIGN

   "1:                                          \n"

     READYUV422

-    YUVTORGB(kYuvConstants)

+    YUVTORGB(yuvconstants)

     STORERGBA

     "sub       $0x8,%[width]                   \n"

     "jg        1b                              \n"

@@ -1948,76 +1963,191 @@

     [v_buf]"+r"(v_buf),    // %[v_buf]

     [dst_rgba]"+r"(dst_rgba),  // %[dst_rgba]

     [width]"+rm"(width)    // %[width]

-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]

-  : "memory", "cc", NACL_R14

-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"

+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]

+  : "memory", "cc", NACL_R14 YUVTORGB_REGS

+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

);

 #endif  // HAS_I422TOARGBROW_SSSE3

+// Read 16 UV from 444 (interleaved into ymm0) and 16 Y (byte-doubled, ymm4).

+#define READYUV444_AVX2                                                        \

+    "vmovdqu    " MEMACCESS([u_buf]) ",%%xmm0                       \n"        \

+    MEMOPREG(vmovdqu, 0x00, [u_buf], [v_buf], 1, xmm1)                         \

+    "lea        " MEMLEA(0x10, [u_buf]) ",%[u_buf]                  \n"        \

+    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \

+    "vpermq     $0xd8,%%ymm1,%%ymm1                                 \n"        \

+    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \

+    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \

+    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \

+    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \

+    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"

 // Read 8 UV from 422, upsample to 16 UV.

 #define READYUV422_AVX2                                                        \

-    "vmovq       " MEMACCESS([u_buf]) ",%%xmm0                      \n"        \

+    "vmovq      " MEMACCESS([u_buf]) ",%%xmm0                       \n"        \

     MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1)                           \

     "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]                   \n"        \

     "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \

     "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \

-    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"

+    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \

+    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \

+    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \

+    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \

+    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"

-// Convert 16 pixels: 16 UV and 16 Y.

-#define YUVTORGB_AVX2(YuvConstants)                                            \

-    "vpmaddubsw  " MEMACCESS2(64, [YuvConstants]) ",%%ymm0,%%ymm2   \n"        \

-    "vpmaddubsw  " MEMACCESS2(32, [YuvConstants]) ",%%ymm0,%%ymm1   \n"        \

-    "vpmaddubsw  " MEMACCESS([YuvConstants]) ",%%ymm0,%%ymm0        \n"        \

-    "vmovdqu     " MEMACCESS2(160, [YuvConstants]) ",%%ymm3         \n"        \

+// Read 8 UV from 422, upsample to 16 UV; also 16 Y (ymm4) and 16 A (ymm5).

+#define READYUVA422_AVX2                                                       \

+    "vmovq      " MEMACCESS([u_buf]) ",%%xmm0                       \n"        \

+    MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1)                           \

+    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]                   \n"        \

+    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \

+    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \

+    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \

+    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \

+    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \

+    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \

+    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"        \

+    "vmovdqu    " MEMACCESS([a_buf]) ",%%xmm5                       \n"        \

+    "vpermq     $0xd8,%%ymm5,%%ymm5                                 \n"        \

+    "lea        " MEMLEA(0x10, [a_buf]) ",%[a_buf]                  \n"

+// Read 4 UV from 411, upsample to 16 UV in ymm0; read 16 Y into ymm4.

+#define READYUV411_AVX2                                                        \

+    "vmovd      " MEMACCESS([u_buf]) ",%%xmm0                       \n"        \

+    MEMOPREG(vmovd, 0x00, [u_buf], [v_buf], 1, xmm1)                           \

+    "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]                   \n"        \

+    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \

+    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \

+    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \

+    "vpunpckldq %%ymm0,%%ymm0,%%ymm0                                \n"        \

+    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \

+    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \

+    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \

+    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"

+// Read 8 UV from NV12, upsample to 16 UV in ymm0; read 16 Y into ymm4.

+#define READNV12_AVX2                                                          \

+    "vmovdqu    " MEMACCESS([uv_buf]) ",%%xmm0                      \n"        \

+    "lea        " MEMLEA(0x10, [uv_buf]) ",%[uv_buf]                \n"        \

+    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \

+    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \

+    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \

+    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \

+    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \

+    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"

+// Read 8 VU from NV21, shuffle via %[kShuffleNV21] to 16 UV; 16 Y into ymm4.

+#define READNV21_AVX2                                                          \

+    "vmovdqu    " MEMACCESS([vu_buf]) ",%%xmm0                      \n"        \

+    "lea        " MEMLEA(0x10, [vu_buf]) ",%[vu_buf]                \n"        \

+    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \

+    "vpshufb     %[kShuffleNV21], %%ymm0, %%ymm0                    \n"        \

+    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \

+    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \

+    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \

+    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"

+// Read 32 bytes of YUY2: 16 Y into ymm4, 8 UV upsampled to 16 UV into ymm0.

+#define READYUY2_AVX2                                                          \

+    "vmovdqu    " MEMACCESS([yuy2_buf]) ",%%ymm4                    \n"        \

+    "vpshufb    %[kShuffleYUY2Y], %%ymm4, %%ymm4                    \n"        \

+    "vmovdqu    " MEMACCESS([yuy2_buf]) ",%%ymm0                    \n"        \

+    "vpshufb    %[kShuffleYUY2UV], %%ymm0, %%ymm0                   \n"        \

+    "lea        " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf]            \n"

+// Read 32 bytes of UYVY: 16 Y into ymm4, 8 UV upsampled to 16 UV into ymm0.

+#define READUYVY_AVX2                                                          \

+    "vmovdqu     " MEMACCESS([uyvy_buf]) ",%%ymm4                   \n"        \

+    "vpshufb     %[kShuffleUYVYY], %%ymm4, %%ymm4                   \n"        \

+    "vmovdqu     " MEMACCESS([uyvy_buf]) ",%%ymm0                   \n"        \

+    "vpshufb     %[kShuffleUYVYUV], %%ymm0, %%ymm0                  \n"        \

+    "lea        " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf]            \n"

+#if defined(__x86_64__)  // Constants are preloaded once into ymm8-ymm14.

+#define YUVTORGB_SETUP_AVX2(yuvconstants)                                      \

+    "vmovdqa     " MEMACCESS([yuvconstants]) ",%%ymm8            \n"           \

+    "vmovdqa     " MEMACCESS2(32, [yuvconstants]) ",%%ymm9       \n"           \

+    "vmovdqa     " MEMACCESS2(64, [yuvconstants]) ",%%ymm10      \n"           \

+    "vmovdqa     " MEMACCESS2(96, [yuvconstants]) ",%%ymm11      \n"           \

+    "vmovdqa     " MEMACCESS2(128, [yuvconstants]) ",%%ymm12     \n"           \

+    "vmovdqa     " MEMACCESS2(160, [yuvconstants]) ",%%ymm13     \n"           \

+    "vmovdqa     " MEMACCESS2(192, [yuvconstants]) ",%%ymm14     \n"

+#define YUVTORGB_AVX2(yuvconstants)                                            \

+    "vpmaddubsw  %%ymm10,%%ymm0,%%ymm2                              \n"        \

+    "vpmaddubsw  %%ymm9,%%ymm0,%%ymm1                               \n"        \

+    "vpmaddubsw  %%ymm8,%%ymm0,%%ymm0                               \n"        \

+    "vpsubw      %%ymm2,%%ymm13,%%ymm2                              \n"        \

+    "vpsubw      %%ymm1,%%ymm12,%%ymm1                              \n"        \

+    "vpsubw      %%ymm0,%%ymm11,%%ymm0                              \n"        \

+    "vpmulhuw    %%ymm14,%%ymm4,%%ymm4                              \n"        \

+    "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n"        \

+    "vpaddsw     %%ymm4,%%ymm1,%%ymm1                               \n"        \

+    "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n"        \

+    "vpsraw      $0x6,%%ymm0,%%ymm0                                 \n"        \

+    "vpsraw      $0x6,%%ymm1,%%ymm1                                 \n"        \

+    "vpsraw      $0x6,%%ymm2,%%ymm2                                 \n"        \

+    "vpackuswb   %%ymm0,%%ymm0,%%ymm0                               \n"        \

+    "vpackuswb   %%ymm1,%%ymm1,%%ymm1                               \n"        \

+    "vpackuswb   %%ymm2,%%ymm2,%%ymm2                               \n"

+#define YUVTORGB_REGS_AVX2 \

+    "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",

+#else  // Convert 16 pixels: 16 UV and 16 Y.

+#define YUVTORGB_SETUP_AVX2(yuvconstants)

+#define YUVTORGB_AVX2(yuvconstants)                                            \

+    "vpmaddubsw  " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2   \n"        \

+    "vpmaddubsw  " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1   \n"        \

+    "vpmaddubsw  " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0        \n"        \

+    "vmovdqu     " MEMACCESS2(160, [yuvconstants]) ",%%ymm3         \n"        \

     "vpsubw      %%ymm2,%%ymm3,%%ymm2                               \n"        \

-    "vmovdqu     " MEMACCESS2(128, [YuvConstants]) ",%%ymm3         \n"        \

+    "vmovdqu     " MEMACCESS2(128, [yuvconstants]) ",%%ymm3         \n"        \

     "vpsubw      %%ymm1,%%ymm3,%%ymm1                               \n"        \

-    "vmovdqu     " MEMACCESS2(96, [YuvConstants]) ",%%ymm3          \n"        \

+    "vmovdqu     " MEMACCESS2(96, [yuvconstants]) ",%%ymm3          \n"        \

     "vpsubw      %%ymm0,%%ymm3,%%ymm0                               \n"        \

-    "vmovdqu     " MEMACCESS([y_buf]) ",%%xmm3                      \n"        \

-    "lea         " MEMLEA(0x10, [y_buf]) ",%[y_buf]                 \n"        \

-    "vpermq      $0xd8,%%ymm3,%%ymm3                                \n"        \

-    "vpunpcklbw  %%ymm3,%%ymm3,%%ymm3                               \n"        \

-    "vpmulhuw    " MEMACCESS2(192, [YuvConstants]) ",%%ymm3,%%ymm3  \n"        \

-    "vpaddsw     %%ymm3,%%ymm0,%%ymm0           \n"                            \

-    "vpaddsw     %%ymm3,%%ymm1,%%ymm1           \n"                            \

-    "vpaddsw     %%ymm3,%%ymm2,%%ymm2           \n"                            \

-    "vpsraw      $0x6,%%ymm0,%%ymm0             \n"                            \

-    "vpsraw      $0x6,%%ymm1,%%ymm1             \n"                            \

-    "vpsraw      $0x6,%%ymm2,%%ymm2             \n"                            \

-    "vpackuswb   %%ymm0,%%ymm0,%%ymm0           \n"                            \

-    "vpackuswb   %%ymm1,%%ymm1,%%ymm1           \n"                            \

-    "vpackuswb   %%ymm2,%%ymm2,%%ymm2           \n"

+    "vpmulhuw    " MEMACCESS2(192, [yuvconstants]) ",%%ymm4,%%ymm4  \n"        \

+    "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n"        \

+    "vpaddsw     %%ymm4,%%ymm1,%%ymm1                               \n"        \

+    "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n"        \

+    "vpsraw      $0x6,%%ymm0,%%ymm0                                 \n"        \

+    "vpsraw      $0x6,%%ymm1,%%ymm1                                 \n"        \

+    "vpsraw      $0x6,%%ymm2,%%ymm2                                 \n"        \

+    "vpackuswb   %%ymm0,%%ymm0,%%ymm0                               \n"        \

+    "vpackuswb   %%ymm1,%%ymm1,%%ymm1                               \n"        \

+    "vpackuswb   %%ymm2,%%ymm2,%%ymm2                               \n"

+#define YUVTORGB_REGS_AVX2

+#endif

-#if defined(HAS_I422TOBGRAROW_AVX2)

+// Store 16 ARGB values (64 bytes) woven from ymm0/ymm1/ymm2/ymm5; advance dst.

+#define STOREARGB_AVX2                                                         \

+    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \

+    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \

+    "vpunpcklbw %%ymm5,%%ymm2,%%ymm2                                \n"        \

+    "vpermq     $0xd8,%%ymm2,%%ymm2                                 \n"        \

+    "vpunpcklwd %%ymm2,%%ymm0,%%ymm1                                \n"        \

+    "vpunpckhwd %%ymm2,%%ymm0,%%ymm0                                \n"        \

+    "vmovdqu    %%ymm1," MEMACCESS([dst_argb]) "                    \n"        \

+    "vmovdqu    %%ymm0," MEMACCESS2(0x20, [dst_argb]) "             \n"        \

+    "lea       " MEMLEA(0x40, [dst_argb]) ", %[dst_argb]            \n"

+#ifdef HAS_I444TOARGBROW_AVX2

 // 16 pixels

-// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).

-void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf,

+// 16 UV values with 16 Y producing 16 ARGB (64 bytes).

+void OMITFP I444ToARGBRow_AVX2(const uint8* y_buf,

                                const uint8* u_buf,

                                const uint8* v_buf,

-                               uint8* dst_bgra,

+                               uint8* dst_argb,

+                               const struct YuvConstants* yuvconstants,

                                int width) {

   asm volatile (

+    YUVTORGB_SETUP_AVX2(yuvconstants)

     "sub       %[u_buf],%[v_buf]               \n"

-    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"

+    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"

     LABELALIGN

   "1:                                          \n"

-    READYUV422_AVX2

-    YUVTORGB_AVX2(kYuvConstants)

-    // Step 3: Weave into BGRA

-    "vpunpcklbw %%ymm0,%%ymm1,%%ymm1           \n"  // GB

-    "vpermq     $0xd8,%%ymm1,%%ymm1            \n"

-    "vpunpcklbw %%ymm2,%%ymm5,%%ymm2           \n"  // AR

-    "vpermq     $0xd8,%%ymm2,%%ymm2            \n"

-    "vpunpcklwd %%ymm1,%%ymm2,%%ymm0           \n"  // ARGB first 8 pixels

-    "vpunpckhwd %%ymm1,%%ymm2,%%ymm2           \n"  // ARGB next 8 pixels

-    "vmovdqu    %%ymm0," MEMACCESS([dst_bgra]) "\n"

-    "vmovdqu    %%ymm2," MEMACCESS2(0x20,[dst_bgra]) "\n"

-    "lea       " MEMLEA(0x40,[dst_bgra]) ",%[dst_bgra] \n"

+    READYUV444_AVX2

+    YUVTORGB_AVX2(yuvconstants)

+    STOREARGB_AVX2

     "sub       $0x10,%[width]                  \n"

     "jg        1b                              \n"

     "vzeroupper                                \n"

@@ -2024,42 +2154,33 @@

   : [y_buf]"+r"(y_buf),    // %[y_buf]

     [u_buf]"+r"(u_buf),    // %[u_buf]

     [v_buf]"+r"(v_buf),    // %[v_buf]

-    [dst_bgra]"+r"(dst_bgra),  // %[dst_bgra]

+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]

     [width]"+rm"(width)    // %[width]

-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)  // %[kYuvConstants]

-  : "memory", "cc", NACL_R14

-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"

+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]

+  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2

+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

);

-#endif  // HAS_I422TOBGRAROW_AVX2

+#endif  // HAS_I444TOARGBROW_AVX2

-#if defined(HAS_I422TOARGBROW_AVX2)

+#ifdef HAS_I411TOARGBROW_AVX2

 // 16 pixels

-// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).

-void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,

+// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).

+void OMITFP I411ToARGBRow_AVX2(const uint8* y_buf,

                                const uint8* u_buf,

                                const uint8* v_buf,

                                uint8* dst_argb,

+                               const struct YuvConstants* yuvconstants,

                                int width) {

   asm volatile (

+    YUVTORGB_SETUP_AVX2(yuvconstants)

     "sub       %[u_buf],%[v_buf]               \n"

-    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"

+    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"

     LABELALIGN

   "1:                                          \n"

-    READYUV422_AVX2

-    YUVTORGB_AVX2(kYuvConstants)

-    // Step 3: Weave into ARGB

-    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0           \n"  // BG

-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"

-    "vpunpcklbw %%ymm5,%%ymm2,%%ymm2           \n"  // RA

-    "vpermq     $0xd8,%%ymm2,%%ymm2            \n"

-    "vpunpcklwd %%ymm2,%%ymm0,%%ymm1           \n"  // BGRA first 8 pixels

-    "vpunpckhwd %%ymm2,%%ymm0,%%ymm0           \n"  // BGRA next 8 pixels

-    "vmovdqu    %%ymm1," MEMACCESS([dst_argb]) "\n"

-    "vmovdqu    %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n"

-    "lea       " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"

+    READYUV411_AVX2

+    YUVTORGB_AVX2(yuvconstants)

+    STOREARGB_AVX2

     "sub       $0x10,%[width]                  \n"

     "jg        1b                              \n"

     "vzeroupper                                \n"

@@ -2068,40 +2189,31 @@

     [v_buf]"+r"(v_buf),    // %[v_buf]

     [dst_argb]"+r"(dst_argb),  // %[dst_argb]

     [width]"+rm"(width)    // %[width]

-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)  // %[kYuvConstants]

-  : "memory", "cc", NACL_R14

-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"

+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]

+  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2

+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

);

-#endif  // HAS_I422TOARGBROW_AVX2

+#endif  // HAS_I411TOARGBROW_AVX2

-#if defined(HAS_J422TOARGBROW_AVX2)

+#if defined(HAS_I422TOARGBROW_AVX2)

 // 16 pixels

 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).

-void OMITFP J422ToARGBRow_AVX2(const uint8* y_buf,

+void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,

                                const uint8* u_buf,

                                const uint8* v_buf,

                                uint8* dst_argb,

+                               const struct YuvConstants* yuvconstants,

                                int width) {

   asm volatile (

+    YUVTORGB_SETUP_AVX2(yuvconstants)

     "sub       %[u_buf],%[v_buf]               \n"

-    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"

+    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"

     LABELALIGN

   "1:                                          \n"

     READYUV422_AVX2

-    YUVTORGB_AVX2(kYuvConstants)

-    // Step 3: Weave into ARGB

-    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0           \n"  // BG

-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"

-    "vpunpcklbw %%ymm5,%%ymm2,%%ymm2           \n"  // RA

-    "vpermq     $0xd8,%%ymm2,%%ymm2            \n"

-    "vpunpcklwd %%ymm2,%%ymm0,%%ymm1           \n"  // BGRA first 8 pixels

-    "vpunpckhwd %%ymm2,%%ymm0,%%ymm0           \n"  // BGRA next 8 pixels

-    "vmovdqu    %%ymm1," MEMACCESS([dst_argb]) "\n"

-    "vmovdqu    %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n"

-    "lea       " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"

+    YUVTORGB_AVX2(yuvconstants)

+    STOREARGB_AVX2

     "sub       $0x10,%[width]                  \n"

     "jg        1b                              \n"

     "vzeroupper                                \n"

@@ -2110,53 +2222,50 @@

     [v_buf]"+r"(v_buf),    // %[v_buf]

     [dst_argb]"+r"(dst_argb),  // %[dst_argb]

     [width]"+rm"(width)    // %[width]

-  : [kYuvConstants]"r"(&kYuvJConstants.kUVToB)  // %[kYuvConstants]

-  : "memory", "cc", NACL_R14

-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"

+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]

+  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2

+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

);

-#endif  // HAS_J422TOARGBROW_AVX2

+#endif  // HAS_I422TOARGBROW_AVX2

-#if defined(HAS_I422TOABGRROW_AVX2)

+#if defined(HAS_I422ALPHATOARGBROW_AVX2)

 // 16 pixels

-// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).

-void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf,

+// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.

+void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf,

                                const uint8* u_buf,

                                const uint8* v_buf,

+                               const uint8* a_buf,

                                uint8* dst_argb,

+                               const struct YuvConstants* yuvconstants,

                                int width) {

   asm volatile (

+    YUVTORGB_SETUP_AVX2(yuvconstants)

     "sub       %[u_buf],%[v_buf]               \n"

-    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"

     LABELALIGN

   "1:                                          \n"

-    READYUV422_AVX2

-    YUVTORGB_AVX2(kYuvConstants)

-    // Step 3: Weave into ABGR

-    "vpunpcklbw %%ymm1,%%ymm2,%%ymm1           \n"  // RG

-    "vpermq     $0xd8,%%ymm1,%%ymm1            \n"

-    "vpunpcklbw %%ymm5,%%ymm0,%%ymm2           \n"  // BA

-    "vpermq     $0xd8,%%ymm2,%%ymm2            \n"

-    "vpunpcklwd %%ymm2,%%ymm1,%%ymm0           \n"  // RGBA first 8 pixels

-    "vpunpckhwd %%ymm2,%%ymm1,%%ymm1           \n"  // RGBA next 8 pixels

-    "vmovdqu    %%ymm0," MEMACCESS([dst_argb]) "\n"

-    "vmovdqu    %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"

-    "lea       " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"

-    "sub       $0x10,%[width]                  \n"

+    READYUVA422_AVX2

+    YUVTORGB_AVX2(yuvconstants)

+    STOREARGB_AVX2

+    "subl      $0x10,%[width]                  \n"

     "jg        1b                              \n"

     "vzeroupper                                \n"

   : [y_buf]"+r"(y_buf),    // %[y_buf]

     [u_buf]"+r"(u_buf),    // %[u_buf]

     [v_buf]"+r"(v_buf),    // %[v_buf]

+    [a_buf]"+r"(a_buf),    // %[a_buf]

     [dst_argb]"+r"(dst_argb),  // %[dst_argb]

+#if defined(__i386__) && defined(__pic__)

+    [width]"+m"(width)     // %[width]

+#else

     [width]"+rm"(width)    // %[width]

-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)  // %[kYuvConstants]

-  : "memory", "cc", NACL_R14

-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"

+#endif

+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]

+  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2

+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

);

-#endif  // HAS_I422TOABGRROW_AVX2

+#endif  // HAS_I422ALPHATOARGBROW_AVX2

 #if defined(HAS_I422TORGBAROW_AVX2)

 // 16 pixels

@@ -2165,14 +2274,16 @@

                                const uint8* u_buf,

                                const uint8* v_buf,

                                uint8* dst_argb,

+                               const struct YuvConstants* yuvconstants,

                                int width) {

   asm volatile (

+    YUVTORGB_SETUP_AVX2(yuvconstants)

     "sub       %[u_buf],%[v_buf]               \n"

     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"

     LABELALIGN

   "1:                                          \n"

     READYUV422_AVX2

-    YUVTORGB_AVX2(kYuvConstants)

+    YUVTORGB_AVX2(yuvconstants)

     // Step 3: Weave into RGBA

     "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"

@@ -2192,13 +2303,134 @@

     [v_buf]"+r"(v_buf),    // %[v_buf]

     [dst_argb]"+r"(dst_argb),  // %[dst_argb]

     [width]"+rm"(width)    // %[width]

-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)  // %[kYuvConstants]

-  : "memory", "cc", NACL_R14

-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"

+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]

+  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2

+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

);

 #endif  // HAS_I422TORGBAROW_AVX2

+#if defined(HAS_NV12TOARGBROW_AVX2)

+// Converts a row of NV12 (Y plane + interleaved UV) to ARGB, 16 pixels/loop.

+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).

+void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf,

+                               const uint8* uv_buf,

+                               uint8* dst_argb,

+                               const struct YuvConstants* yuvconstants,

+                               int width) {

+  asm volatile (

+    YUVTORGB_SETUP_AVX2(yuvconstants)

+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"  // ymm5 = all ones.

+    LABELALIGN

+  "1:                                          \n"

+    READNV12_AVX2

+    YUVTORGB_AVX2(yuvconstants)

+    STOREARGB_AVX2

+    "sub       $0x10,%[width]                  \n"  // 16 pixels per pass.

+    "jg        1b                              \n"

+    "vzeroupper                                \n"

+  : [y_buf]"+r"(y_buf),    // %[y_buf]

+    [uv_buf]"+r"(uv_buf),    // %[uv_buf]

+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]

+    [width]"+rm"(width)    // %[width]

+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]

+    : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.

+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

+  );

+}

+#endif  // HAS_NV12TOARGBROW_AVX2

+#if defined(HAS_NV21TOARGBROW_AVX2)

+// Converts a row of NV21 (Y plane + interleaved VU) to ARGB, 16 pixels/loop.

+// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).

+void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf,

+                               const uint8* vu_buf,

+                               uint8* dst_argb,

+                               const struct YuvConstants* yuvconstants,

+                               int width) {

+  asm volatile (

+    YUVTORGB_SETUP_AVX2(yuvconstants)

+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"  // ymm5 = all ones.

+    LABELALIGN

+  "1:                                          \n"

+    READNV21_AVX2

+    YUVTORGB_AVX2(yuvconstants)

+    STOREARGB_AVX2

+    "sub       $0x10,%[width]                  \n"  // 16 pixels per pass.

+    "jg        1b                              \n"

+    "vzeroupper                                \n"

+  : [y_buf]"+r"(y_buf),    // %[y_buf]

+    [vu_buf]"+r"(vu_buf),    // %[vu_buf]

+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]

+    [width]"+rm"(width)    // %[width]

+  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]

+    [kShuffleNV21]"m"(kShuffleNV21)

+    : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.

+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

+  );

+}

+#endif  // HAS_NV21TOARGBROW_AVX2

+#if defined(HAS_YUY2TOARGBROW_AVX2)

+// Converts a row of packed YUY2 to ARGB, 16 pixels per loop iteration.

+// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).

+void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf,

+                               uint8* dst_argb,

+                               const struct YuvConstants* yuvconstants,

+                               int width) {

+  asm volatile (

+    YUVTORGB_SETUP_AVX2(yuvconstants)

+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"  // ymm5 = all ones.

+    LABELALIGN

+  "1:                                          \n"

+    READYUY2_AVX2

+    YUVTORGB_AVX2(yuvconstants)

+    STOREARGB_AVX2

+    "sub       $0x10,%[width]                  \n"  // 16 pixels per pass.

+    "jg        1b                              \n"

+    "vzeroupper                                \n"

+  : [yuy2_buf]"+r"(yuy2_buf),    // %[yuy2_buf]

+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]

+    [width]"+rm"(width)    // %[width]

+  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]

+    [kShuffleYUY2Y]"m"(kShuffleYUY2Y),

+    [kShuffleYUY2UV]"m"(kShuffleYUY2UV)

+    : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.

+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

+  );

+}

+#endif  // HAS_YUY2TOARGBROW_AVX2

+#if defined(HAS_UYVYTOARGBROW_AVX2)

+// Converts a row of packed UYVY to ARGB, 16 pixels per loop iteration.

+// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).

+void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf,

+                               uint8* dst_argb,

+                               const struct YuvConstants* yuvconstants,

+                               int width) {

+  asm volatile (

+    YUVTORGB_SETUP_AVX2(yuvconstants)

+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"  // ymm5 = all ones.

+    LABELALIGN

+  "1:                                          \n"

+    READUYVY_AVX2

+    YUVTORGB_AVX2(yuvconstants)

+    STOREARGB_AVX2

+    "sub       $0x10,%[width]                  \n"  // 16 pixels per pass.

+    "jg        1b                              \n"

+    "vzeroupper                                \n"

+  : [uyvy_buf]"+r"(uyvy_buf),    // %[uyvy_buf]

+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]

+    [width]"+rm"(width)    // %[width]

+  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]

+    [kShuffleUYVYY]"m"(kShuffleUYVYY),

+    [kShuffleUYVYUV]"m"(kShuffleUYVYUV)

+    : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.

+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

+  );

+}

+#endif  // HAS_UYVYTOARGBROW_AVX2

 #ifdef HAS_I400TOARGBROW_SSE2

 void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {

   asm volatile (

@@ -2344,35 +2576,7 @@

 #endif  // HAS_MIRRORROW_AVX2

-#ifdef HAS_MIRRORROW_SSE2

-void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {

-  intptr_t temp_width = (intptr_t)(width);

-  asm volatile (

-    LABELALIGN

-  "1:                                          \n"

-    MEMOPREG(movdqu,-0x10,0,2,1,xmm0)          //  movdqu -0x10(%0,%2),%%xmm0

-    "movdqa    %%xmm0,%%xmm1                   \n"

-    "psllw     $0x8,%%xmm0                     \n"

-    "psrlw     $0x8,%%xmm1                     \n"

-    "por       %%xmm1,%%xmm0                   \n"

-    "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"

-    "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"

-    "pshufd    $0x4e,%%xmm0,%%xmm0             \n"

-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"

-    "lea       " MEMLEA(0x10,1)",%1            \n"

-    "sub       $0x10,%2                        \n"

-    "jg        1b                              \n"

-  : "+r"(src),  // %0

-    "+r"(dst),  // %1

-    "+r"(temp_width)  // %2

-  :

-  : "memory", "cc", NACL_R14

-    "xmm0", "xmm1"

-  );

-}

-#endif  // HAS_MIRRORROW_SSE2

-#ifdef HAS_MIRRORROW_UV_SSSE3

+#ifdef HAS_MIRRORUVROW_SSSE3

 // Shuffle table for reversing the bytes of UV channels.

 static uvec8 kShuffleMirrorUV = {

   14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u

@@ -2403,7 +2607,7 @@

     "xmm0", "xmm1"

);

-#endif  // HAS_MIRRORROW_UV_SSSE3

+#endif  // HAS_MIRRORUVROW_SSSE3

 #ifdef HAS_ARGBMIRRORROW_SSE2

@@ -2458,7 +2662,8 @@

 #endif  // HAS_ARGBMIRRORROW_AVX2

 #ifdef HAS_SPLITUVROW_AVX2

-void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {

+void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,

+                     int width) {

   asm volatile (

     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5             \n"

     "vpsrlw     $0x8,%%ymm5,%%ymm5               \n"

@@ -2485,7 +2690,7 @@

   : "+r"(src_uv),     // %0

     "+r"(dst_u),      // %1

     "+r"(dst_v),      // %2

-    "+r"(pix)         // %3

+    "+r"(width)         // %3

   : "memory", "cc", NACL_R14

     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"

@@ -2494,7 +2699,8 @@

 #endif  // HAS_SPLITUVROW_AVX2

 #ifdef HAS_SPLITUVROW_SSE2

-void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {

+void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,

+                     int width) {

   asm volatile (

     "pcmpeqb    %%xmm5,%%xmm5                    \n"

     "psrlw      $0x8,%%xmm5                      \n"

@@ -2520,7 +2726,7 @@

   : "+r"(src_uv),     // %0

     "+r"(dst_u),      // %1

     "+r"(dst_v),      // %2

-    "+r"(pix)         // %3

+    "+r"(width)         // %3

   : "memory", "cc", NACL_R14

     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"

@@ -2591,8 +2797,23 @@

 #ifdef HAS_COPYROW_SSE2

 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {

   asm volatile (

+    "test       $0xf,%0                        \n"

+    "jne        2f                             \n"

+    "test       $0xf,%1                        \n"

+    "jne        2f                             \n"

     LABELALIGN

   "1:                                          \n"

+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"

+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"

+    "lea       " MEMLEA(0x20,0) ",%0           \n"

+    "movdqa    %%xmm0," MEMACCESS(1) "         \n"

+    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"

+    "lea       " MEMLEA(0x20,1) ",%1           \n"

+    "sub       $0x20,%2                        \n"

+    "jg        1b                              \n"

+    "jmp       9f                              \n"

+    LABELALIGN

+  "2:                                          \n"

     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"

     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"

     "lea       " MEMLEA(0x20,0) ",%0           \n"

@@ -2600,7 +2821,8 @@

     "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"

     "lea       " MEMLEA(0x20,1) ",%1           \n"

     "sub       $0x20,%2                        \n"

-    "jg        1b                              \n"

+    "jg        2b                              \n"

+  "9:                                          \n"

   : "+r"(src),   // %0

     "+r"(dst),   // %1

     "+r"(count)  // %2

@@ -2714,6 +2936,33 @@

 #endif  // HAS_ARGBCOPYALPHAROW_AVX2

+#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2

+// width in pixels

+void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {

+ asm volatile (

+    LABELALIGN

+  "1:                                          \n"

+    "movdqu    " MEMACCESS(0) ", %%xmm0        \n"

+    "movdqu    " MEMACCESS2(0x10, 0) ", %%xmm1 \n"

+    "lea       " MEMLEA(0x20, 0) ", %0         \n"

+    "psrld     $0x18, %%xmm0                   \n"

+    "psrld     $0x18, %%xmm1                   \n"

+    "packssdw  %%xmm1, %%xmm0                  \n"

+    "packuswb  %%xmm0, %%xmm0                  \n"

+    "movq      %%xmm0," MEMACCESS(1) "         \n"

+    "lea       " MEMLEA(0x8, 1) ", %1          \n"

+    "sub       $0x8, %2                        \n"

+    "jg        1b                              \n"

+  : "+r"(src_argb),  // %0

+    "+r"(dst_a),     // %1

+    "+rm"(width)     // %2

+  :

+  : "memory", "cc"

+    , "xmm0", "xmm1"

+  );

+}

+#endif  // HAS_ARGBEXTRACTALPHAROW_SSE2

 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2

 // width in pixels

 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {

@@ -2786,7 +3035,7 @@

 #ifdef HAS_SETROW_X86

 void SetRow_X86(uint8* dst, uint8 v8, int width) {

   size_t width_tmp = (size_t)(width >> 2);

-  const uint32 v32 = v8 * 0x01010101;  // Duplicate byte to all bytes.

+  const uint32 v32 = v8 * 0x01010101u;  // Duplicate byte to all bytes.

   asm volatile (

     "rep stosl " MEMSTORESTRING(eax,0) "       \n"

     : "+D"(dst),       // %0

@@ -2817,7 +3066,7 @@

 #endif  // HAS_SETROW_X86

 #ifdef HAS_YUY2TOYROW_SSE2

-void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {

+void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width) {

   asm volatile (

     "pcmpeqb   %%xmm5,%%xmm5                   \n"

     "psrlw     $0x8,%%xmm5                     \n"

@@ -2835,7 +3084,7 @@

     "jg        1b                              \n"

   : "+r"(src_yuy2),  // %0

     "+r"(dst_y),     // %1

-    "+r"(pix)        // %2

+    "+r"(width)        // %2

   : "memory", "cc"

     , "xmm0", "xmm1", "xmm5"

@@ -2843,7 +3092,7 @@

 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,

-                      uint8* dst_u, uint8* dst_v, int pix) {

+                      uint8* dst_u, uint8* dst_v, int width) {

   asm volatile (

     "pcmpeqb   %%xmm5,%%xmm5                   \n"

     "psrlw     $0x8,%%xmm5                     \n"

@@ -2873,7 +3122,7 @@

   : "+r"(src_yuy2),    // %0

     "+r"(dst_u),       // %1

     "+r"(dst_v),       // %2

-    "+r"(pix)          // %3

+    "+r"(width)          // %3

   : "r"((intptr_t)(stride_yuy2))  // %4

   : "memory", "cc", NACL_R14

     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"

@@ -2881,7 +3130,7 @@

 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,

-                         uint8* dst_u, uint8* dst_v, int pix) {

+                         uint8* dst_u, uint8* dst_v, int width) {

   asm volatile (

     "pcmpeqb   %%xmm5,%%xmm5                   \n"

     "psrlw     $0x8,%%xmm5                     \n"

@@ -2907,7 +3156,7 @@

   : "+r"(src_yuy2),    // %0

     "+r"(dst_u),       // %1

     "+r"(dst_v),       // %2

-    "+r"(pix)          // %3

+    "+r"(width)          // %3

   : "memory", "cc", NACL_R14

     "xmm0", "xmm1", "xmm5"

@@ -2914,7 +3163,7 @@

);

-void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {

+void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width) {

   asm volatile (

     LABELALIGN

   "1:                                          \n"

@@ -2930,7 +3179,7 @@

     "jg        1b                              \n"

   : "+r"(src_uyvy),  // %0

     "+r"(dst_y),     // %1

-    "+r"(pix)        // %2

+    "+r"(width)        // %2

   : "memory", "cc"

     , "xmm0", "xmm1"

@@ -2938,7 +3187,7 @@

 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,

-                      uint8* dst_u, uint8* dst_v, int pix) {

+                      uint8* dst_u, uint8* dst_v, int width) {

   asm volatile (

     "pcmpeqb   %%xmm5,%%xmm5                   \n"

     "psrlw     $0x8,%%xmm5                     \n"

@@ -2968,7 +3217,7 @@

   : "+r"(src_uyvy),    // %0

     "+r"(dst_u),       // %1

     "+r"(dst_v),       // %2

-    "+r"(pix)          // %3

+    "+r"(width)          // %3

   : "r"((intptr_t)(stride_uyvy))  // %4

   : "memory", "cc", NACL_R14

     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"

@@ -2976,7 +3225,7 @@

 void UYVYToUV422Row_SSE2(const uint8* src_uyvy,

-                         uint8* dst_u, uint8* dst_v, int pix) {

+                         uint8* dst_u, uint8* dst_v, int width) {

   asm volatile (

     "pcmpeqb   %%xmm5,%%xmm5                   \n"

     "psrlw     $0x8,%%xmm5                     \n"

@@ -3002,7 +3251,7 @@

   : "+r"(src_uyvy),    // %0

     "+r"(dst_u),       // %1

     "+r"(dst_v),       // %2

-    "+r"(pix)          // %3

+    "+r"(width)          // %3

   : "memory", "cc", NACL_R14

     "xmm0", "xmm1", "xmm5"

@@ -3011,7 +3260,7 @@

 #endif  // HAS_YUY2TOYROW_SSE2

 #ifdef HAS_YUY2TOYROW_AVX2

-void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix) {

+void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {

   asm volatile (

     "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"

     "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"

@@ -3031,7 +3280,7 @@

     "vzeroupper                                \n"

   : "+r"(src_yuy2),  // %0

     "+r"(dst_y),     // %1

-    "+r"(pix)        // %2

+    "+r"(width)        // %2

   : "memory", "cc"

     , "xmm0", "xmm1", "xmm5"

@@ -3039,7 +3288,7 @@

 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,

-                      uint8* dst_u, uint8* dst_v, int pix) {

+                      uint8* dst_u, uint8* dst_v, int width) {

   asm volatile (

     "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"

     "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"

@@ -3070,7 +3319,7 @@

   : "+r"(src_yuy2),    // %0

     "+r"(dst_u),       // %1

     "+r"(dst_v),       // %2

-    "+r"(pix)          // %3

+    "+r"(width)          // %3

   : "r"((intptr_t)(stride_yuy2))  // %4

   : "memory", "cc", NACL_R14

     "xmm0", "xmm1", "xmm5"

@@ -3078,7 +3327,7 @@

 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,

-                         uint8* dst_u, uint8* dst_v, int pix) {

+                         uint8* dst_u, uint8* dst_v, int width) {

   asm volatile (

     "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"

     "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"

@@ -3107,7 +3356,7 @@

   : "+r"(src_yuy2),    // %0

     "+r"(dst_u),       // %1

     "+r"(dst_v),       // %2

-    "+r"(pix)          // %3

+    "+r"(width)          // %3

   : "memory", "cc", NACL_R14

     "xmm0", "xmm1", "xmm5"

@@ -3114,7 +3363,7 @@

);

-void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix) {

+void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width) {

   asm volatile (

     LABELALIGN

   "1:                                          \n"

@@ -3132,7 +3381,7 @@

     "vzeroupper                                \n"

   : "+r"(src_uyvy),  // %0

     "+r"(dst_y),     // %1

-    "+r"(pix)        // %2

+    "+r"(width)        // %2

   : "memory", "cc"

     , "xmm0", "xmm1", "xmm5"

@@ -3139,7 +3388,7 @@

);

 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,

-                      uint8* dst_u, uint8* dst_v, int pix) {

+                      uint8* dst_u, uint8* dst_v, int width) {

   asm volatile (

     "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"

     "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"

@@ -3171,7 +3420,7 @@

   : "+r"(src_uyvy),    // %0

     "+r"(dst_u),       // %1

     "+r"(dst_v),       // %2

-    "+r"(pix)          // %3

+    "+r"(width)          // %3

   : "r"((intptr_t)(stride_uyvy))  // %4

   : "memory", "cc", NACL_R14

     "xmm0", "xmm1", "xmm5"

@@ -3179,7 +3428,7 @@

 void UYVYToUV422Row_AVX2(const uint8* src_uyvy,

-                         uint8* dst_u, uint8* dst_v, int pix) {

+                         uint8* dst_u, uint8* dst_v, int width) {

   asm volatile (

     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"

     "vpsrlw     $0x8,%%ymm5,%%ymm5             \n"

@@ -3208,7 +3457,7 @@

   : "+r"(src_uyvy),    // %0

     "+r"(dst_u),       // %1

     "+r"(dst_v),       // %2

-    "+r"(pix)          // %3

+    "+r"(width)          // %3

   : "memory", "cc", NACL_R14

     "xmm0", "xmm1", "xmm5"

@@ -3216,92 +3465,6 @@

 #endif  // HAS_YUY2TOYROW_AVX2

-#ifdef HAS_ARGBBLENDROW_SSE2

-// Blend 8 pixels at a time.

-void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,

-                       uint8* dst_argb, int width) {

-  asm volatile (

-    "pcmpeqb   %%xmm7,%%xmm7                   \n"

-    "psrlw     $0xf,%%xmm7                     \n"

-    "pcmpeqb   %%xmm6,%%xmm6                   \n"

-    "psrlw     $0x8,%%xmm6                     \n"

-    "pcmpeqb   %%xmm5,%%xmm5                   \n"

-    "psllw     $0x8,%%xmm5                     \n"

-    "pcmpeqb   %%xmm4,%%xmm4                   \n"

-    "pslld     $0x18,%%xmm4                    \n"

-    "sub       $0x4,%3                         \n"

-    "jl        49f                             \n"

-    // 4 pixel loop.

-    LABELALIGN

-  "41:                                         \n"

-    "movdqu    " MEMACCESS(0) ",%%xmm3         \n"

-    "lea       " MEMLEA(0x10,0) ",%0           \n"

-    "movdqa    %%xmm3,%%xmm0                   \n"

-    "pxor      %%xmm4,%%xmm3                   \n"

-    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"

-    "psrlw     $0x8,%%xmm3                     \n"

-    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"

-    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"

-    "pand      %%xmm6,%%xmm2                   \n"

-    "paddw     %%xmm7,%%xmm3                   \n"

-    "pmullw    %%xmm3,%%xmm2                   \n"

-    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"

-    "lea       " MEMLEA(0x10,1) ",%1           \n"

-    "psrlw     $0x8,%%xmm1                     \n"

-    "por       %%xmm4,%%xmm0                   \n"

-    "pmullw    %%xmm3,%%xmm1                   \n"

-    "psrlw     $0x8,%%xmm2                     \n"

-    "paddusb   %%xmm2,%%xmm0                   \n"

-    "pand      %%xmm5,%%xmm1                   \n"

-    "paddusb   %%xmm1,%%xmm0                   \n"

-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"

-    "lea       " MEMLEA(0x10,2) ",%2           \n"

-    "sub       $0x4,%3                         \n"

-    "jge       41b                             \n"

-  "49:                                         \n"

-    "add       $0x3,%3                         \n"

-    "jl        99f                             \n"

-    // 1 pixel loop.

-  "91:                                         \n"

-    "movd      " MEMACCESS(0) ",%%xmm3         \n"

-    "lea       " MEMLEA(0x4,0) ",%0            \n"

-    "movdqa    %%xmm3,%%xmm0                   \n"

-    "pxor      %%xmm4,%%xmm3                   \n"

-    "movd      " MEMACCESS(1) ",%%xmm2         \n"

-    "psrlw     $0x8,%%xmm3                     \n"

-    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"

-    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"

-    "pand      %%xmm6,%%xmm2                   \n"

-    "paddw     %%xmm7,%%xmm3                   \n"

-    "pmullw    %%xmm3,%%xmm2                   \n"

-    "movd      " MEMACCESS(1) ",%%xmm1         \n"

-    "lea       " MEMLEA(0x4,1) ",%1            \n"

-    "psrlw     $0x8,%%xmm1                     \n"

-    "por       %%xmm4,%%xmm0                   \n"

-    "pmullw    %%xmm3,%%xmm1                   \n"

-    "psrlw     $0x8,%%xmm2                     \n"

-    "paddusb   %%xmm2,%%xmm0                   \n"

-    "pand      %%xmm5,%%xmm1                   \n"

-    "paddusb   %%xmm1,%%xmm0                   \n"

-    "movd      %%xmm0," MEMACCESS(2) "         \n"

-    "lea       " MEMLEA(0x4,2) ",%2            \n"

-    "sub       $0x1,%3                         \n"

-    "jge       91b                             \n"

-  "99:                                         \n"

-  : "+r"(src_argb0),    // %0

-    "+r"(src_argb1),    // %1

-    "+r"(dst_argb),     // %2

-    "+r"(width)         // %3

-  :

-  : "memory", "cc"

-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"

-  );

-}

-#endif  // HAS_ARGBBLENDROW_SSE2

 #ifdef HAS_ARGBBLENDROW_SSSE3

 // Shuffle table for isolating alpha.

 static uvec8 kShuffleAlpha = {

@@ -3310,15 +3473,6 @@

};

 // Blend 8 pixels at a time

-// Shuffle table for reversing the bytes.

-// Same as SSE2, but replaces

-//    psrlw      xmm3, 8          // alpha

-//    pshufhw    xmm3, xmm3,0F5h  // 8 alpha words

-//    pshuflw    xmm3, xmm3,0F5h

-// with..

-//    pshufb     xmm3, kShuffleAlpha // alpha

 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,

                         uint8* dst_argb, int width) {

   asm volatile (

@@ -3399,50 +3553,113 @@

 #endif  // HAS_ARGBBLENDROW_SSSE3

-#ifdef HAS_ARGBATTENUATEROW_SSE2

-// Attenuate 4 pixels at a time.

-void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {

+#ifdef HAS_BLENDPLANEROW_SSSE3

+// Blend 8 pixels at a time.

+// unsigned version of math

+// =((A2*C2)+(B2*(255-C2))+255)/256

+// signed version of math

+// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256

+void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,

+                         const uint8* alpha, uint8* dst, int width) {

   asm volatile (

-    "pcmpeqb   %%xmm4,%%xmm4                   \n"

-    "pslld     $0x18,%%xmm4                    \n"

-    "pcmpeqb   %%xmm5,%%xmm5                   \n"

-    "psrld     $0x8,%%xmm5                     \n"

+    "pcmpeqb    %%xmm5,%%xmm5                  \n"

+    "psllw      $0x8,%%xmm5                    \n"

+    "mov        $0x80808080,%%eax              \n"

+    "movd       %%eax,%%xmm6                   \n"

+    "pshufd     $0x0,%%xmm6,%%xmm6             \n"

+    "mov        $0x807f807f,%%eax              \n"

+    "movd       %%eax,%%xmm7                   \n"

+    "pshufd     $0x0,%%xmm7,%%xmm7             \n"

+    "sub        %2,%0                          \n"

+    "sub        %2,%1                          \n"

+    "sub        %2,%3                          \n"

-    // 4 pixel loop.

+    // 8 pixel loop.

     LABELALIGN

   "1:                                          \n"

-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"

-    "punpcklbw %%xmm0,%%xmm0                   \n"

-    "pshufhw   $0xff,%%xmm0,%%xmm2             \n"

-    "pshuflw   $0xff,%%xmm2,%%xmm2             \n"

-    "pmulhuw   %%xmm2,%%xmm0                   \n"

-    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"

-    "punpckhbw %%xmm1,%%xmm1                   \n"

-    "pshufhw   $0xff,%%xmm1,%%xmm2             \n"

-    "pshuflw   $0xff,%%xmm2,%%xmm2             \n"

-    "pmulhuw   %%xmm2,%%xmm1                   \n"

-    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"

-    "lea       " MEMLEA(0x10,0) ",%0           \n"

-    "psrlw     $0x8,%%xmm0                     \n"

-    "pand      %%xmm4,%%xmm2                   \n"

-    "psrlw     $0x8,%%xmm1                     \n"

-    "packuswb  %%xmm1,%%xmm0                   \n"

-    "pand      %%xmm5,%%xmm0                   \n"

-    "por       %%xmm2,%%xmm0                   \n"

-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"

-    "lea       " MEMLEA(0x10,1) ",%1           \n"

-    "sub       $0x4,%2                         \n"

+    "movq       (%2),%%xmm0                    \n"

+    "punpcklbw  %%xmm0,%%xmm0                  \n"

+    "pxor       %%xmm5,%%xmm0                  \n"

+    "movq       (%0,%2,1),%%xmm1               \n"

+    "movq       (%1,%2,1),%%xmm2               \n"

+    "punpcklbw  %%xmm2,%%xmm1                  \n"

+    "psubb      %%xmm6,%%xmm1                  \n"

+    "pmaddubsw  %%xmm1,%%xmm0                  \n"

+    "paddw      %%xmm7,%%xmm0                  \n"

+    "psrlw      $0x8,%%xmm0                    \n"

+    "packuswb   %%xmm0,%%xmm0                  \n"

+    "movq       %%xmm0,(%3,%2,1)               \n"

+    "lea        0x8(%2),%2                     \n"

+    "sub        $0x8,%4                        \n"

     "jg        1b                              \n"

-  : "+r"(src_argb),    // %0

-    "+r"(dst_argb),    // %1

-    "+r"(width)        // %2

-  :

-  : "memory", "cc"

-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

+  : "+r"(src0),       // %0

+    "+r"(src1),       // %1

+    "+r"(alpha),      // %2

+    "+r"(dst),        // %3

+    "+rm"(width)      // %4

+  :: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7"

);

-#endif  // HAS_ARGBATTENUATEROW_SSE2

+#endif  // HAS_BLENDPLANEROW_SSSE3

+#ifdef HAS_BLENDPLANEROW_AVX2

+// Blend 32 pixels at a time.

+// unsigned version of math

+// =((A2*C2)+(B2*(255-C2))+255)/256

+// signed version of math

+// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256

+void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,

+                        const uint8* alpha, uint8* dst, int width) {

+  asm volatile (

+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"

+    "vpsllw     $0x8,%%ymm5,%%ymm5             \n"

+    "mov        $0x80808080,%%eax              \n"

+    "vmovd      %%eax,%%xmm6                   \n"

+    "vbroadcastss %%xmm6,%%ymm6                \n"

+    "mov        $0x807f807f,%%eax              \n"

+    "vmovd      %%eax,%%xmm7                   \n"

+    "vbroadcastss %%xmm7,%%ymm7                \n"

+    "sub        %2,%0                          \n"

+    "sub        %2,%1                          \n"

+    "sub        %2,%3                          \n"

+    // 32 pixel loop.

+    LABELALIGN

+  "1:                                          \n"

+    "vmovdqu    (%2),%%ymm0                    \n"

+    "vpunpckhbw %%ymm0,%%ymm0,%%ymm3           \n"

+    "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"

+    "vpxor      %%ymm5,%%ymm3,%%ymm3           \n"

+    "vpxor      %%ymm5,%%ymm0,%%ymm0           \n"

+    "vmovdqu    (%0,%2,1),%%ymm1               \n"

+    "vmovdqu    (%1,%2,1),%%ymm2               \n"

+    "vpunpckhbw %%ymm2,%%ymm1,%%ymm4           \n"

+    "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"

+    "vpsubb     %%ymm6,%%ymm4,%%ymm4           \n"

+    "vpsubb     %%ymm6,%%ymm1,%%ymm1           \n"

+    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"

+    "vpmaddubsw %%ymm1,%%ymm0,%%ymm0           \n"

+    "vpaddw     %%ymm7,%%ymm3,%%ymm3           \n"

+    "vpaddw     %%ymm7,%%ymm0,%%ymm0           \n"

+    "vpsrlw     $0x8,%%ymm3,%%ymm3             \n"

+    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"

+    "vpackuswb  %%ymm3,%%ymm0,%%ymm0           \n"

+    "vmovdqu    %%ymm0,(%3,%2,1)               \n"

+    "lea        0x20(%2),%2                    \n"

+    "sub        $0x20,%4                       \n"

+    "jg        1b                              \n"

+    "vzeroupper                                \n"

+  : "+r"(src0),       // %0

+    "+r"(src1),       // %1

+    "+r"(alpha),      // %2

+    "+r"(dst),        // %3

+    "+rm"(width)      // %4

+  :: "memory", "cc", "eax",

+     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"

+  );

+}

+#endif  // HAS_BLENDPLANEROW_AVX2

 #ifdef HAS_ARGBATTENUATEROW_SSSE3

 // Shuffle table duplicating alpha

 static uvec8 kShuffleAlpha0 = {

@@ -3542,7 +3759,7 @@

 // Unattenuate 4 pixels at a time.

 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,

                              int width) {

-  uintptr_t alpha = 0;

+  uintptr_t alpha;

   asm volatile (

     // 4 pixel loop.

     LABELALIGN

@@ -3573,10 +3790,10 @@

     "lea       " MEMLEA(0x10,1) ",%1           \n"

     "sub       $0x4,%2                         \n"

     "jg        1b                              \n"

-  : "+r"(src_argb),    // %0

-    "+r"(dst_argb),    // %1

-    "+r"(width),       // %2

-    "+r"(alpha)        // %3

+  : "+r"(src_argb),     // %0

+    "+r"(dst_argb),     // %1

+    "+r"(width),        // %2

+    "=&r"(alpha)        // %3

   : "r"(fixed_invtbl8)  // %4

   : "memory", "cc", NACL_R14

     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

@@ -3592,7 +3809,7 @@

 // Unattenuate 8 pixels at a time.

 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,

                              int width) {

-  uintptr_t alpha = 0;

+  uintptr_t alpha;

   asm volatile (

     "sub        %0,%1                          \n"

     "vbroadcastf128 %5,%%ymm5                  \n"

@@ -3641,10 +3858,10 @@

     "sub        $0x8,%2                        \n"

     "jg        1b                              \n"

     "vzeroupper                                \n"

-  : "+r"(src_argb),    // %0

-    "+r"(dst_argb),    // %1

-    "+r"(width),       // %2

-    "+r"(alpha)        // %3

+  : "+r"(src_argb),      // %0

+    "+r"(dst_argb),      // %1

+    "+r"(width),         // %2

+    "=&r"(alpha)         // %3

   : "r"(fixed_invtbl8),  // %4

     "m"(kUnattenShuffleAlpha_AVX2)  // %5

   : "memory", "cc", NACL_R14

@@ -4569,7 +4786,7 @@

 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,

                         uint8* dst_argb, const float* src_dudv, int width) {

   intptr_t src_argb_stride_temp = src_argb_stride;

-  intptr_t temp = 0;

+  intptr_t temp;

   asm volatile (

     "movq      " MEMACCESS(3) ",%%xmm2         \n"

     "movq      " MEMACCESS2(0x08,3) ",%%xmm7   \n"

@@ -4641,7 +4858,7 @@

     "+r"(dst_argb),  // %2

     "+r"(src_dudv),  // %3

     "+rm"(width),    // %4

-    "+r"(temp)   // %5

+    "=&r"(temp)      // %5

   : "memory", "cc", NACL_R14

     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"

@@ -4656,23 +4873,21 @@

                           int source_y_fraction) {

   asm volatile (

     "sub       %1,%0                           \n"

-    "shr       %3                              \n"

     "cmp       $0x0,%3                         \n"

     "je        100f                            \n"

-    "cmp       $0x20,%3                        \n"

-    "je        75f                             \n"

-    "cmp       $0x40,%3                        \n"

+    "cmp       $0x80,%3                        \n"

     "je        50f                             \n"

-    "cmp       $0x60,%3                        \n"

-    "je        25f                             \n"

     "movd      %3,%%xmm0                       \n"

     "neg       %3                              \n"

-    "add       $0x80,%3                        \n"

+    "add       $0x100,%3                       \n"

     "movd      %3,%%xmm5                       \n"

     "punpcklbw %%xmm0,%%xmm5                   \n"

     "punpcklwd %%xmm5,%%xmm5                   \n"

     "pshufd    $0x0,%%xmm5,%%xmm5              \n"

+    "mov       $0x80808080,%%eax               \n"

+    "movd      %%eax,%%xmm4                    \n"

+    "pshufd    $0x0,%%xmm4,%%xmm4              \n"

     // General purpose row blend.

     LABELALIGN

@@ -4679,33 +4894,26 @@

   "1:                                          \n"

     "movdqu    " MEMACCESS(1) ",%%xmm0         \n"

     MEMOPREG(movdqu,0x00,1,4,1,xmm2)

-    "movdqa    %%xmm0,%%xmm1                   \n"

-    "punpcklbw %%xmm2,%%xmm0                   \n"

-    "punpckhbw %%xmm2,%%xmm1                   \n"

-    "pmaddubsw %%xmm5,%%xmm0                   \n"

-    "pmaddubsw %%xmm5,%%xmm1                   \n"

-    "psrlw     $0x7,%%xmm0                     \n"

-    "psrlw     $0x7,%%xmm1                     \n"

-    "packuswb  %%xmm1,%%xmm0                   \n"

-    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)

+    "movdqa     %%xmm0,%%xmm1                  \n"

+    "punpcklbw  %%xmm2,%%xmm0                  \n"

+    "punpckhbw  %%xmm2,%%xmm1                  \n"

+    "psubb      %%xmm4,%%xmm0                  \n"

+    "psubb      %%xmm4,%%xmm1                  \n"

+    "movdqa     %%xmm5,%%xmm2                  \n"

+    "movdqa     %%xmm5,%%xmm3                  \n"

+    "pmaddubsw  %%xmm0,%%xmm2                  \n"

+    "pmaddubsw  %%xmm1,%%xmm3                  \n"

+    "paddw      %%xmm4,%%xmm2                  \n"

+    "paddw      %%xmm4,%%xmm3                  \n"

+    "psrlw      $0x8,%%xmm2                    \n"

+    "psrlw      $0x8,%%xmm3                    \n"

+    "packuswb   %%xmm3,%%xmm2                  \n"

+    MEMOPMEM(movdqu,xmm2,0x00,1,0,1)

     "lea       " MEMLEA(0x10,1) ",%1           \n"

     "sub       $0x10,%2                        \n"

     "jg        1b                              \n"

     "jmp       99f                             \n"

-    // Blend 25 / 75.

-    LABELALIGN

-  "25:                                         \n"

-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"

-    MEMOPREG(movdqu,0x00,1,4,1,xmm1)

-    "pavgb     %%xmm1,%%xmm0                   \n"

-    "pavgb     %%xmm1,%%xmm0                   \n"

-    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)

-    "lea       " MEMLEA(0x10,1) ",%1           \n"

-    "sub       $0x10,%2                        \n"

-    "jg        25b                             \n"

-    "jmp       99f                             \n"

     // Blend 50 / 50.

     LABELALIGN

   "50:                                         \n"

@@ -4718,19 +4926,6 @@

     "jg        50b                             \n"

     "jmp       99f                             \n"

-    // Blend 75 / 25.

-    LABELALIGN

-  "75:                                         \n"

-    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"

-    MEMOPREG(movdqu,0x00,1,4,1,xmm0)

-    "pavgb     %%xmm1,%%xmm0                   \n"

-    "pavgb     %%xmm1,%%xmm0                   \n"

-    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)

-    "lea       " MEMLEA(0x10,1) ",%1           \n"

-    "sub       $0x10,%2                        \n"

-    "jg        75b                             \n"

-    "jmp       99f                             \n"

     // Blend 100 / 0 - Copy row unchanged.

     LABELALIGN

   "100:                                        \n"

@@ -4741,13 +4936,13 @@

     "jg        100b                            \n"

   "99:                                         \n"

-  : "+r"(dst_ptr),    // %0

-    "+r"(src_ptr),    // %1

-    "+r"(dst_width),  // %2

+  : "+r"(dst_ptr),     // %0

+    "+r"(src_ptr),     // %1

+    "+rm"(dst_width),  // %2

     "+r"(source_y_fraction)  // %3

   : "r"((intptr_t)(src_stride))  // %4

-  : "memory", "cc", NACL_R14

-    "xmm0", "xmm1", "xmm2", "xmm5"

+  : "memory", "cc", "eax", NACL_R14

+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

);

 #endif  // HAS_INTERPOLATEROW_SSSE3

@@ -4758,25 +4953,22 @@

                          ptrdiff_t src_stride, int dst_width,

                          int source_y_fraction) {

   asm volatile (

-    "shr       %3                              \n"

     "cmp       $0x0,%3                         \n"

     "je        100f                            \n"

     "sub       %1,%0                           \n"

-    "cmp       $0x20,%3                        \n"

-    "je        75f                             \n"

-    "cmp       $0x40,%3                        \n"

+    "cmp       $0x80,%3                        \n"

     "je        50f                             \n"

-    "cmp       $0x60,%3                        \n"

-    "je        25f                             \n"

     "vmovd      %3,%%xmm0                      \n"

     "neg        %3                             \n"

-    "add        $0x80,%3                       \n"

+    "add        $0x100,%3                      \n"

     "vmovd      %3,%%xmm5                      \n"

     "vpunpcklbw %%xmm0,%%xmm5,%%xmm5           \n"

     "vpunpcklwd %%xmm5,%%xmm5,%%xmm5           \n"

-    "vpxor      %%ymm0,%%ymm0,%%ymm0           \n"

-    "vpermd     %%ymm5,%%ymm0,%%ymm5           \n"

+    "vbroadcastss %%xmm5,%%ymm5                \n"

+    "mov        $0x80808080,%%eax              \n"

+    "vmovd      %%eax,%%xmm4                   \n"

+    "vbroadcastss %%xmm4,%%ymm4                \n"

     // General purpose row blend.

     LABELALIGN

@@ -4785,10 +4977,14 @@

     MEMOPREG(vmovdqu,0x00,1,4,1,ymm2)

     "vpunpckhbw %%ymm2,%%ymm0,%%ymm1           \n"

     "vpunpcklbw %%ymm2,%%ymm0,%%ymm0           \n"

-    "vpmaddubsw %%ymm5,%%ymm0,%%ymm0           \n"

-    "vpmaddubsw %%ymm5,%%ymm1,%%ymm1           \n"

-    "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"

-    "vpsrlw     $0x7,%%ymm1,%%ymm1             \n"

+    "vpsubb     %%ymm4,%%ymm1,%%ymm1           \n"

+    "vpsubb     %%ymm4,%%ymm0,%%ymm0           \n"

+    "vpmaddubsw %%ymm1,%%ymm5,%%ymm1           \n"

+    "vpmaddubsw %%ymm0,%%ymm5,%%ymm0           \n"

+    "vpaddw     %%ymm4,%%ymm1,%%ymm1           \n"

+    "vpaddw     %%ymm4,%%ymm0,%%ymm0           \n"

+    "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"

+    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"

     "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"

     MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)

     "lea       " MEMLEA(0x20,1) ",%1           \n"

@@ -4796,19 +4992,6 @@

     "jg        1b                              \n"

     "jmp       99f                             \n"

-    // Blend 25 / 75.

-    LABELALIGN

-  "25:                                         \n"

-    "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"

-    MEMOPREG(vmovdqu,0x00,1,4,1,ymm1)

-    "vpavgb     %%ymm1,%%ymm0,%%ymm0           \n"

-    "vpavgb     %%ymm1,%%ymm0,%%ymm0           \n"

-    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)

-    "lea       " MEMLEA(0x20,1) ",%1           \n"

-    "sub       $0x20,%2                        \n"

-    "jg        25b                             \n"

-    "jmp       99f                             \n"

     // Blend 50 / 50.

     LABELALIGN

   "50:                                         \n"

@@ -4820,19 +5003,6 @@

     "jg        50b                             \n"

     "jmp       99f                             \n"

-    // Blend 75 / 25.

-    LABELALIGN

-  "75:                                         \n"

-    "vmovdqu    " MEMACCESS(1) ",%%ymm1        \n"

-    MEMOPREG(vmovdqu,0x00,1,4,1,ymm0)

-    "vpavgb     %%ymm1,%%ymm0,%%ymm0           \n"

-    "vpavgb     %%ymm1,%%ymm0,%%ymm0           \n"

-    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)

-    "lea       " MEMLEA(0x20,1) ",%1           \n"

-    "sub       $0x20,%2                        \n"

-    "jg        75b                             \n"

-    "jmp       99f                             \n"

     // Blend 100 / 0 - Copy row unchanged.

     LABELALIGN

   "100:                                        \n"

@@ -4844,130 +5014,19 @@

   "999:                                        \n"

   : "+D"(dst_ptr),    // %0

     "+S"(src_ptr),    // %1

-    "+c"(dst_width),  // %2

+    "+cm"(dst_width),  // %2

     "+r"(source_y_fraction)  // %3

   : "r"((intptr_t)(src_stride))  // %4

-  : "memory", "cc", NACL_R14

-    "xmm0", "xmm1", "xmm2", "xmm5"

+  : "memory", "cc", "eax", NACL_R14

+    "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"

);

 #endif  // HAS_INTERPOLATEROW_AVX2

-#ifdef HAS_INTERPOLATEROW_SSE2

-// Bilinear filter 16x2 -> 16x1

-void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,

-                         ptrdiff_t src_stride, int dst_width,

-                         int source_y_fraction) {

-  asm volatile (

-    "sub       %1,%0                           \n"

-    "shr       %3                              \n"

-    "cmp       $0x0,%3                         \n"

-    "je        100f                            \n"

-    "cmp       $0x20,%3                        \n"

-    "je        75f                             \n"

-    "cmp       $0x40,%3                        \n"

-    "je        50f                             \n"

-    "cmp       $0x60,%3                        \n"

-    "je        25f                             \n"

-    "movd      %3,%%xmm0                       \n"

-    "neg       %3                              \n"

-    "add       $0x80,%3                        \n"

-    "movd      %3,%%xmm5                       \n"

-    "punpcklbw %%xmm0,%%xmm5                   \n"

-    "punpcklwd %%xmm5,%%xmm5                   \n"

-    "pshufd    $0x0,%%xmm5,%%xmm5              \n"

-    "pxor      %%xmm4,%%xmm4                   \n"

-    // General purpose row blend.

-    LABELALIGN

-  "1:                                          \n"

-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"

-    MEMOPREG(movdqu,0x00,1,4,1,xmm2)           //  movdqu    (%1,%4,1),%%xmm2

-    "movdqa    %%xmm0,%%xmm1                   \n"

-    "movdqa    %%xmm2,%%xmm3                   \n"

-    "punpcklbw %%xmm4,%%xmm2                   \n"

-    "punpckhbw %%xmm4,%%xmm3                   \n"

-    "punpcklbw %%xmm4,%%xmm0                   \n"

-    "punpckhbw %%xmm4,%%xmm1                   \n"

-    "psubw     %%xmm0,%%xmm2                   \n"

-    "psubw     %%xmm1,%%xmm3                   \n"

-    "paddw     %%xmm2,%%xmm2                   \n"

-    "paddw     %%xmm3,%%xmm3                   \n"

-    "pmulhw    %%xmm5,%%xmm2                   \n"

-    "pmulhw    %%xmm5,%%xmm3                   \n"

-    "paddw     %%xmm2,%%xmm0                   \n"

-    "paddw     %%xmm3,%%xmm1                   \n"

-    "packuswb  %%xmm1,%%xmm0                   \n"

-    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)

-    "lea       " MEMLEA(0x10,1) ",%1           \n"

-    "sub       $0x10,%2                        \n"

-    "jg        1b                              \n"

-    "jmp       99f                             \n"

-    // Blend 25 / 75.

-    LABELALIGN

-  "25:                                         \n"

-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"

-    MEMOPREG(movdqu,0x00,1,4,1,xmm1)           //  movdqu    (%1,%4,1),%%xmm1

-    "pavgb     %%xmm1,%%xmm0                   \n"

-    "pavgb     %%xmm1,%%xmm0                   \n"

-    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)

-    "lea       " MEMLEA(0x10,1) ",%1           \n"

-    "sub       $0x10,%2                        \n"

-    "jg        25b                             \n"

-    "jmp       99f                             \n"

-    // Blend 50 / 50.

-    LABELALIGN

-  "50:                                         \n"

-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"

-    MEMOPREG(movdqu,0x00,1,4,1,xmm1)           //  movdqu    (%1,%4,1),%%xmm1

-    "pavgb     %%xmm1,%%xmm0                   \n"

-    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)

-    "lea       " MEMLEA(0x10,1) ",%1           \n"

-    "sub       $0x10,%2                        \n"

-    "jg        50b                             \n"

-    "jmp       99f                             \n"

-    // Blend 75 / 25.

-    LABELALIGN

-  "75:                                         \n"

-    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"

-    MEMOPREG(movdqu,0x00,1,4,1,xmm0)           //  movdqu    (%1,%4,1),%%xmm0

-    "pavgb     %%xmm1,%%xmm0                   \n"

-    "pavgb     %%xmm1,%%xmm0                   \n"

-    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)

-    "lea       " MEMLEA(0x10,1) ",%1           \n"

-    "sub       $0x10,%2                        \n"

-    "jg        75b                             \n"

-    "jmp       99f                             \n"

-    // Blend 100 / 0 - Copy row unchanged.

-    LABELALIGN

-  "100:                                        \n"

-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"

-    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)

-    "lea       " MEMLEA(0x10,1) ",%1           \n"

-    "sub       $0x10,%2                        \n"

-    "jg        100b                            \n"

-  "99:                                         \n"

-  : "+r"(dst_ptr),    // %0

-    "+r"(src_ptr),    // %1

-    "+r"(dst_width),  // %2

-    "+r"(source_y_fraction)  // %3

-  : "r"((intptr_t)(src_stride))  // %4

-  : "memory", "cc", NACL_R14

-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

-  );

-}

-#endif  // HAS_INTERPOLATEROW_SSE2

 #ifdef HAS_ARGBSHUFFLEROW_SSSE3

 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.

 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,

-                          const uint8* shuffler, int pix) {

+                          const uint8* shuffler, int width) {

   asm volatile (

     "movdqu    " MEMACCESS(3) ",%%xmm5         \n"

     LABELALIGN

@@ -4984,7 +5043,7 @@

     "jg        1b                              \n"

   : "+r"(src_argb),  // %0

     "+r"(dst_argb),  // %1

-    "+r"(pix)        // %2

+    "+r"(width)        // %2

   : "r"(shuffler)    // %3

   : "memory", "cc"

     , "xmm0", "xmm1", "xmm5"

@@ -4995,7 +5054,7 @@

 #ifdef HAS_ARGBSHUFFLEROW_AVX2

 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.

 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,

-                         const uint8* shuffler, int pix) {

+                         const uint8* shuffler, int width) {

   asm volatile (

     "vbroadcastf128 " MEMACCESS(3) ",%%ymm5    \n"

     LABELALIGN

@@ -5013,7 +5072,7 @@

     "vzeroupper                                \n"

   : "+r"(src_argb),  // %0

     "+r"(dst_argb),  // %1

-    "+r"(pix)        // %2

+    "+r"(width)        // %2

   : "r"(shuffler)    // %3

   : "memory", "cc"

     , "xmm0", "xmm1", "xmm5"

@@ -5024,8 +5083,8 @@

 #ifdef HAS_ARGBSHUFFLEROW_SSE2

 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.

 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,

-                         const uint8* shuffler, int pix) {

-  uintptr_t pixel_temp = 0u;

+                         const uint8* shuffler, int width) {

+  uintptr_t pixel_temp;

   asm volatile (

     "pxor      %%xmm5,%%xmm5                   \n"

     "mov       " MEMACCESS(4) ",%k2            \n"

@@ -5130,11 +5189,11 @@

     "jg        3012b                           \n"

   "99:                                         \n"

-  : "+r"(src_argb),    // %0

-    "+r"(dst_argb),    // %1

-    "+d"(pixel_temp),  // %2

-    "+r"(pix)         // %3

-  : "r"(shuffler)      // %4

+  : "+r"(src_argb),     // %0

+    "+r"(dst_argb),     // %1

+    "=&d"(pixel_temp),  // %2

+    "+r"(width)         // %3

+  : "r"(shuffler)       // %4

   : "memory", "cc", NACL_R14

     "xmm0", "xmm1", "xmm5"

);

@@ -5311,7 +5370,7 @@

 // Tranform ARGB pixels with color table.

 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,

                            int width) {

-  uintptr_t pixel_temp = 0u;

+  uintptr_t pixel_temp;

   asm volatile (

     // 1 pixel loop.

     LABELALIGN

@@ -5331,10 +5390,10 @@

     "mov       %b1," MEMACCESS2(-0x1,0) "      \n"

     "dec       %2                              \n"

     "jg        1b                              \n"

-  : "+r"(dst_argb),   // %0

-    "+d"(pixel_temp), // %1

-    "+r"(width)       // %2

-  : "r"(table_argb)   // %3

+  : "+r"(dst_argb),     // %0

+    "=&d"(pixel_temp),  // %1

+    "+r"(width)         // %2

+  : "r"(table_argb)     // %3

   : "memory", "cc");

 #endif  // HAS_ARGBCOLORTABLEROW_X86

@@ -5342,7 +5401,7 @@

 #ifdef HAS_RGBCOLORTABLEROW_X86

 // Tranform RGB pixels with color table.

 void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {

-  uintptr_t pixel_temp = 0u;

+  uintptr_t pixel_temp;

   asm volatile (

     // 1 pixel loop.

     LABELALIGN

@@ -5359,10 +5418,10 @@

     "mov       %b1," MEMACCESS2(-0x2,0) "      \n"

     "dec       %2                              \n"

     "jg        1b                              \n"

-  : "+r"(dst_argb),   // %0

-    "+d"(pixel_temp), // %1

-    "+r"(width)       // %2

-  : "r"(table_argb)   // %3

+  : "+r"(dst_argb),     // %0

+    "=&d"(pixel_temp),  // %1

+    "+r"(width)         // %2

+  : "r"(table_argb)     // %3

   : "memory", "cc");

 #endif  // HAS_RGBCOLORTABLEROW_X86

@@ -5372,8 +5431,8 @@

 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,

                                  int width,

                                  const uint8* luma, uint32 lumacoeff) {

-  uintptr_t pixel_temp = 0u;

-  uintptr_t table_temp = 0u;

+  uintptr_t pixel_temp;

+  uintptr_t table_temp;

   asm volatile (

     "movd      %6,%%xmm3                       \n"

     "pshufd    $0x0,%%xmm3,%%xmm3              \n"

@@ -5455,13 +5514,13 @@

     "lea       " MEMLEA(0x10,3) ",%3           \n"

     "sub       $0x4,%4                         \n"

     "jg        1b                              \n"

-  : "+d"(pixel_temp),  // %0

-    "+a"(table_temp),  // %1

-    "+r"(src_argb),    // %2

-    "+r"(dst_argb),    // %3

-    "+rm"(width)       // %4

-  : "r"(luma),         // %5

-    "rm"(lumacoeff)    // %6

+  : "=&d"(pixel_temp),  // %0

+    "=&a"(table_temp),  // %1

+    "+r"(src_argb),     // %2

+    "+r"(dst_argb),     // %3

+    "+rm"(width)        // %4

+  : "r"(luma),          // %5

+    "rm"(lumacoeff)     // %6

   : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"

);

--- a/third_party/libyuv/source/row_mips.cc

+++ b/third_party/libyuv/source/row_mips.cc

@@ -375,13 +375,13 @@

 #endif  // HAS_COPYROW_MIPS

-// MIPS DSPR2 functions

+// DSPR2 functions

 #if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \

     (__mips_dsp_rev >= 2) && \

     (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)

-void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,

-                           int width) {

+void SplitUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,

+                      int width) {

   __asm__ __volatile__ (

     ".set push                                     \n"

     ".set noreorder                                \n"

@@ -389,7 +389,6 @@

     "blez            $t4, 2f                       \n"

     " andi           %[width], %[width], 0xf       \n"  // residual

-    ".p2align        2                             \n"

   "1:                                              \n"

     "addiu           $t4, $t4, -1                  \n"

     "lw              $t0, 0(%[src_uv])             \n"  // V1 | U1 | V0 | U0

@@ -447,7 +446,7 @@

);

-void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) {

+void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width) {

   __asm__ __volatile__ (

     ".set push                             \n"

     ".set noreorder                        \n"

@@ -457,7 +456,6 @@

     "blez      $t4, 2f                     \n"

     " addu     %[src], %[src], %[width]    \n"  // src += width

-    ".p2align  2                           \n"

    "1:                                     \n"

     "lw        $t0, -16(%[src])            \n"  // |3|2|1|0|

     "lw        $t1, -12(%[src])            \n"  // |7|6|5|4|

@@ -498,10 +496,10 @@

);

-void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,

-                            int width) {

-  int x = 0;

-  int y = 0;

+void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,

+                       int width) {

+  int x;

+  int y;

   __asm__ __volatile__ (

     ".set push                                    \n"

     ".set noreorder                               \n"

@@ -512,7 +510,6 @@

     "blez            %[x], 2f                     \n"

     " addu           %[src_uv], %[src_uv], $t4    \n"

-    ".p2align        2                            \n"

    "1:                                            \n"

     "lw              $t0, -32(%[src_uv])          \n"  // |3|2|1|0|

     "lw              $t1, -28(%[src_uv])          \n"  // |7|6|5|4|

@@ -582,7 +579,7 @@

         [dst_u] "+r" (dst_u),

         [dst_v] "+r" (dst_v),

         [x] "=&r" (x),

-        [y] "+r" (y)

+        [y] "=&r" (y)

       : [width] "r" (width)

       : "t0", "t1", "t2", "t3", "t4",

       "t5", "t7", "t8", "t9"

@@ -596,7 +593,7 @@

 // t8 = | 0 | G1 | 0 | g1 |

 // t2 = | 0 | R0 | 0 | r0 |

 // t1 = | 0 | R1 | 0 | r1 |

-#define I422ToTransientMipsRGB                                                 \

+#define YUVTORGB                                                               \

       "lw                $t0, 0(%[y_buf])       \n"                            \

       "lhu               $t1, 0(%[u_buf])       \n"                            \

       "lhu               $t2, 0(%[v_buf])       \n"                            \

@@ -655,11 +652,13 @@

       "addu.ph           $t2, $t2, $s5          \n"                            \

       "addu.ph           $t1, $t1, $s5          \n"

-void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf,

-                              const uint8* u_buf,

-                              const uint8* v_buf,

-                              uint8* rgb_buf,

-                              int width) {

+// TODO(fbarchard): accept yuv conversion constants.

+void I422ToARGBRow_DSPR2(const uint8* y_buf,

+                         const uint8* u_buf,

+                         const uint8* v_buf,

+                         uint8* rgb_buf,

+                         const struct YuvConstants* yuvconstants,

+                         int width) {

   __asm__ __volatile__ (

     ".set push                                \n"

     ".set noreorder                           \n"

@@ -673,9 +672,8 @@

     "lui               $s6, 0xff00            \n"

     "ori               $s6, 0xff00            \n"  // |ff|00|ff|00|ff|

-    ".p2align          2                      \n"

    "1:                                        \n"

-      I422ToTransientMipsRGB

+      YUVTORGB

 // Arranging into argb format

     "precr.qb.ph       $t4, $t8, $t4          \n"  // |G1|g1|B1|b1|

     "precr.qb.ph       $t5, $t9, $t5          \n"  // |G0|g0|B0|b0|

@@ -717,136 +715,10 @@

);

-void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf,

-                              const uint8* u_buf,

-                              const uint8* v_buf,

-                              uint8* rgb_buf,

-                              int width) {

-  __asm__ __volatile__ (

-    ".set push                                \n"

-    ".set noreorder                           \n"

-    "beqz              %[width], 2f           \n"

-    " repl.ph          $s0, 74                \n"  // |YG|YG| = |74|74|

-    "repl.ph           $s1, -25               \n"  // |UG|UG| = |-25|-25|

-    "repl.ph           $s2, -52               \n"  // |VG|VG| = |-52|-52|

-    "repl.ph           $s3, 102               \n"  // |VR|VR| = |102|102|

-    "repl.ph           $s4, 16                \n"  // |0|16|0|16|

-    "repl.ph           $s5, 128               \n"  // |128|128|

-    "lui               $s6, 0xff00            \n"

-    "ori               $s6, 0xff00            \n"  // |ff|00|ff|00|

-    ".p2align          2                       \n"

-   "1:                                         \n"

-      I422ToTransientMipsRGB

-// Arranging into abgr format

-    "precr.qb.ph      $t0, $t8, $t1           \n"  // |G1|g1|R1|r1|

-    "precr.qb.ph      $t3, $t9, $t2           \n"  // |G0|g0|R0|r0|

-    "precrq.qb.ph     $t8, $t0, $t3           \n"  // |G1|R1|G0|R0|

-    "precr.qb.ph      $t9, $t0, $t3           \n"  // |g1|r1|g0|r0|

-    "precr.qb.ph       $t2, $t4, $t5          \n"  // |B1|b1|B0|b0|

-    "addiu             %[width], -4           \n"

-    "addiu             %[y_buf], 4            \n"

-    "preceu.ph.qbla    $t1, $t2               \n"  // |0 |B1|0 |B0|

-    "preceu.ph.qbra    $t2, $t2               \n"  // |0 |b1|0 |b0|

-    "or                $t1, $t1, $s6          \n"  // |ff|B1|ff|B0|

-    "or                $t2, $t2, $s6          \n"  // |ff|b1|ff|b0|

-    "precrq.ph.w       $t0, $t2, $t9          \n"  // |ff|b1|g1|r1|

-    "precrq.ph.w       $t3, $t1, $t8          \n"  // |ff|B1|G1|R1|

-    "sll               $t9, $t9, 16           \n"

-    "sll               $t8, $t8, 16           \n"

-    "packrl.ph         $t2, $t2, $t9          \n"  // |ff|b0|g0|r0|

-    "packrl.ph         $t1, $t1, $t8          \n"  // |ff|B0|G0|R0|

-// Store results.

-    "sw                $t2, 0(%[rgb_buf])     \n"

-    "sw                $t0, 4(%[rgb_buf])     \n"

-    "sw                $t1, 8(%[rgb_buf])     \n"

-    "sw                $t3, 12(%[rgb_buf])    \n"

-    "bnez              %[width], 1b           \n"

-    " addiu            %[rgb_buf], 16         \n"

-   "2:                                        \n"

-    ".set pop                                 \n"

-      :[y_buf] "+r" (y_buf),

-       [u_buf] "+r" (u_buf),

-       [v_buf] "+r" (v_buf),

-       [width] "+r" (width),

-       [rgb_buf] "+r" (rgb_buf)

-      :

-      : "t0", "t1",  "t2", "t3",  "t4", "t5",

-      "t6", "t7", "t8", "t9",

-      "s0", "s1", "s2", "s3",

-      "s4", "s5", "s6"

-  );

-}

-void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf,

-                              const uint8* u_buf,

-                              const uint8* v_buf,

-                              uint8* rgb_buf,

-                              int width) {

-  __asm__ __volatile__ (

-    ".set push                                \n"

-    ".set noreorder                           \n"

-    "beqz              %[width], 2f           \n"

-    " repl.ph          $s0, 74                \n"  // |YG|YG| = |74 |74 |

-    "repl.ph           $s1, -25               \n"  // |UG|UG| = |-25|-25|

-    "repl.ph           $s2, -52               \n"  // |VG|VG| = |-52|-52|

-    "repl.ph           $s3, 102               \n"  // |VR|VR| = |102|102|

-    "repl.ph           $s4, 16                \n"  // |0|16|0|16|

-    "repl.ph           $s5, 128               \n"  // |128|128|

-    "lui               $s6, 0xff              \n"

-    "ori               $s6, 0xff              \n"  // |00|ff|00|ff|

-    ".p2align          2                      \n"

-   "1:                                        \n"

-      I422ToTransientMipsRGB

-      // Arranging into bgra format

-    "precr.qb.ph       $t4, $t4, $t8          \n"  // |B1|b1|G1|g1|

-    "precr.qb.ph       $t5, $t5, $t9          \n"  // |B0|b0|G0|g0|

-    "precrq.qb.ph      $t8, $t4, $t5          \n"  // |B1|G1|B0|G0|

-    "precr.qb.ph       $t9, $t4, $t5          \n"  // |b1|g1|b0|g0|

-    "precr.qb.ph       $t2, $t1, $t2          \n"  // |R1|r1|R0|r0|

-    "addiu             %[width], -4           \n"

-    "addiu             %[y_buf], 4            \n"

-    "preceu.ph.qbla    $t1, $t2               \n"  // |0 |R1|0 |R0|

-    "preceu.ph.qbra    $t2, $t2               \n"  // |0 |r1|0 |r0|

-    "sll               $t1, $t1, 8            \n"  // |R1|0 |R0|0 |

-    "sll               $t2, $t2, 8            \n"  // |r1|0 |r0|0 |

-    "or                $t1, $t1, $s6          \n"  // |R1|ff|R0|ff|

-    "or                $t2, $t2, $s6          \n"  // |r1|ff|r0|ff|

-    "precrq.ph.w       $t0, $t9, $t2          \n"  // |b1|g1|r1|ff|

-    "precrq.ph.w       $t3, $t8, $t1          \n"  // |B1|G1|R1|ff|

-    "sll               $t1, $t1, 16           \n"

-    "sll               $t2, $t2, 16           \n"

-    "packrl.ph         $t2, $t9, $t2          \n"  // |b0|g0|r0|ff|

-    "packrl.ph         $t1, $t8, $t1          \n"  // |B0|G0|R0|ff|

-// Store results.

-    "sw                $t2, 0(%[rgb_buf])     \n"

-    "sw                $t0, 4(%[rgb_buf])     \n"

-    "sw                $t1, 8(%[rgb_buf])     \n"

-    "sw                $t3, 12(%[rgb_buf])    \n"

-    "bnez              %[width], 1b           \n"

-    " addiu            %[rgb_buf], 16         \n"

-   "2:                                        \n"

-    ".set pop                                 \n"

-      :[y_buf] "+r" (y_buf),

-       [u_buf] "+r" (u_buf),

-       [v_buf] "+r" (v_buf),

-       [width] "+r" (width),

-       [rgb_buf] "+r" (rgb_buf)

-      :

-      : "t0", "t1",  "t2", "t3",  "t4", "t5",

-      "t6", "t7", "t8", "t9",

-      "s0", "s1", "s2", "s3",

-      "s4", "s5", "s6"

-  );

-}

 // Bilinear filter 8x2 -> 8x1

-void InterpolateRow_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,

-                               ptrdiff_t src_stride, int dst_width,

-                               int source_y_fraction) {

+void InterpolateRow_DSPR2(uint8* dst_ptr, const uint8* src_ptr,

+                          ptrdiff_t src_stride, int dst_width,

+                          int source_y_fraction) {

     int y0_fraction = 256 - source_y_fraction;

     const uint8* src_ptr1 = src_ptr + src_stride;

@@ -857,7 +729,6 @@

      "replv.ph          $t0, %[y0_fraction]               \n"

      "replv.ph          $t1, %[source_y_fraction]         \n"

-    ".p2align           2                                 \n"

    "1:                                                    \n"

      "lw                $t2, 0(%[src_ptr])                \n"

      "lw                $t3, 0(%[src_ptr1])               \n"

--- a/third_party/libyuv/source/row_neon.cc

+++ b/third_party/libyuv/source/row_neon.cc

@@ -93,7 +93,7 @@

     "vuzp.u8    d2, d3                         \n"                             \

     "vtrn.u32   d2, d3                         \n"

-#define YUV422TORGB_SETUP_REG                                                  \

+#define YUVTORGB_SETUP                                                         \

     MEMACCESS([kUVToRB])                                                       \

     "vld1.8     {d24}, [%[kUVToRB]]            \n"                             \

     MEMACCESS([kUVToG])                                                        \

@@ -107,7 +107,7 @@

     MEMACCESS([kYToRgb])                                                       \

     "vld1.32    {d30[], d31[]}, [%[kYToRgb]]     \n"

-#define YUV422TORGB                                                            \

+#define YUVTORGB                                                               \

     "vmull.u8   q8, d2, d24                    \n" /* u/v B/R component      */\

     "vmull.u8   q9, d2, d25                    \n" /* u/v G component        */\

     "vmovl.u8   q0, d0                         \n" /* Y                      */\

@@ -134,52 +134,19 @@

     "vqshrun.s16 d22, q9, #6                   \n" /* R */                     \

     "vqshrun.s16 d21, q0, #6                   \n" /* G */

-// YUV to RGB conversion constants.

-// Y contribution to R,G,B.  Scale and bias.

-#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */

-#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */

-// U and V contributions to R,G,B.

-#define UB -128 /* -min(128, round(2.018 * 64)) */

-#define UG 25 /* -round(-0.391 * 64) */

-#define VG 52 /* -round(-0.813 * 64) */

-#define VR -102 /* -round(1.596 * 64) */

-// Bias values to subtract 16 from Y and 128 from U and V.

-#define BB (UB * 128            - YGB)

-#define BG (UG * 128 + VG * 128 - YGB)

-#define BR            (VR * 128 - YGB)

-static uvec8 kUVToRB  = { 128, 128, 128, 128, 102, 102, 102, 102,

-                          0, 0, 0, 0, 0, 0, 0, 0 };

-static uvec8 kUVToG = { 25, 25, 25, 25, 52, 52, 52, 52,

-                        0, 0, 0, 0, 0, 0, 0, 0 };

-static vec16 kUVBiasBGR = { BB, BG, BR, 0, 0, 0, 0, 0 };

-static vec32 kYToRgb = { 0x0101 * YG, 0, 0, 0 };

-#undef YG

-#undef YGB

-#undef UB

-#undef UG

-#undef VG

-#undef VR

-#undef BB

-#undef BG

-#undef BR

 void I444ToARGBRow_NEON(const uint8* src_y,

                         const uint8* src_u,

                         const uint8* src_v,

                         uint8* dst_argb,

+                        const struct YuvConstants* yuvconstants,

                         int width) {

   asm volatile (

-    YUV422TORGB_SETUP_REG

-    ".p2align   2                              \n"

+    YUVTORGB_SETUP

+    "vmov.u8    d23, #255                      \n"

   "1:                                          \n"

     READYUV444

-    YUV422TORGB

+    YUVTORGB

     "subs       %4, %4, #8                     \n"

-    "vmov.u8    d23, #255                      \n"

     MEMACCESS(3)

     "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"

     "bgt        1b                             \n"

@@ -188,10 +155,10 @@

       "+r"(src_v),     // %2

       "+r"(dst_argb),  // %3

       "+r"(width)      // %4

-    : [kUVToRB]"r"(&kUVToRB),   // %5

-      [kUVToG]"r"(&kUVToG),     // %6

-      [kUVBiasBGR]"r"(&kUVBiasBGR),

-      [kYToRgb]"r"(&kYToRgb)

+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),

+      [kUVToG]"r"(&yuvconstants->kUVToG),

+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),

+      [kYToRgb]"r"(&yuvconstants->kYToRgb)

     : "cc", "memory", "q0", "q1", "q2", "q3", "q4",

       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

);

@@ -201,15 +168,15 @@

                         const uint8* src_u,

                         const uint8* src_v,

                         uint8* dst_argb,

+                        const struct YuvConstants* yuvconstants,

                         int width) {

   asm volatile (

-    YUV422TORGB_SETUP_REG

-    ".p2align   2                              \n"

+    YUVTORGB_SETUP

+    "vmov.u8    d23, #255                      \n"

   "1:                                          \n"

     READYUV422

-    YUV422TORGB

+    YUVTORGB

     "subs       %4, %4, #8                     \n"

-    "vmov.u8    d23, #255                      \n"

     MEMACCESS(3)

     "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"

     "bgt        1b                             \n"

@@ -218,90 +185,61 @@

       "+r"(src_v),     // %2

       "+r"(dst_argb),  // %3

       "+r"(width)      // %4

-    : [kUVToRB]"r"(&kUVToRB),   // %5

-      [kUVToG]"r"(&kUVToG),     // %6

-      [kUVBiasBGR]"r"(&kUVBiasBGR),

-      [kYToRgb]"r"(&kYToRgb)

+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),

+      [kUVToG]"r"(&yuvconstants->kUVToG),

+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),

+      [kYToRgb]"r"(&yuvconstants->kYToRgb)

     : "cc", "memory", "q0", "q1", "q2", "q3", "q4",

       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

);

-void I411ToARGBRow_NEON(const uint8* src_y,

-                        const uint8* src_u,

-                        const uint8* src_v,

-                        uint8* dst_argb,

-                        int width) {

+void I422AlphaToARGBRow_NEON(const uint8* src_y,

+                             const uint8* src_u,

+                             const uint8* src_v,

+                             const uint8* src_a,

+                             uint8* dst_argb,

+                             const struct YuvConstants* yuvconstants,

+                             int width) {

   asm volatile (

-    YUV422TORGB_SETUP_REG

-    ".p2align   2                              \n"

+    YUVTORGB_SETUP

   "1:                                          \n"

-    READYUV411

-    YUV422TORGB

-    "subs       %4, %4, #8                     \n"

-    "vmov.u8    d23, #255                      \n"

-    MEMACCESS(3)

-    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"

-    "bgt        1b                             \n"

-    : "+r"(src_y),     // %0

-      "+r"(src_u),     // %1

-      "+r"(src_v),     // %2

-      "+r"(dst_argb),  // %3

-      "+r"(width)      // %4

-    : [kUVToRB]"r"(&kUVToRB),   // %5

-      [kUVToG]"r"(&kUVToG),     // %6

-      [kUVBiasBGR]"r"(&kUVBiasBGR),

-      [kYToRgb]"r"(&kYToRgb)

-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",

-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

-  );

-}

-void I422ToBGRARow_NEON(const uint8* src_y,

-                        const uint8* src_u,

-                        const uint8* src_v,

-                        uint8* dst_bgra,

-                        int width) {

-  asm volatile (

-    YUV422TORGB_SETUP_REG

-    ".p2align   2                              \n"

-  "1:                                          \n"

     READYUV422

-    YUV422TORGB

-    "subs       %4, %4, #8                     \n"

-    "vswp.u8    d20, d22                       \n"

-    "vmov.u8    d19, #255                      \n"

+    YUVTORGB

+    "subs       %5, %5, #8                     \n"

     MEMACCESS(3)

-    "vst4.8     {d19, d20, d21, d22}, [%3]!    \n"

+    "vld1.8     {d23}, [%3]!                   \n"

+    MEMACCESS(4)

+    "vst4.8     {d20, d21, d22, d23}, [%4]!    \n"

     "bgt        1b                             \n"

     : "+r"(src_y),     // %0

       "+r"(src_u),     // %1

       "+r"(src_v),     // %2

-      "+r"(dst_bgra),  // %3

-      "+r"(width)      // %4

-    : [kUVToRB]"r"(&kUVToRB),   // %5

-      [kUVToG]"r"(&kUVToG),     // %6

-      [kUVBiasBGR]"r"(&kUVBiasBGR),

-      [kYToRgb]"r"(&kYToRgb)

+      "+r"(src_a),     // %3

+      "+r"(dst_argb),  // %4

+      "+r"(width)      // %5

+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),

+      [kUVToG]"r"(&yuvconstants->kUVToG),

+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),

+      [kYToRgb]"r"(&yuvconstants->kYToRgb)

     : "cc", "memory", "q0", "q1", "q2", "q3", "q4",

       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

);

-void I422ToABGRRow_NEON(const uint8* src_y,

+void I411ToARGBRow_NEON(const uint8* src_y,

                         const uint8* src_u,

                         const uint8* src_v,

-                        uint8* dst_abgr,

+                        uint8* dst_argb,

+                        const struct YuvConstants* yuvconstants,

                         int width) {

   asm volatile (

-    YUV422TORGB_SETUP_REG

-    ".p2align   2                              \n"

+    YUVTORGB_SETUP

+    "vmov.u8    d23, #255                      \n"

   "1:                                          \n"

-    READYUV422

-    YUV422TORGB

+    READYUV411

+    YUVTORGB

     "subs       %4, %4, #8                     \n"

-    "vswp.u8    d20, d22                       \n"

-    "vmov.u8    d23, #255                      \n"

     MEMACCESS(3)

     "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"

     "bgt        1b                             \n"

@@ -308,12 +246,12 @@

     : "+r"(src_y),     // %0

       "+r"(src_u),     // %1

       "+r"(src_v),     // %2

-      "+r"(dst_abgr),  // %3

+      "+r"(dst_argb),  // %3

       "+r"(width)      // %4

-    : [kUVToRB]"r"(&kUVToRB),   // %5

-      [kUVToG]"r"(&kUVToG),     // %6

-      [kUVBiasBGR]"r"(&kUVBiasBGR),

-      [kYToRgb]"r"(&kYToRgb)

+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),

+      [kUVToG]"r"(&yuvconstants->kUVToG),

+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),

+      [kYToRgb]"r"(&yuvconstants->kYToRgb)

     : "cc", "memory", "q0", "q1", "q2", "q3", "q4",

       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

);

@@ -323,15 +261,15 @@

                         const uint8* src_u,

                         const uint8* src_v,

                         uint8* dst_rgba,

+                        const struct YuvConstants* yuvconstants,

                         int width) {

   asm volatile (

-    YUV422TORGB_SETUP_REG

-    ".p2align   2                              \n"

+    YUVTORGB_SETUP

   "1:                                          \n"

     READYUV422

-    YUV422TORGB

+    YUVTORGB

     "subs       %4, %4, #8                     \n"

-    "vmov.u8    d19, #255                      \n"

+    "vmov.u8    d19, #255                      \n"  // d19 modified by YUVTORGB

     MEMACCESS(3)

     "vst4.8     {d19, d20, d21, d22}, [%3]!    \n"

     "bgt        1b                             \n"

@@ -340,10 +278,10 @@

       "+r"(src_v),     // %2

       "+r"(dst_rgba),  // %3

       "+r"(width)      // %4

-    : [kUVToRB]"r"(&kUVToRB),   // %5

-      [kUVToG]"r"(&kUVToG),     // %6

-      [kUVBiasBGR]"r"(&kUVBiasBGR),

-      [kYToRgb]"r"(&kYToRgb)

+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),

+      [kUVToG]"r"(&yuvconstants->kUVToG),

+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),

+      [kYToRgb]"r"(&yuvconstants->kYToRgb)

     : "cc", "memory", "q0", "q1", "q2", "q3", "q4",

       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

);

@@ -353,13 +291,13 @@

                          const uint8* src_u,

                          const uint8* src_v,

                          uint8* dst_rgb24,

+                         const struct YuvConstants* yuvconstants,

                          int width) {

   asm volatile (

-    YUV422TORGB_SETUP_REG

-    ".p2align   2                              \n"

+    YUVTORGB_SETUP

   "1:                                          \n"

     READYUV422

-    YUV422TORGB

+    YUVTORGB

     "subs       %4, %4, #8                     \n"

     MEMACCESS(3)

     "vst3.8     {d20, d21, d22}, [%3]!         \n"

@@ -369,68 +307,33 @@

       "+r"(src_v),      // %2

       "+r"(dst_rgb24),  // %3

       "+r"(width)       // %4

-    : [kUVToRB]"r"(&kUVToRB),   // %5

-      [kUVToG]"r"(&kUVToG),     // %6

-      [kUVBiasBGR]"r"(&kUVBiasBGR),

-      [kYToRgb]"r"(&kYToRgb)

+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),

+      [kUVToG]"r"(&yuvconstants->kUVToG),

+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),

+      [kYToRgb]"r"(&yuvconstants->kYToRgb)

     : "cc", "memory", "q0", "q1", "q2", "q3", "q4",

       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

);

-void I422ToRAWRow_NEON(const uint8* src_y,

-                       const uint8* src_u,

-                       const uint8* src_v,

-                       uint8* dst_raw,

-                       int width) {

-  asm volatile (

-    YUV422TORGB_SETUP_REG

-    ".p2align   2                              \n"

-  "1:                                          \n"

-    READYUV422

-    YUV422TORGB

-    "subs       %4, %4, #8                     \n"

-    "vswp.u8    d20, d22                       \n"

-    MEMACCESS(3)

-    "vst3.8     {d20, d21, d22}, [%3]!         \n"

-    "bgt        1b                             \n"

-    : "+r"(src_y),    // %0

-      "+r"(src_u),    // %1

-      "+r"(src_v),    // %2

-      "+r"(dst_raw),  // %3

-      "+r"(width)     // %4

-    : [kUVToRB]"r"(&kUVToRB),   // %5

-      [kUVToG]"r"(&kUVToG),     // %6

-      [kUVBiasBGR]"r"(&kUVBiasBGR),

-      [kYToRgb]"r"(&kYToRgb)

-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",

-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

-  );

-}

 #define ARGBTORGB565                                                           \

-    "vshr.u8    d20, d20, #3                   \n"  /* B                    */ \

-    "vshr.u8    d21, d21, #2                   \n"  /* G                    */ \

-    "vshr.u8    d22, d22, #3                   \n"  /* R                    */ \

-    "vmovl.u8   q8, d20                        \n"  /* B                    */ \

-    "vmovl.u8   q9, d21                        \n"  /* G                    */ \

-    "vmovl.u8   q10, d22                       \n"  /* R                    */ \

-    "vshl.u16   q9, q9, #5                     \n"  /* G                    */ \

-    "vshl.u16   q10, q10, #11                  \n"  /* R                    */ \

-    "vorr       q0, q8, q9                     \n"  /* BG                   */ \

-    "vorr       q0, q0, q10                    \n"  /* BGR                  */

+    "vshll.u8    q0, d22, #8                   \n"  /* R                    */ \

+    "vshll.u8    q8, d21, #8                   \n"  /* G                    */ \

+    "vshll.u8    q9, d20, #8                   \n"  /* B                    */ \

+    "vsri.16     q0, q8, #5                    \n"  /* RG                   */ \

+    "vsri.16     q0, q9, #11                   \n"  /* RGB                  */

 void I422ToRGB565Row_NEON(const uint8* src_y,

                           const uint8* src_u,

                           const uint8* src_v,

                           uint8* dst_rgb565,

+                          const struct YuvConstants* yuvconstants,

                           int width) {

   asm volatile (

-    YUV422TORGB_SETUP_REG

-    ".p2align   2                              \n"

+    YUVTORGB_SETUP

   "1:                                          \n"

     READYUV422

-    YUV422TORGB

+    YUVTORGB

     "subs       %4, %4, #8                     \n"

     ARGBTORGB565

     MEMACCESS(3)

@@ -441,10 +344,10 @@

       "+r"(src_v),    // %2

       "+r"(dst_rgb565),  // %3

       "+r"(width)     // %4

-    : [kUVToRB]"r"(&kUVToRB),   // %5

-      [kUVToG]"r"(&kUVToG),     // %6

-      [kUVBiasBGR]"r"(&kUVBiasBGR),

-      [kYToRgb]"r"(&kYToRgb)

+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),

+      [kUVToG]"r"(&yuvconstants->kUVToG),

+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),

+      [kYToRgb]"r"(&yuvconstants->kYToRgb)

     : "cc", "memory", "q0", "q1", "q2", "q3", "q4",

       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

);

@@ -451,31 +354,25 @@

 #define ARGBTOARGB1555                                                         \

-    "vshr.u8    q10, q10, #3                   \n"  /* B                    */ \

-    "vshr.u8    d22, d22, #3                   \n"  /* R                    */ \

-    "vshr.u8    d23, d23, #7                   \n"  /* A                    */ \

-    "vmovl.u8   q8, d20                        \n"  /* B                    */ \

-    "vmovl.u8   q9, d21                        \n"  /* G                    */ \

-    "vmovl.u8   q10, d22                       \n"  /* R                    */ \

-    "vmovl.u8   q11, d23                       \n"  /* A                    */ \

-    "vshl.u16   q9, q9, #5                     \n"  /* G                    */ \

-    "vshl.u16   q10, q10, #10                  \n"  /* R                    */ \

-    "vshl.u16   q11, q11, #15                  \n"  /* A                    */ \

-    "vorr       q0, q8, q9                     \n"  /* BG                   */ \

-    "vorr       q1, q10, q11                   \n"  /* RA                   */ \

-    "vorr       q0, q0, q1                     \n"  /* BGRA                 */

+    "vshll.u8    q0, d23, #8                   \n"  /* A                    */ \

+    "vshll.u8    q8, d22, #8                   \n"  /* R                    */ \

+    "vshll.u8    q9, d21, #8                   \n"  /* G                    */ \

+    "vshll.u8    q10, d20, #8                  \n"  /* B                    */ \

+    "vsri.16     q0, q8, #1                    \n"  /* AR                   */ \

+    "vsri.16     q0, q9, #6                    \n"  /* ARG                  */ \

+    "vsri.16     q0, q10, #11                  \n"  /* ARGB                 */

 void I422ToARGB1555Row_NEON(const uint8* src_y,

                             const uint8* src_u,

                             const uint8* src_v,

                             uint8* dst_argb1555,

+                            const struct YuvConstants* yuvconstants,

                             int width) {

   asm volatile (

-    YUV422TORGB_SETUP_REG

-    ".p2align   2                              \n"

+    YUVTORGB_SETUP

   "1:                                          \n"

     READYUV422

-    YUV422TORGB

+    YUVTORGB

     "subs       %4, %4, #8                     \n"

     "vmov.u8    d23, #255                      \n"

     ARGBTOARGB1555

@@ -487,10 +384,10 @@

       "+r"(src_v),    // %2

       "+r"(dst_argb1555),  // %3

       "+r"(width)     // %4

-    : [kUVToRB]"r"(&kUVToRB),   // %5

-      [kUVToG]"r"(&kUVToG),     // %6

-      [kUVBiasBGR]"r"(&kUVBiasBGR),

-      [kYToRgb]"r"(&kYToRgb)

+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),

+      [kUVToG]"r"(&yuvconstants->kUVToG),

+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),

+      [kYToRgb]"r"(&yuvconstants->kYToRgb)

     : "cc", "memory", "q0", "q1", "q2", "q3", "q4",

       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

);

@@ -509,14 +406,14 @@

                             const uint8* src_u,

                             const uint8* src_v,

                             uint8* dst_argb4444,

+                            const struct YuvConstants* yuvconstants,

                             int width) {

   asm volatile (

-    YUV422TORGB_SETUP_REG

+    YUVTORGB_SETUP

     "vmov.u8    d4, #0x0f                      \n"  // bits to clear with vbic.

-    ".p2align   2                              \n"

   "1:                                          \n"

     READYUV422

-    YUV422TORGB

+    YUVTORGB

     "subs       %4, %4, #8                     \n"

     "vmov.u8    d23, #255                      \n"

     ARGBTOARGB4444

@@ -528,10 +425,10 @@

       "+r"(src_v),    // %2

       "+r"(dst_argb4444),  // %3

       "+r"(width)     // %4

-    : [kUVToRB]"r"(&kUVToRB),   // %5

-      [kUVToG]"r"(&kUVToG),     // %6

-      [kUVBiasBGR]"r"(&kUVBiasBGR),

-      [kYToRgb]"r"(&kYToRgb)

+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),

+      [kUVToG]"r"(&yuvconstants->kUVToG),

+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),

+      [kYToRgb]"r"(&yuvconstants->kYToRgb)

     : "cc", "memory", "q0", "q1", "q2", "q3", "q4",

       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

);

@@ -541,13 +438,12 @@

                         uint8* dst_argb,

                         int width) {

   asm volatile (

-    YUV422TORGB_SETUP_REG

-    ".p2align   2                              \n"

+    YUVTORGB_SETUP

+    "vmov.u8    d23, #255                      \n"

   "1:                                          \n"

     READYUV400

-    YUV422TORGB

+    YUVTORGB

     "subs       %2, %2, #8                     \n"

-    "vmov.u8    d23, #255                      \n"

     MEMACCESS(1)

     "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"

     "bgt        1b                             \n"

@@ -554,10 +450,10 @@

     : "+r"(src_y),     // %0

       "+r"(dst_argb),  // %1

       "+r"(width)      // %2

-    : [kUVToRB]"r"(&kUVToRB),   // %3

-      [kUVToG]"r"(&kUVToG),     // %4

-      [kUVBiasBGR]"r"(&kUVBiasBGR),

-      [kYToRgb]"r"(&kYToRgb)

+    : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB),

+      [kUVToG]"r"(&kYuvI601Constants.kUVToG),

+      [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR),

+      [kYToRgb]"r"(&kYuvI601Constants.kYToRgb)

     : "cc", "memory", "q0", "q1", "q2", "q3", "q4",

       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

);

@@ -568,7 +464,6 @@

                         int width) {

   asm volatile (

     "vmov.u8    d23, #255                      \n"

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld1.8     {d20}, [%0]!                   \n"

@@ -589,15 +484,15 @@

 void NV12ToARGBRow_NEON(const uint8* src_y,

                         const uint8* src_uv,

                         uint8* dst_argb,

+                        const struct YuvConstants* yuvconstants,

                         int width) {

   asm volatile (

-    YUV422TORGB_SETUP_REG

-    ".p2align   2                              \n"

+    YUVTORGB_SETUP

+    "vmov.u8    d23, #255                      \n"

   "1:                                          \n"

     READNV12

-    YUV422TORGB

+    YUVTORGB

     "subs       %3, %3, #8                     \n"

-    "vmov.u8    d23, #255                      \n"

     MEMACCESS(2)

     "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"

     "bgt        1b                             \n"

@@ -605,10 +500,10 @@

       "+r"(src_uv),    // %1

       "+r"(dst_argb),  // %2

       "+r"(width)      // %3

-    : [kUVToRB]"r"(&kUVToRB),   // %4

-      [kUVToG]"r"(&kUVToG),     // %5

-      [kUVBiasBGR]"r"(&kUVBiasBGR),

-      [kYToRgb]"r"(&kYToRgb)

+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),

+      [kUVToG]"r"(&yuvconstants->kUVToG),

+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),

+      [kYToRgb]"r"(&yuvconstants->kYToRgb)

     : "cc", "memory", "q0", "q1", "q2", "q3", "q4",

       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

);

@@ -615,28 +510,28 @@

 void NV21ToARGBRow_NEON(const uint8* src_y,

-                        const uint8* src_uv,

+                        const uint8* src_vu,

                         uint8* dst_argb,

+                        const struct YuvConstants* yuvconstants,

                         int width) {

   asm volatile (

-    YUV422TORGB_SETUP_REG

-    ".p2align   2                              \n"

+    YUVTORGB_SETUP

+    "vmov.u8    d23, #255                      \n"

   "1:                                          \n"

     READNV21

-    YUV422TORGB

+    YUVTORGB

     "subs       %3, %3, #8                     \n"

-    "vmov.u8    d23, #255                      \n"

     MEMACCESS(2)

     "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"

     "bgt        1b                             \n"

     : "+r"(src_y),     // %0

-      "+r"(src_uv),    // %1

+      "+r"(src_vu),    // %1

       "+r"(dst_argb),  // %2

       "+r"(width)      // %3

-    : [kUVToRB]"r"(&kUVToRB),   // %4

-      [kUVToG]"r"(&kUVToG),     // %5

-      [kUVBiasBGR]"r"(&kUVBiasBGR),

-      [kYToRgb]"r"(&kYToRgb)

+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),

+      [kUVToG]"r"(&yuvconstants->kUVToG),

+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),

+      [kYToRgb]"r"(&yuvconstants->kYToRgb)

     : "cc", "memory", "q0", "q1", "q2", "q3", "q4",

       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

);

@@ -645,13 +540,13 @@

 void NV12ToRGB565Row_NEON(const uint8* src_y,

                           const uint8* src_uv,

                           uint8* dst_rgb565,

+                          const struct YuvConstants* yuvconstants,

                           int width) {

   asm volatile (

-    YUV422TORGB_SETUP_REG

-    ".p2align   2                              \n"

+    YUVTORGB_SETUP

   "1:                                          \n"

     READNV12

-    YUV422TORGB

+    YUVTORGB

     "subs       %3, %3, #8                     \n"

     ARGBTORGB565

     MEMACCESS(2)

@@ -661,54 +556,26 @@

       "+r"(src_uv),    // %1

       "+r"(dst_rgb565),  // %2

       "+r"(width)      // %3

-    : [kUVToRB]"r"(&kUVToRB),   // %4

-      [kUVToG]"r"(&kUVToG),     // %5

-      [kUVBiasBGR]"r"(&kUVBiasBGR),

-      [kYToRgb]"r"(&kYToRgb)

+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),

+      [kUVToG]"r"(&yuvconstants->kUVToG),

+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),

+      [kYToRgb]"r"(&yuvconstants->kYToRgb)

     : "cc", "memory", "q0", "q1", "q2", "q3", "q4",

       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

);

-void NV21ToRGB565Row_NEON(const uint8* src_y,

-                          const uint8* src_uv,

-                          uint8* dst_rgb565,

-                          int width) {

-  asm volatile (

-    YUV422TORGB_SETUP_REG

-    ".p2align   2                              \n"

-  "1:                                          \n"

-    READNV21

-    YUV422TORGB

-    "subs       %3, %3, #8                     \n"

-    ARGBTORGB565

-    MEMACCESS(2)

-    "vst1.8     {q0}, [%2]!                    \n"  // store 8 pixels RGB565.

-    "bgt        1b                             \n"

-    : "+r"(src_y),     // %0

-      "+r"(src_uv),    // %1

-      "+r"(dst_rgb565),  // %2

-      "+r"(width)      // %3

-    : [kUVToRB]"r"(&kUVToRB),   // %4

-      [kUVToG]"r"(&kUVToG),     // %5

-      [kUVBiasBGR]"r"(&kUVBiasBGR),

-      [kYToRgb]"r"(&kYToRgb)

-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",

-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

-  );

-}

 void YUY2ToARGBRow_NEON(const uint8* src_yuy2,

                         uint8* dst_argb,

+                        const struct YuvConstants* yuvconstants,

                         int width) {

   asm volatile (

-    YUV422TORGB_SETUP_REG

-    ".p2align   2                              \n"

+    YUVTORGB_SETUP

+    "vmov.u8    d23, #255                      \n"

   "1:                                          \n"

     READYUY2

-    YUV422TORGB

+    YUVTORGB

     "subs       %2, %2, #8                     \n"

-    "vmov.u8    d23, #255                      \n"

     MEMACCESS(1)

     "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"

     "bgt        1b                             \n"

@@ -715,10 +582,10 @@

     : "+r"(src_yuy2),  // %0

       "+r"(dst_argb),  // %1

       "+r"(width)      // %2

-    : [kUVToRB]"r"(&kUVToRB),   // %3

-      [kUVToG]"r"(&kUVToG),     // %4

-      [kUVBiasBGR]"r"(&kUVBiasBGR),

-      [kYToRgb]"r"(&kYToRgb)

+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),

+      [kUVToG]"r"(&yuvconstants->kUVToG),

+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),

+      [kYToRgb]"r"(&yuvconstants->kYToRgb)

     : "cc", "memory", "q0", "q1", "q2", "q3", "q4",

       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

);

@@ -726,15 +593,15 @@

 void UYVYToARGBRow_NEON(const uint8* src_uyvy,

                         uint8* dst_argb,

+                        const struct YuvConstants* yuvconstants,

                         int width) {

   asm volatile (

-    YUV422TORGB_SETUP_REG

-    ".p2align   2                              \n"

+    YUVTORGB_SETUP

+    "vmov.u8    d23, #255                      \n"

   "1:                                          \n"

     READUYVY

-    YUV422TORGB

+    YUVTORGB

     "subs       %2, %2, #8                     \n"

-    "vmov.u8    d23, #255                      \n"

     MEMACCESS(1)

     "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"

     "bgt        1b                             \n"

@@ -741,10 +608,10 @@

     : "+r"(src_uyvy),  // %0

       "+r"(dst_argb),  // %1

       "+r"(width)      // %2

-    : [kUVToRB]"r"(&kUVToRB),   // %3

-      [kUVToG]"r"(&kUVToG),     // %4

-      [kUVBiasBGR]"r"(&kUVBiasBGR),

-      [kYToRgb]"r"(&kYToRgb)

+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),

+      [kUVToG]"r"(&yuvconstants->kUVToG),

+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),

+      [kYToRgb]"r"(&yuvconstants->kYToRgb)

     : "cc", "memory", "q0", "q1", "q2", "q3", "q4",

       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

);

@@ -754,7 +621,6 @@

 void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,

                      int width) {

   asm volatile (

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pairs of UV

@@ -777,7 +643,6 @@

 void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,

                      int width) {

   asm volatile (

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld1.8     {q0}, [%0]!                    \n"  // load U

@@ -800,7 +665,6 @@

 // Copy multiple of 32.  vld4.8  allow unaligned and is fastest on a15.

 void CopyRow_NEON(const uint8* src, uint8* dst, int count) {

   asm volatile (

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 32

@@ -855,7 +719,6 @@

     "add        %0, %0, %2                     \n"

     "sub        %0, #16                        \n"

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16

@@ -882,7 +745,6 @@

     "add        %0, %0, %3, lsl #1             \n"

     "sub        %0, #16                        \n"

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld2.8     {d0, d1}, [%0], r12            \n"  // src -= 16

@@ -909,7 +771,6 @@

     "add        %0, %0, %2, lsl #2             \n"

     "sub        %0, #16                        \n"

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16

@@ -928,10 +789,9 @@

);

-void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {

+void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {

   asm volatile (

     "vmov.u8    d4, #255                       \n"  // Alpha

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RGB24.

@@ -941,16 +801,15 @@

     "bgt        1b                             \n"

   : "+r"(src_rgb24),  // %0

     "+r"(dst_argb),   // %1

-    "+r"(pix)         // %2

+    "+r"(width)         // %2

   : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List

);

-void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {

+void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {

   asm volatile (

     "vmov.u8    d4, #255                       \n"  // Alpha

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RAW.

@@ -961,12 +820,30 @@

     "bgt        1b                             \n"

   : "+r"(src_raw),   // %0

     "+r"(dst_argb),  // %1

-    "+r"(pix)        // %2

+    "+r"(width)      // %2

   : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List

);

+void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {

+  asm volatile (

+  "1:                                          \n"

+    MEMACCESS(0)

+    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RAW.

+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.

+    "vswp.u8    d1, d3                         \n"  // swap R, B

+    MEMACCESS(1)

+    "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RGB24.

+    "bgt        1b                             \n"

+  : "+r"(src_raw),    // %0

+    "+r"(dst_rgb24),  // %1

+    "+r"(width)       // %2

+  :

+  : "cc", "memory", "d1", "d2", "d3"  // Clobber List

+  );

+}

 #define RGB565TOARGB                                                           \

     "vshrn.u16  d6, q0, #5                     \n"  /* G xxGGGGGG           */ \

     "vuzp.u8    d0, d1                         \n"  /* d0 xxxBBBBB RRRRRxxx */ \

@@ -979,10 +856,9 @@

     "vorr.u8    d2, d1, d5                     \n"  /* R                    */ \

     "vorr.u8    d1, d4, d6                     \n"  /* G                    */

-void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {

+void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {

   asm volatile (

     "vmov.u8    d3, #255                       \n"  // Alpha

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.

@@ -993,7 +869,7 @@

     "bgt        1b                             \n"

   : "+r"(src_rgb565),  // %0

     "+r"(dst_argb),    // %1

-    "+r"(pix)          // %2

+    "+r"(width)          // %2

   : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List

);

@@ -1027,10 +903,9 @@

     "vorr.u8    d1, d4, d6                     \n"  /* G                    */

 void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,

-                            int pix) {

+                            int width) {

   asm volatile (

     "vmov.u8    d3, #255                       \n"  // Alpha

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.

@@ -1041,7 +916,7 @@

     "bgt        1b                             \n"

   : "+r"(src_argb1555),  // %0

     "+r"(dst_argb),    // %1

-    "+r"(pix)          // %2

+    "+r"(width)          // %2

   : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List

);

@@ -1058,10 +933,9 @@

     "vswp.u8    d1, d2                         \n"  /* B,R,G,A -> B,G,R,A   */

 void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,

-                            int pix) {

+                            int width) {

   asm volatile (

     "vmov.u8    d3, #255                       \n"  // Alpha

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.

@@ -1072,15 +946,14 @@

     "bgt        1b                             \n"

   : "+r"(src_argb4444),  // %0

     "+r"(dst_argb),    // %1

-    "+r"(pix)          // %2

+    "+r"(width)          // %2

   : "cc", "memory", "q0", "q1", "q2"  // Clobber List

);

-void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {

+void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {

   asm volatile (

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.

@@ -1090,15 +963,14 @@

     "bgt        1b                             \n"

   : "+r"(src_argb),   // %0

     "+r"(dst_rgb24),  // %1

-    "+r"(pix)         // %2

+    "+r"(width)         // %2

   : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List

);

-void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {

+void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {

   asm volatile (

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.

@@ -1109,15 +981,14 @@

     "bgt        1b                             \n"

   : "+r"(src_argb),  // %0

     "+r"(dst_raw),   // %1

-    "+r"(pix)        // %2

+    "+r"(width)        // %2

   : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List

);

-void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {

+void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {

   asm volatile (

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of YUY2.

@@ -1127,15 +998,14 @@

     "bgt        1b                             \n"

   : "+r"(src_yuy2),  // %0

     "+r"(dst_y),     // %1

-    "+r"(pix)        // %2

+    "+r"(width)        // %2

   : "cc", "memory", "q0", "q1"  // Clobber List

);

-void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {

+void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {

   asm volatile (

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of UYVY.

@@ -1145,7 +1015,7 @@

     "bgt        1b                             \n"

   : "+r"(src_uyvy),  // %0

     "+r"(dst_y),     // %1

-    "+r"(pix)        // %2

+    "+r"(width)        // %2

   : "cc", "memory", "q0", "q1"  // Clobber List

);

@@ -1152,9 +1022,8 @@

 void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,

-                         int pix) {

+                         int width) {

   asm volatile (

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.

@@ -1167,7 +1036,7 @@

   : "+r"(src_yuy2),  // %0

     "+r"(dst_u),     // %1

     "+r"(dst_v),     // %2

-    "+r"(pix)        // %3

+    "+r"(width)        // %3

   : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List

);

@@ -1174,9 +1043,8 @@

 void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,

-                         int pix) {

+                         int width) {

   asm volatile (

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.

@@ -1189,7 +1057,7 @@

   : "+r"(src_uyvy),  // %0

     "+r"(dst_u),     // %1

     "+r"(dst_v),     // %2

-    "+r"(pix)        // %3

+    "+r"(width)        // %3

   : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List

);

@@ -1196,10 +1064,9 @@

 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,

-                      uint8* dst_u, uint8* dst_v, int pix) {

+                      uint8* dst_u, uint8* dst_v, int width) {

   asm volatile (

     "add        %1, %0, %1                     \n"  // stride + src_yuy2

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.

@@ -1217,7 +1084,7 @@

     "+r"(stride_yuy2),  // %1

     "+r"(dst_u),        // %2

     "+r"(dst_v),        // %3

-    "+r"(pix)           // %4

+    "+r"(width)           // %4

   : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"  // Clobber List

);

@@ -1224,10 +1091,9 @@

 void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,

-                      uint8* dst_u, uint8* dst_v, int pix) {

+                      uint8* dst_u, uint8* dst_v, int width) {

   asm volatile (

     "add        %1, %0, %1                     \n"  // stride + src_uyvy

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.

@@ -1245,7 +1111,7 @@

     "+r"(stride_uyvy),  // %1

     "+r"(dst_u),        // %2

     "+r"(dst_v),        // %3

-    "+r"(pix)           // %4

+    "+r"(width)           // %4

   : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"  // Clobber List

);

@@ -1253,7 +1119,7 @@

 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.

 void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,

-                         const uint8* shuffler, int pix) {

+                         const uint8* shuffler, int width) {

   asm volatile (

     MEMACCESS(3)

     "vld1.8     {q2}, [%3]                     \n"  // shuffler

@@ -1268,7 +1134,7 @@

     "bgt        1b                             \n"

   : "+r"(src_argb),  // %0

     "+r"(dst_argb),  // %1

-    "+r"(pix)        // %2

+    "+r"(width)        // %2

   : "r"(shuffler)    // %3

   : "cc", "memory", "q0", "q1", "q2"  // Clobber List

);

@@ -1279,7 +1145,6 @@

                         const uint8* src_v,

                         uint8* dst_yuy2, int width) {

   asm volatile (

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld2.8     {d0, d2}, [%0]!                \n"  // load 16 Ys

@@ -1306,7 +1171,6 @@

                         const uint8* src_v,

                         uint8* dst_uyvy, int width) {

   asm volatile (

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld2.8     {d1, d3}, [%0]!                \n"  // load 16 Ys

@@ -1328,9 +1192,8 @@

);

-void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {

+void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {

   asm volatile (

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.

@@ -1341,7 +1204,7 @@

     "bgt        1b                             \n"

   : "+r"(src_argb),  // %0

     "+r"(dst_rgb565),  // %1

-    "+r"(pix)        // %2

+    "+r"(width)        // %2

   : "cc", "memory", "q0", "q8", "q9", "q10", "q11"

);

@@ -1350,7 +1213,6 @@

 void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,

                                 const uint32 dither4, int width) {

   asm volatile (

-    ".p2align   2                              \n"

     "vdup.32    d2, %2                         \n"  // dither4

   "1:                                          \n"

     MEMACCESS(1)

@@ -1372,9 +1234,8 @@

 void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,

-                            int pix) {

+                            int width) {

   asm volatile (

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.

@@ -1385,7 +1246,7 @@

     "bgt        1b                             \n"

   : "+r"(src_argb),  // %0

     "+r"(dst_argb1555),  // %1

-    "+r"(pix)        // %2

+    "+r"(width)        // %2

   : "cc", "memory", "q0", "q8", "q9", "q10", "q11"

);

@@ -1392,10 +1253,9 @@

 void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,

-                            int pix) {

+                            int width) {

   asm volatile (

     "vmov.u8    d4, #0x0f                      \n"  // bits to clear with vbic.

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.

@@ -1406,19 +1266,18 @@

     "bgt        1b                             \n"

   : "+r"(src_argb),      // %0

     "+r"(dst_argb4444),  // %1

-    "+r"(pix)            // %2

+    "+r"(width)            // %2

   : "cc", "memory", "q0", "q8", "q9", "q10", "q11"

);

-void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {

+void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {

   asm volatile (

     "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient

     "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient

     "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient

     "vmov.u8    d27, #16                       \n"  // Add 16 constant

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.

@@ -1433,18 +1292,35 @@

     "bgt        1b                             \n"

   : "+r"(src_argb),  // %0

     "+r"(dst_y),     // %1

-    "+r"(pix)        // %2

+    "+r"(width)        // %2

   : "cc", "memory", "q0", "q1", "q2", "q12", "q13"

);

-void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {

+// Copies the alpha byte of each ARGB pixel in src_argb to dst_a.

+// Each loop pass reads 64 bytes (16 pixels) from src_argb and writes 16 bytes

+// to dst_a, then subtracts 16 from width and repeats while the result is > 0.

+// The body runs before the first test, so a width that is not a positive

+// multiple of 16 still reads and writes a full 16-pixel group.

+void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) {

   asm volatile (

+  "1:                                          \n"

+    MEMACCESS(0)

+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels

+    // NOTE(review): no MEMACCESS(0) marker precedes this second load, unlike

+    // the first one - confirm whether that is intentional.

+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels

+    "subs       %2, %2, #16                    \n"  // 16 processed per loop

+    MEMACCESS(1)

+    // q3 is d6:d7, i.e. the fourth byte of every pixel from both loads.

+    "vst1.8     {q3}, [%1]!                    \n"  // store 16 A's.

+    "bgt       1b                              \n"

+  : "+r"(src_argb),   // %0

+    "+r"(dst_a),      // %1

+    "+r"(width)       // %2

+  :

+  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List

+  );

+}

+void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {

+  asm volatile (

     "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient

     "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient

     "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.

@@ -1458,7 +1334,7 @@

     "bgt        1b                             \n"

   : "+r"(src_argb),  // %0

     "+r"(dst_y),     // %1

-    "+r"(pix)        // %2

+    "+r"(width)        // %2

   : "cc", "memory", "q0", "q1", "q2", "q12", "q13"

);

@@ -1466,7 +1342,7 @@

 // 8x1 pixels.

 void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,

-                         int pix) {

+                         int width) {

   asm volatile (

     "vmov.u8    d24, #112                      \n"  // UB / VR 0.875 coefficient

     "vmov.u8    d25, #74                       \n"  // UG -0.5781 coefficient

@@ -1474,7 +1350,6 @@

     "vmov.u8    d27, #18                       \n"  // VB -0.1406 coefficient

     "vmov.u8    d28, #94                       \n"  // VG -0.7344 coefficient

     "vmov.u16   q15, #0x8080                   \n"  // 128.5

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.

@@ -1500,65 +1375,15 @@

   : "+r"(src_argb),  // %0

     "+r"(dst_u),     // %1

     "+r"(dst_v),     // %2

-    "+r"(pix)        // %3

+    "+r"(width)        // %3

   : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15"

);

-// 16x1 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.

-void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,

-                         int pix) {

-  asm volatile (

-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient

-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient

-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient

-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient

-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient

-    "vmov.u16   q15, #0x8080                   \n"  // 128.5

-    ".p2align   2                              \n"

-  "1:                                          \n"

-    MEMACCESS(0)

-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.

-    MEMACCESS(0)

-    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.

-    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.

-    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.

-    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.

-    "subs       %3, %3, #16                    \n"  // 16 processed per loop.

-    "vmul.s16   q8, q0, q10                    \n"  // B

-    "vmls.s16   q8, q1, q11                    \n"  // G

-    "vmls.s16   q8, q2, q12                    \n"  // R

-    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned

-    "vmul.s16   q9, q2, q10                    \n"  // R

-    "vmls.s16   q9, q1, q14                    \n"  // G

-    "vmls.s16   q9, q0, q13                    \n"  // B

-    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned

-    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U

-    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V

-    MEMACCESS(1)

-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.

-    MEMACCESS(2)

-    "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.

-    "bgt        1b                             \n"

-  : "+r"(src_argb),  // %0

-    "+r"(dst_u),     // %1

-    "+r"(dst_v),     // %2

-    "+r"(pix)        // %3

-  :

-  : "cc", "memory", "q0", "q1", "q2", "q3",

-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

-  );

-}

-// 32x1 pixels -> 8x1.  pix is number of argb pixels. e.g. 32.

+// 32x1 pixels -> 8x1.  width is number of argb pixels. e.g. 32.

 void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,

-                         int pix) {

+                         int width) {

   asm volatile (

     "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient

     "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient

@@ -1566,7 +1391,6 @@

     "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient

     "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient

     "vmov.u16   q15, #0x8080                   \n"  // 128.5

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.

@@ -1613,7 +1437,7 @@

   : "+r"(src_argb),  // %0

     "+r"(dst_u),     // %1

     "+r"(dst_v),     // %2

-    "+r"(pix)        // %3

+    "+r"(width)        // %3

   : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",

     "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

@@ -1620,7 +1444,7 @@

);

-// 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.

+// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.

 #define RGBTOUV(QB, QG, QR) \

     "vmul.s16   q8, " #QB ", q10               \n"  /* B                    */ \

     "vmls.s16   q8, " #QG ", q11               \n"  /* G                    */ \

@@ -1635,7 +1459,7 @@

 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.

 void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,

-                      uint8* dst_u, uint8* dst_v, int pix) {

+                      uint8* dst_u, uint8* dst_v, int width) {

   asm volatile (

     "add        %1, %0, %1                     \n"  // src_stride + src_argb

     "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient

@@ -1644,7 +1468,6 @@

     "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient

     "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient

     "vmov.u16   q15, #0x8080                   \n"  // 128.5

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.

@@ -1676,7 +1499,7 @@

     "+r"(src_stride_argb),  // %1

     "+r"(dst_u),     // %2

     "+r"(dst_v),     // %3

-    "+r"(pix)        // %4

+    "+r"(width)        // %4

   : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",

     "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

@@ -1685,7 +1508,7 @@

 // TODO(fbarchard): Subsample match C code.

 void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,

-                       uint8* dst_u, uint8* dst_v, int pix) {

+                       uint8* dst_u, uint8* dst_v, int width) {

   asm volatile (

     "add        %1, %0, %1                     \n"  // src_stride + src_argb

     "vmov.s16   q10, #127 / 2                  \n"  // UB / VR 0.500 coefficient

@@ -1694,7 +1517,6 @@

     "vmov.s16   q13, #20 / 2                   \n"  // VB -0.08131 coefficient

     "vmov.s16   q14, #107 / 2                  \n"  // VG -0.41869 coefficient

     "vmov.u16   q15, #0x8080                   \n"  // 128.5

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.

@@ -1726,7 +1548,7 @@

     "+r"(src_stride_argb),  // %1

     "+r"(dst_u),     // %2

     "+r"(dst_v),     // %3

-    "+r"(pix)        // %4

+    "+r"(width)        // %4

   : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",

     "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

@@ -1734,7 +1556,7 @@

 void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,

-                      uint8* dst_u, uint8* dst_v, int pix) {

+                      uint8* dst_u, uint8* dst_v, int width) {

   asm volatile (

     "add        %1, %0, %1                     \n"  // src_stride + src_bgra

     "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient

@@ -1743,7 +1565,6 @@

     "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient

     "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient

     "vmov.u16   q15, #0x8080                   \n"  // 128.5

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 BGRA pixels.

@@ -1775,7 +1596,7 @@

     "+r"(src_stride_bgra),  // %1

     "+r"(dst_u),     // %2

     "+r"(dst_v),     // %3

-    "+r"(pix)        // %4

+    "+r"(width)        // %4

   : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",

     "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

@@ -1783,7 +1604,7 @@

 void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,

-                      uint8* dst_u, uint8* dst_v, int pix) {

+                      uint8* dst_u, uint8* dst_v, int width) {

   asm volatile (

     "add        %1, %0, %1                     \n"  // src_stride + src_abgr

     "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient

@@ -1792,7 +1613,6 @@

     "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient

     "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient

     "vmov.u16   q15, #0x8080                   \n"  // 128.5

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ABGR pixels.

@@ -1824,7 +1644,7 @@

     "+r"(src_stride_abgr),  // %1

     "+r"(dst_u),     // %2

     "+r"(dst_v),     // %3

-    "+r"(pix)        // %4

+    "+r"(width)        // %4

   : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",

     "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

@@ -1832,7 +1652,7 @@

 void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,

-                      uint8* dst_u, uint8* dst_v, int pix) {

+                      uint8* dst_u, uint8* dst_v, int width) {

   asm volatile (

     "add        %1, %0, %1                     \n"  // src_stride + src_rgba

     "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient

@@ -1841,7 +1661,6 @@

     "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient

     "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient

     "vmov.u16   q15, #0x8080                   \n"  // 128.5

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 RGBA pixels.

@@ -1873,7 +1692,7 @@

     "+r"(src_stride_rgba),  // %1

     "+r"(dst_u),     // %2

     "+r"(dst_v),     // %3

-    "+r"(pix)        // %4

+    "+r"(width)        // %4

   : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",

     "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

@@ -1881,7 +1700,7 @@

 void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,

-                       uint8* dst_u, uint8* dst_v, int pix) {

+                       uint8* dst_u, uint8* dst_v, int width) {

   asm volatile (

     "add        %1, %0, %1                     \n"  // src_stride + src_rgb24

     "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient

@@ -1890,7 +1709,6 @@

     "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient

     "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient

     "vmov.u16   q15, #0x8080                   \n"  // 128.5

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RGB24 pixels.

@@ -1922,7 +1740,7 @@

     "+r"(src_stride_rgb24),  // %1

     "+r"(dst_u),     // %2

     "+r"(dst_v),     // %3

-    "+r"(pix)        // %4

+    "+r"(width)        // %4

   : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",

     "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

@@ -1930,7 +1748,7 @@

 void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,

-                     uint8* dst_u, uint8* dst_v, int pix) {

+                     uint8* dst_u, uint8* dst_v, int width) {

   asm volatile (

     "add        %1, %0, %1                     \n"  // src_stride + src_raw

     "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient

@@ -1939,7 +1757,6 @@

     "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient

     "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient

     "vmov.u16   q15, #0x8080                   \n"  // 128.5

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RAW pixels.

@@ -1971,7 +1788,7 @@

     "+r"(src_stride_raw),  // %1

     "+r"(dst_u),     // %2

     "+r"(dst_v),     // %3

-    "+r"(pix)        // %4

+    "+r"(width)        // %4

   : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",

     "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

@@ -1978,9 +1795,9 @@

);

-// 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.

+// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.

 void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,

-                        uint8* dst_u, uint8* dst_v, int pix) {

+                        uint8* dst_u, uint8* dst_v, int width) {

   asm volatile (

     "add        %1, %0, %1                     \n"  // src_stride + src_argb

     "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient

@@ -1989,7 +1806,6 @@

     "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient

     "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient

     "vmov.u16   q15, #0x8080                   \n"  // 128.5

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.

@@ -2041,7 +1857,7 @@

     "+r"(src_stride_rgb565),  // %1

     "+r"(dst_u),     // %2

     "+r"(dst_v),     // %3

-    "+r"(pix)        // %4

+    "+r"(width)        // %4

   : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",

     "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

@@ -2048,9 +1864,9 @@

);

-// 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.

+// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.

 void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,

-                        uint8* dst_u, uint8* dst_v, int pix) {

+                        uint8* dst_u, uint8* dst_v, int width) {

   asm volatile (

     "add        %1, %0, %1                     \n"  // src_stride + src_argb

     "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient

@@ -2059,7 +1875,6 @@

     "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient

     "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient

     "vmov.u16   q15, #0x8080                   \n"  // 128.5

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.

@@ -2111,7 +1926,7 @@

     "+r"(src_stride_argb1555),  // %1

     "+r"(dst_u),     // %2

     "+r"(dst_v),     // %3

-    "+r"(pix)        // %4

+    "+r"(width)        // %4

   : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",

     "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

@@ -2118,9 +1933,9 @@

);

-// 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.

+// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.

 void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,

-                          uint8* dst_u, uint8* dst_v, int pix) {

+                          uint8* dst_u, uint8* dst_v, int width) {

   asm volatile (

     "add        %1, %0, %1                     \n"  // src_stride + src_argb

     "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient

@@ -2129,7 +1944,6 @@

     "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient

     "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient

     "vmov.u16   q15, #0x8080                   \n"  // 128.5

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.

@@ -2181,7 +1995,7 @@

     "+r"(src_stride_argb4444),  // %1

     "+r"(dst_u),     // %2

     "+r"(dst_v),     // %3

-    "+r"(pix)        // %4

+    "+r"(width)        // %4

   : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",

     "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

@@ -2188,13 +2002,12 @@

);

-void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {

+void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {

   asm volatile (

     "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient

     "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient

     "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient

     "vmov.u8    d27, #16                       \n"  // Add 16 constant

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.

@@ -2210,19 +2023,18 @@

     "bgt        1b                             \n"

   : "+r"(src_rgb565),  // %0

     "+r"(dst_y),       // %1

-    "+r"(pix)          // %2

+    "+r"(width)          // %2

   : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"

);

-void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {

+void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {

   asm volatile (

     "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient

     "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient

     "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient

     "vmov.u8    d27, #16                       \n"  // Add 16 constant

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.

@@ -2238,19 +2050,18 @@

     "bgt        1b                             \n"

   : "+r"(src_argb1555),  // %0

     "+r"(dst_y),         // %1

-    "+r"(pix)            // %2

+    "+r"(width)            // %2

   : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"

);

-void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {

+void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {

   asm volatile (

     "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient

     "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient

     "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient

     "vmov.u8    d27, #16                       \n"  // Add 16 constant

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.

@@ -2266,19 +2077,18 @@

     "bgt        1b                             \n"

   : "+r"(src_argb4444),  // %0

     "+r"(dst_y),         // %1

-    "+r"(pix)            // %2

+    "+r"(width)            // %2

   : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"

);

-void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {

+void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {

   asm volatile (

     "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient

     "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient

     "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient

     "vmov.u8    d7, #16                        \n"  // Add 16 constant

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of BGRA.

@@ -2293,19 +2103,18 @@

     "bgt        1b                             \n"

   : "+r"(src_bgra),  // %0

     "+r"(dst_y),     // %1

-    "+r"(pix)        // %2

+    "+r"(width)        // %2

   : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"

);

-void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {

+void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {

   asm volatile (

     "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient

     "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient

     "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient

     "vmov.u8    d7, #16                        \n"  // Add 16 constant

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ABGR.

@@ -2320,19 +2129,18 @@

     "bgt        1b                             \n"

   : "+r"(src_abgr),  // %0

     "+r"(dst_y),  // %1

-    "+r"(pix)        // %2

+    "+r"(width)        // %2

   : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"

);

-void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {

+void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {

   asm volatile (

     "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient

     "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient

     "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient

     "vmov.u8    d7, #16                        \n"  // Add 16 constant

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of RGBA.

@@ -2347,19 +2155,18 @@

     "bgt        1b                             \n"

   : "+r"(src_rgba),  // %0

     "+r"(dst_y),  // %1

-    "+r"(pix)        // %2

+    "+r"(width)        // %2

   : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"

);

-void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {

+void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {

   asm volatile (

     "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient

     "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient

     "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient

     "vmov.u8    d7, #16                        \n"  // Add 16 constant

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RGB24.

@@ -2374,19 +2181,18 @@

     "bgt        1b                             \n"

   : "+r"(src_rgb24),  // %0

     "+r"(dst_y),  // %1

-    "+r"(pix)        // %2

+    "+r"(width)        // %2

   : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"

);

-void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {

+void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {

   asm volatile (

     "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient

     "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient

     "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient

     "vmov.u8    d7, #16                        \n"  // Add 16 constant

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RAW.

@@ -2401,7 +2207,7 @@

     "bgt        1b                             \n"

   : "+r"(src_raw),  // %0

     "+r"(dst_y),  // %1

-    "+r"(pix)        // %2

+    "+r"(width)        // %2

   : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"

);

@@ -2411,16 +2217,13 @@

 void InterpolateRow_NEON(uint8* dst_ptr,

                          const uint8* src_ptr, ptrdiff_t src_stride,

                          int dst_width, int source_y_fraction) {

+  int y1_fraction = source_y_fraction;

   asm volatile (

     "cmp        %4, #0                         \n"

     "beq        100f                           \n"

     "add        %2, %1                         \n"

-    "cmp        %4, #64                        \n"

-    "beq        75f                            \n"

     "cmp        %4, #128                       \n"

     "beq        50f                            \n"

-    "cmp        %4, #192                       \n"

-    "beq        25f                            \n"

     "vdup.8     d5, %4                         \n"

     "rsb        %4, #256                       \n"

@@ -2443,20 +2246,6 @@

     "bgt        1b                             \n"

     "b          99f                            \n"

-    // Blend 25 / 75.

-  "25:                                         \n"

-    MEMACCESS(1)

-    "vld1.8     {q0}, [%1]!                    \n"

-    MEMACCESS(2)

-    "vld1.8     {q1}, [%2]!                    \n"

-    "subs       %3, %3, #16                    \n"

-    "vrhadd.u8  q0, q1                         \n"

-    "vrhadd.u8  q0, q1                         \n"

-    MEMACCESS(0)

-    "vst1.8     {q0}, [%0]!                    \n"

-    "bgt        25b                            \n"

-    "b          99f                            \n"

     // Blend 50 / 50.

   "50:                                         \n"

     MEMACCESS(1)

@@ -2470,20 +2259,6 @@

     "bgt        50b                            \n"

     "b          99f                            \n"

-    // Blend 75 / 25.

-  "75:                                         \n"

-    MEMACCESS(1)

-    "vld1.8     {q1}, [%1]!                    \n"

-    MEMACCESS(2)

-    "vld1.8     {q0}, [%2]!                    \n"

-    "subs       %3, %3, #16                    \n"

-    "vrhadd.u8  q0, q1                         \n"

-    "vrhadd.u8  q0, q1                         \n"

-    MEMACCESS(0)

-    "vst1.8     {q0}, [%0]!                    \n"

-    "bgt        75b                            \n"

-    "b          99f                            \n"

     // Blend 100 / 0 - Copy row unchanged.

   "100:                                        \n"

     MEMACCESS(1)

@@ -2498,7 +2273,7 @@

     "+r"(src_ptr),          // %1

     "+r"(src_stride),       // %2

     "+r"(dst_width),        // %3

-    "+r"(source_y_fraction) // %4

+    "+r"(y1_fraction)       // %4

   : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"

);

@@ -2605,7 +2380,6 @@

     "vdup.u16   q10, %4                        \n"  // interval add

     // 8 pixel loop.

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld4.8     {d0, d2, d4, d6}, [%0]         \n"  // load 8 pixels of ARGB.

@@ -2648,7 +2422,6 @@

     "vshr.u16   q0, q0, #1                     \n"  // scale / 2.

     // 8 pixel loop.

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld4.8     {d20, d22, d24, d26}, [%0]!    \n"  // load 8 pixels of ARGB.

@@ -2684,7 +2457,6 @@

     "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient

     "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient

     "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.

@@ -2721,7 +2493,6 @@

     "vmov.u8    d28, #24                       \n"  // BB coefficient

     "vmov.u8    d29, #98                       \n"  // BG coefficient

     "vmov.u8    d30, #50                       \n"  // BR coefficient

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld4.8     {d0, d1, d2, d3}, [%0]         \n"  // load 8 ARGB pixels.

@@ -2760,7 +2531,6 @@

     "vmovl.s8   q0, d4                         \n"  // B,G coefficients s16.

     "vmovl.s8   q1, d5                         \n"  // R,A coefficients s16.

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld4.8     {d16, d18, d20, d22}, [%0]!    \n"  // load 8 ARGB pixels.

@@ -2813,14 +2583,11 @@

);

-// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.

-#ifdef HAS_ARGBMULTIPLYROW_NEON

 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.

 void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,

                           uint8* dst_argb, int width) {

   asm volatile (

     // 8 pixel loop.

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.

@@ -2847,7 +2614,6 @@

   : "cc", "memory", "q0", "q1", "q2", "q3"

);

-#endif  // HAS_ARGBMULTIPLYROW_NEON

 // Add 2 rows of ARGB pixels together, 8 pixels at a time.

 void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,

@@ -2854,7 +2620,6 @@

                      uint8* dst_argb, int width) {

   asm volatile (

     // 8 pixel loop.

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.

@@ -2881,7 +2646,6 @@

                           uint8* dst_argb, int width) {

   asm volatile (

     // 8 pixel loop.

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.

@@ -2913,7 +2677,6 @@

   asm volatile (

     "vmov.u8    d3, #255                       \n"  // alpha

     // 8 pixel loop.

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld1.8     {d0}, [%0]!                    \n"  // load 8 sobelx.

@@ -2940,7 +2703,6 @@

                           uint8* dst_y, int width) {

   asm volatile (

     // 16 pixel loop.

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld1.8     {q0}, [%0]!                    \n"  // load 16 sobelx.

@@ -2970,7 +2732,6 @@

   asm volatile (

     "vmov.u8    d3, #255                       \n"  // alpha

     // 8 pixel loop.

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld1.8     {d2}, [%0]!                    \n"  // load 8 sobelx.

@@ -2997,7 +2758,6 @@

 void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,

                     const uint8* src_y2, uint8* dst_sobelx, int width) {

   asm volatile (

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld1.8     {d0}, [%0],%5                  \n"  // top

@@ -3041,7 +2801,6 @@

 void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,

                     uint8* dst_sobely, int width) {

   asm volatile (

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld1.8     {d0}, [%0],%4                  \n"  // left

--- a/third_party/libyuv/source/row_neon64.cc

+++ b/third_party/libyuv/source/row_neon64.cc

@@ -91,17 +91,15 @@

     "uzp2       v3.8b, v2.8b, v2.8b            \n"                             \

     "ins        v1.s[1], v3.s[0]               \n"

-#define YUV422TORGB_SETUP_REG                                                  \

+#define YUVTORGB_SETUP                                                         \

     "ld1r       {v24.8h}, [%[kUVBiasBGR]], #2  \n"                             \

     "ld1r       {v25.8h}, [%[kUVBiasBGR]], #2  \n"                             \

     "ld1r       {v26.8h}, [%[kUVBiasBGR]]      \n"                             \

     "ld1r       {v31.4s}, [%[kYToRgb]]         \n"                             \

-    "movi       v27.8h, #128                   \n"                             \

-    "movi       v28.8h, #102                   \n"                             \

-    "movi       v29.8h, #25                    \n"                             \

-    "movi       v30.8h, #52                    \n"

+    "ld2        {v27.8h, v28.8h}, [%[kUVToRB]] \n"                             \

+    "ld2        {v29.8h, v30.8h}, [%[kUVToG]]  \n"

-#define YUV422TORGB(vR, vG, vB)                                                \

+#define YUVTORGB(vR, vG, vB)                                                   \

     "uxtl       v0.8h, v0.8b                   \n" /* Extract Y    */          \

     "shll       v2.8h, v1.8b, #8               \n" /* Replicate UV */          \

     "ushll2     v3.4s, v0.8h, #0               \n" /* Y */                     \

@@ -129,57 +127,19 @@

     "sqshrun    " #vG ".8b, " #vG ".8h, #6     \n" /* G */                     \

     "sqshrun    " #vR ".8b, " #vR ".8h, #6     \n" /* R */                     \

-// YUV to RGB conversion constants.

-// Y contribution to R,G,B.  Scale and bias.

-#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */

-#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */

-// U and V contributions to R,G,B.

-#define UB -128 /* -min(128, round(2.018 * 64)) */

-#define UG 25 /* -round(-0.391 * 64) */

-#define VG 52 /* -round(-0.813 * 64) */

-#define VR -102 /* -round(1.596 * 64) */

-// Bias values to subtract 16 from Y and 128 from U and V.

-#define BB (UB * 128            - YGB)

-#define BG (UG * 128 + VG * 128 - YGB)

-#define BR            (VR * 128 - YGB)

-static vec16 kUVBiasBGR = { BB, BG, BR, 0, 0, 0, 0, 0 };

-static vec32 kYToRgb = { 0x0101 * YG, 0, 0, 0 };

-#undef YG

-#undef YGB

-#undef UB

-#undef UG

-#undef VG

-#undef VR

-#undef BB

-#undef BG

-#undef BR

-#define RGBTOUV_SETUP_REG                                                      \

-    "movi       v20.8h, #56, lsl #0  \n"  /* UB/VR coefficient (0.875) / 2 */  \

-    "movi       v21.8h, #37, lsl #0  \n"  /* UG coefficient (-0.5781) / 2  */  \

-    "movi       v22.8h, #19, lsl #0  \n"  /* UR coefficient (-0.2969) / 2  */  \

-    "movi       v23.8h, #9,  lsl #0  \n"  /* VB coefficient (-0.1406) / 2  */  \

-    "movi       v24.8h, #47, lsl #0  \n"  /* VG coefficient (-0.7344) / 2  */  \

-    "movi       v25.16b, #0x80       \n"  /* 128.5 (0x8080 in 16-bit)      */

-#ifdef HAS_I444TOARGBROW_NEON

 void I444ToARGBRow_NEON(const uint8* src_y,

                         const uint8* src_u,

                         const uint8* src_v,

                         uint8* dst_argb,

+                        const struct YuvConstants* yuvconstants,

                         int width) {

   asm volatile (

-    YUV422TORGB_SETUP_REG

+    YUVTORGB_SETUP

+    "movi       v23.8b, #255                   \n" /* A */

   "1:                                          \n"

     READYUV444

-    YUV422TORGB(v22, v21, v20)

-    "subs       %w4, %w4, #8                 \n"

-    "movi       v23.8b, #255                   \n" /* A */

+    YUVTORGB(v22, v21, v20)

+    "subs       %w4, %w4, #8                   \n"

     MEMACCESS(3)

     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"

     "b.gt       1b                             \n"

@@ -188,27 +148,28 @@

       "+r"(src_v),     // %2

       "+r"(dst_argb),  // %3

       "+r"(width)      // %4

-    : [kUVBiasBGR]"r"(&kUVBiasBGR),

-      [kYToRgb]"r"(&kYToRgb)

+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),

+      [kUVToG]"r"(&yuvconstants->kUVToG),

+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),

+      [kYToRgb]"r"(&yuvconstants->kYToRgb)

     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",

       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"

);

-#endif  // HAS_I444TOARGBROW_NEON

-#ifdef HAS_I422TOARGBROW_NEON

 void I422ToARGBRow_NEON(const uint8* src_y,

                         const uint8* src_u,

                         const uint8* src_v,

                         uint8* dst_argb,

+                        const struct YuvConstants* yuvconstants,

                         int width) {

   asm volatile (

-    YUV422TORGB_SETUP_REG

+    YUVTORGB_SETUP

+    "movi       v23.8b, #255                   \n" /* A */

   "1:                                          \n"

     READYUV422

-    YUV422TORGB(v22, v21, v20)

+    YUVTORGB(v22, v21, v20)

     "subs       %w4, %w4, #8                   \n"

-    "movi       v23.8b, #255                   \n" /* A */

     MEMACCESS(3)

     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"

     "b.gt       1b                             \n"

@@ -217,85 +178,61 @@

       "+r"(src_v),     // %2

       "+r"(dst_argb),  // %3

       "+r"(width)      // %4

-    : [kUVBiasBGR]"r"(&kUVBiasBGR),

-      [kYToRgb]"r"(&kYToRgb)

+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),

+      [kUVToG]"r"(&yuvconstants->kUVToG),

+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),

+      [kYToRgb]"r"(&yuvconstants->kYToRgb)

     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",

       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"

);

-#endif  // HAS_I422TOARGBROW_NEON

-#ifdef HAS_I411TOARGBROW_NEON

-void I411ToARGBRow_NEON(const uint8* src_y,

-                        const uint8* src_u,

-                        const uint8* src_v,

-                        uint8* dst_argb,

-                        int width) {

+void I422AlphaToARGBRow_NEON(const uint8* src_y,

+                             const uint8* src_u,

+                             const uint8* src_v,

+                             const uint8* src_a,

+                             uint8* dst_argb,

+                             const struct YuvConstants* yuvconstants,

+                             int width) {

   asm volatile (

-    YUV422TORGB_SETUP_REG

+    YUVTORGB_SETUP

   "1:                                          \n"

-    READYUV411

-    YUV422TORGB(v22, v21, v20)

-    "subs       %w4, %w4, #8                   \n"

-    "movi       v23.8b, #255                   \n" /* A */

-    MEMACCESS(3)

-    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"

-    "b.gt       1b                             \n"

-    : "+r"(src_y),     // %0

-      "+r"(src_u),     // %1

-      "+r"(src_v),     // %2

-      "+r"(dst_argb),  // %3

-      "+r"(width)      // %4

-    : [kUVBiasBGR]"r"(&kUVBiasBGR),

-      [kYToRgb]"r"(&kYToRgb)

-    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",

-      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"

-  );

-}

-#endif  // HAS_I411TOARGBROW_NEON

-#ifdef HAS_I422TOBGRAROW_NEON

-void I422ToBGRARow_NEON(const uint8* src_y,

-                        const uint8* src_u,

-                        const uint8* src_v,

-                        uint8* dst_bgra,

-                        int width) {

-  asm volatile (

-    YUV422TORGB_SETUP_REG

-  "1:                                          \n"

     READYUV422

-    YUV422TORGB(v21, v22, v23)

-    "subs       %w4, %w4, #8                   \n"

-    "movi       v20.8b, #255                   \n" /* A */

+    YUVTORGB(v22, v21, v20)

     MEMACCESS(3)

-    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"

+    "ld1        {v23.8b}, [%3], #8             \n"

+    "subs       %w5, %w5, #8                   \n"

+    MEMACCESS(4)

+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32     \n"

     "b.gt       1b                             \n"

     : "+r"(src_y),     // %0

       "+r"(src_u),     // %1

       "+r"(src_v),     // %2

-      "+r"(dst_bgra),  // %3

-      "+r"(width)      // %4

-    : [kUVBiasBGR]"r"(&kUVBiasBGR),

-      [kYToRgb]"r"(&kYToRgb)

+      "+r"(src_a),     // %3

+      "+r"(dst_argb),  // %4

+      "+r"(width)      // %5

+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),

+      [kUVToG]"r"(&yuvconstants->kUVToG),

+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),

+      [kYToRgb]"r"(&yuvconstants->kYToRgb)

     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",

       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"

);

-#endif  // HAS_I422TOBGRAROW_NEON

-#ifdef HAS_I422TOABGRROW_NEON

-void I422ToABGRRow_NEON(const uint8* src_y,

+void I411ToARGBRow_NEON(const uint8* src_y,

                         const uint8* src_u,

                         const uint8* src_v,

-                        uint8* dst_abgr,

+                        uint8* dst_argb,

+                        const struct YuvConstants* yuvconstants,

                         int width) {

   asm volatile (

-    YUV422TORGB_SETUP_REG

+    YUVTORGB_SETUP

+    "movi       v23.8b, #255                   \n" /* A */

   "1:                                          \n"

-    READYUV422

-    YUV422TORGB(v20, v21, v22)

+    READYUV411

+    YUVTORGB(v22, v21, v20)

     "subs       %w4, %w4, #8                   \n"

-    "movi       v23.8b, #255                   \n" /* A */

     MEMACCESS(3)

     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"

     "b.gt       1b                             \n"

@@ -302,29 +239,30 @@

     : "+r"(src_y),     // %0

       "+r"(src_u),     // %1

       "+r"(src_v),     // %2

-      "+r"(dst_abgr),  // %3

+      "+r"(dst_argb),  // %3

       "+r"(width)      // %4

-    : [kUVBiasBGR]"r"(&kUVBiasBGR),

-      [kYToRgb]"r"(&kYToRgb)

+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),

+      [kUVToG]"r"(&yuvconstants->kUVToG),

+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),

+      [kYToRgb]"r"(&yuvconstants->kYToRgb)

     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",

       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"

);

-#endif  // HAS_I422TOABGRROW_NEON

-#ifdef HAS_I422TORGBAROW_NEON

 void I422ToRGBARow_NEON(const uint8* src_y,

                         const uint8* src_u,

                         const uint8* src_v,

                         uint8* dst_rgba,

+                        const struct YuvConstants* yuvconstants,

                         int width) {

   asm volatile (

-    YUV422TORGB_SETUP_REG

+    YUVTORGB_SETUP

+    "movi       v20.8b, #255                   \n" /* A */

   "1:                                          \n"

     READYUV422

-    YUV422TORGB(v23, v22, v21)

+    YUVTORGB(v23, v22, v21)

     "subs       %w4, %w4, #8                   \n"

-    "movi       v20.8b, #255                   \n" /* A */

     MEMACCESS(3)

     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"

     "b.gt       1b                             \n"

@@ -333,25 +271,26 @@

       "+r"(src_v),     // %2

       "+r"(dst_rgba),  // %3

       "+r"(width)      // %4

-    : [kUVBiasBGR]"r"(&kUVBiasBGR),

-      [kYToRgb]"r"(&kYToRgb)

+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),

+      [kUVToG]"r"(&yuvconstants->kUVToG),

+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),

+      [kYToRgb]"r"(&yuvconstants->kYToRgb)

     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",

       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"

);

-#endif  // HAS_I422TORGBAROW_NEON

-#ifdef HAS_I422TORGB24ROW_NEON

 void I422ToRGB24Row_NEON(const uint8* src_y,

                          const uint8* src_u,

                          const uint8* src_v,

                          uint8* dst_rgb24,

+                         const struct YuvConstants* yuvconstants,

                          int width) {

   asm volatile (

-    YUV422TORGB_SETUP_REG

+    YUVTORGB_SETUP

   "1:                                          \n"

     READYUV422

-    YUV422TORGB(v22, v21, v20)

+    YUVTORGB(v22, v21, v20)

     "subs       %w4, %w4, #8                   \n"

     MEMACCESS(3)

     "st3        {v20.8b,v21.8b,v22.8b}, [%3], #24     \n"

@@ -361,60 +300,33 @@

       "+r"(src_v),     // %2

       "+r"(dst_rgb24), // %3

       "+r"(width)      // %4

-    : [kUVBiasBGR]"r"(&kUVBiasBGR),

-      [kYToRgb]"r"(&kYToRgb)

+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),

+      [kUVToG]"r"(&yuvconstants->kUVToG),

+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),

+      [kYToRgb]"r"(&yuvconstants->kYToRgb)

     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",

       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"

);

-#endif  // HAS_I422TORGB24ROW_NEON

-#ifdef HAS_I422TORAWROW_NEON

-void I422ToRAWRow_NEON(const uint8* src_y,

-                       const uint8* src_u,

-                       const uint8* src_v,

-                       uint8* dst_raw,

-                       int width) {

-  asm volatile (

-    YUV422TORGB_SETUP_REG

-  "1:                                          \n"

-    READYUV422

-    YUV422TORGB(v20, v21, v22)

-    "subs       %w4, %w4, #8                   \n"

-    MEMACCESS(3)

-    "st3        {v20.8b,v21.8b,v22.8b}, [%3], #24     \n"

-    "b.gt       1b                             \n"

-    : "+r"(src_y),     // %0

-      "+r"(src_u),     // %1

-      "+r"(src_v),     // %2

-      "+r"(dst_raw),   // %3

-      "+r"(width)      // %4

-    : [kUVBiasBGR]"r"(&kUVBiasBGR),

-      [kYToRgb]"r"(&kYToRgb)

-    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",

-      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"

-  );

-}

-#endif  // HAS_I422TORAWROW_NEON

 #define ARGBTORGB565                                                           \

     "shll       v0.8h,  v22.8b, #8             \n"  /* R                    */ \

-    "shll       v20.8h, v20.8b, #8             \n"  /* B                    */ \

     "shll       v21.8h, v21.8b, #8             \n"  /* G                    */ \

+    "shll       v20.8h, v20.8b, #8             \n"  /* B                    */ \

     "sri        v0.8h,  v21.8h, #5             \n"  /* RG                   */ \

     "sri        v0.8h,  v20.8h, #11            \n"  /* RGB                  */

-#ifdef HAS_I422TORGB565ROW_NEON

 void I422ToRGB565Row_NEON(const uint8* src_y,

                           const uint8* src_u,

                           const uint8* src_v,

                           uint8* dst_rgb565,

+                          const struct YuvConstants* yuvconstants,

                           int width) {

   asm volatile (

-    YUV422TORGB_SETUP_REG

+    YUVTORGB_SETUP

   "1:                                          \n"

     READYUV422

-    YUV422TORGB(v22, v21, v20)

+    YUVTORGB(v22, v21, v20)

     "subs       %w4, %w4, #8                   \n"

     ARGBTORGB565

     MEMACCESS(3)

@@ -425,36 +337,37 @@

       "+r"(src_v),    // %2

       "+r"(dst_rgb565),  // %3

       "+r"(width)     // %4

-    : [kUVBiasBGR]"r"(&kUVBiasBGR),

-      [kYToRgb]"r"(&kYToRgb)

+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),

+      [kUVToG]"r"(&yuvconstants->kUVToG),

+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),

+      [kYToRgb]"r"(&yuvconstants->kYToRgb)

     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",

       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"

);

-#endif  // HAS_I422TORGB565ROW_NEON

 #define ARGBTOARGB1555                                                         \

     "shll       v0.8h,  v23.8b, #8             \n"  /* A                    */ \

     "shll       v22.8h, v22.8b, #8             \n"  /* R                    */ \

-    "shll       v20.8h, v20.8b, #8             \n"  /* B                    */ \

     "shll       v21.8h, v21.8b, #8             \n"  /* G                    */ \

+    "shll       v20.8h, v20.8b, #8             \n"  /* B                    */ \

     "sri        v0.8h,  v22.8h, #1             \n"  /* AR                   */ \

     "sri        v0.8h,  v21.8h, #6             \n"  /* ARG                  */ \

     "sri        v0.8h,  v20.8h, #11            \n"  /* ARGB                 */

-#ifdef HAS_I422TOARGB1555ROW_NEON

 void I422ToARGB1555Row_NEON(const uint8* src_y,

                             const uint8* src_u,

                             const uint8* src_v,

                             uint8* dst_argb1555,

+                            const struct YuvConstants* yuvconstants,

                             int width) {

   asm volatile (

-    YUV422TORGB_SETUP_REG

+    YUVTORGB_SETUP

+    "movi       v23.8b, #255                   \n"

   "1:                                          \n"

     READYUV422

-    YUV422TORGB(v22, v21, v20)

+    YUVTORGB(v22, v21, v20)

     "subs       %w4, %w4, #8                   \n"

-    "movi       v23.8b, #255                   \n"

     ARGBTOARGB1555

     MEMACCESS(3)

     "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels RGB565.

@@ -464,13 +377,14 @@

       "+r"(src_v),    // %2

       "+r"(dst_argb1555),  // %3

       "+r"(width)     // %4

-    : [kUVBiasBGR]"r"(&kUVBiasBGR),

-      [kYToRgb]"r"(&kYToRgb)

+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),

+      [kUVToG]"r"(&yuvconstants->kUVToG),

+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),

+      [kYToRgb]"r"(&yuvconstants->kYToRgb)

     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",

       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"

);

-#endif  // HAS_I422TOARGB1555ROW_NEON

 #define ARGBTOARGB4444                                                         \

     /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f        */ \

@@ -482,18 +396,18 @@

     "orr        v1.8b,  v22.8b, v23.8b         \n"  /* RA                   */ \

     "zip1       v0.16b, v0.16b, v1.16b         \n"  /* BGRA                 */

-#ifdef HAS_I422TOARGB4444ROW_NEON

 void I422ToARGB4444Row_NEON(const uint8* src_y,

                             const uint8* src_u,

                             const uint8* src_v,

                             uint8* dst_argb4444,

+                            const struct YuvConstants* yuvconstants,

                             int width) {

   asm volatile (

-    YUV422TORGB_SETUP_REG

+    YUVTORGB_SETUP

     "movi       v4.16b, #0x0f                  \n"  // bits to clear with vbic.

   "1:                                          \n"

     READYUV422

-    YUV422TORGB(v22, v21, v20)

+    YUVTORGB(v22, v21, v20)

     "subs       %w4, %w4, #8                   \n"

     "movi       v23.8b, #255                   \n"

     ARGBTOARGB4444

@@ -505,41 +419,40 @@

       "+r"(src_v),    // %2

       "+r"(dst_argb4444),  // %3

       "+r"(width)     // %4

-    : [kUVBiasBGR]"r"(&kUVBiasBGR),

-      [kYToRgb]"r"(&kYToRgb)

+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),

+      [kUVToG]"r"(&yuvconstants->kUVToG),

+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),

+      [kYToRgb]"r"(&yuvconstants->kYToRgb)

     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",

       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"

);

-#endif  // HAS_I422TOARGB4444ROW_NEON

-#ifdef HAS_I400TOARGBROW_NEON

 void I400ToARGBRow_NEON(const uint8* src_y,

                         uint8* dst_argb,

                         int width) {

-  int64 width64 = (int64)(width);

   asm volatile (

-    YUV422TORGB_SETUP_REG

+    YUVTORGB_SETUP

+    "movi       v23.8b, #255                   \n"

   "1:                                          \n"

     READYUV400

-    YUV422TORGB(v22, v21, v20)

+    YUVTORGB(v22, v21, v20)

     "subs       %w2, %w2, #8                   \n"

-    "movi       v23.8b, #255                   \n"

     MEMACCESS(1)

     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32     \n"

     "b.gt       1b                             \n"

     : "+r"(src_y),     // %0

       "+r"(dst_argb),  // %1

-      "+r"(width64)    // %2

-    : [kUVBiasBGR]"r"(&kUVBiasBGR),

-      [kYToRgb]"r"(&kYToRgb)

+      "+r"(width)      // %2

+    : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB),

+      [kUVToG]"r"(&kYuvI601Constants.kUVToG),

+      [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR),

+      [kYToRgb]"r"(&kYuvI601Constants.kYToRgb)

     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",

       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"

);

-#endif  // HAS_I400TOARGBROW_NEON

-#ifdef HAS_J400TOARGBROW_NEON

 void J400ToARGBRow_NEON(const uint8* src_y,

                         uint8* dst_argb,

                         int width) {

@@ -561,20 +474,19 @@

     : "cc", "memory", "v20", "v21", "v22", "v23"

);

-#endif  // HAS_J400TOARGBROW_NEON

-#ifdef HAS_NV12TOARGBROW_NEON

 void NV12ToARGBRow_NEON(const uint8* src_y,

                         const uint8* src_uv,

                         uint8* dst_argb,

+                        const struct YuvConstants* yuvconstants,

                         int width) {

   asm volatile (

-    YUV422TORGB_SETUP_REG

+    YUVTORGB_SETUP

+    "movi       v23.8b, #255                   \n"

   "1:                                          \n"

     READNV12

-    YUV422TORGB(v22, v21, v20)

+    YUVTORGB(v22, v21, v20)

     "subs       %w3, %w3, #8                   \n"

-    "movi       v23.8b, #255                   \n"

     MEMACCESS(2)

     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32     \n"

     "b.gt       1b                             \n"

@@ -582,51 +494,53 @@

       "+r"(src_uv),    // %1

       "+r"(dst_argb),  // %2

       "+r"(width)      // %3

-    : [kUVBiasBGR]"r"(&kUVBiasBGR),

-      [kYToRgb]"r"(&kYToRgb)

+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),

+      [kUVToG]"r"(&yuvconstants->kUVToG),

+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),

+      [kYToRgb]"r"(&yuvconstants->kYToRgb)

     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",

       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"

);

-#endif  // HAS_NV12TOARGBROW_NEON

-#ifdef HAS_NV21TOARGBROW_NEON

 void NV21ToARGBRow_NEON(const uint8* src_y,

-                        const uint8* src_uv,

+                        const uint8* src_vu,

                         uint8* dst_argb,

+                        const struct YuvConstants* yuvconstants,

                         int width) {

   asm volatile (

-    YUV422TORGB_SETUP_REG

+    YUVTORGB_SETUP

+    "movi       v23.8b, #255                   \n"

   "1:                                          \n"

     READNV21

-    YUV422TORGB(v22, v21, v20)

+    YUVTORGB(v22, v21, v20)

     "subs       %w3, %w3, #8                   \n"

-    "movi       v23.8b, #255                   \n"

     MEMACCESS(2)

     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32     \n"

     "b.gt       1b                             \n"

     : "+r"(src_y),     // %0

-      "+r"(src_uv),    // %1

+      "+r"(src_vu),    // %1

       "+r"(dst_argb),  // %2

       "+r"(width)      // %3

-    : [kUVBiasBGR]"r"(&kUVBiasBGR),

-      [kYToRgb]"r"(&kYToRgb)

+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),

+      [kUVToG]"r"(&yuvconstants->kUVToG),

+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),

+      [kYToRgb]"r"(&yuvconstants->kYToRgb)

     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",

       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"

);

-#endif  // HAS_NV21TOARGBROW_NEON

-#ifdef HAS_NV12TORGB565ROW_NEON

 void NV12ToRGB565Row_NEON(const uint8* src_y,

                           const uint8* src_uv,

                           uint8* dst_rgb565,

+                          const struct YuvConstants* yuvconstants,

                           int width) {

   asm volatile (

-    YUV422TORGB_SETUP_REG

+    YUVTORGB_SETUP

   "1:                                          \n"

     READNV12

-    YUV422TORGB(v22, v21, v20)

+    YUVTORGB(v22, v21, v20)

     "subs       %w3, %w3, #8                   \n"

     ARGBTORGB565

     MEMACCESS(2)

@@ -636,95 +550,68 @@

       "+r"(src_uv),    // %1

       "+r"(dst_rgb565),  // %2

       "+r"(width)      // %3

-    : [kUVBiasBGR]"r"(&kUVBiasBGR),

-      [kYToRgb]"r"(&kYToRgb)

+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),

+      [kUVToG]"r"(&yuvconstants->kUVToG),

+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),

+      [kYToRgb]"r"(&yuvconstants->kYToRgb)

     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",

       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"

);

-#endif  // HAS_NV12TORGB565ROW_NEON

-#ifdef HAS_NV21TORGB565ROW_NEON

-void NV21ToRGB565Row_NEON(const uint8* src_y,

-                          const uint8* src_uv,

-                          uint8* dst_rgb565,

-                          int width) {

-  asm volatile (

-    YUV422TORGB_SETUP_REG

-  "1:                                          \n"

-    READNV21

-    YUV422TORGB(v22, v21, v20)

-    "subs       %w3, %w3, #8                   \n"

-    ARGBTORGB565

-    MEMACCESS(2)

-    "st1        {v0.8h}, [%2], 16              \n"  // store 8 pixels RGB565.

-    "b.gt       1b                             \n"

-    : "+r"(src_y),     // %0

-      "+r"(src_uv),    // %1

-      "+r"(dst_rgb565),  // %2

-      "+r"(width)      // %3

-    : [kUVBiasBGR]"r"(&kUVBiasBGR),

-      [kYToRgb]"r"(&kYToRgb)

-    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",

-      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"

-  );

-}

-#endif  // HAS_NV21TORGB565ROW_NEON

-#ifdef HAS_YUY2TOARGBROW_NEON

 void YUY2ToARGBRow_NEON(const uint8* src_yuy2,

                         uint8* dst_argb,

+                        const struct YuvConstants* yuvconstants,

                         int width) {

-  int64 width64 = (int64)(width);

   asm volatile (

-    YUV422TORGB_SETUP_REG

+    YUVTORGB_SETUP

+    "movi       v23.8b, #255                   \n"

   "1:                                          \n"

     READYUY2

-    YUV422TORGB(v22, v21, v20)

+    YUVTORGB(v22, v21, v20)

     "subs       %w2, %w2, #8                   \n"

-    "movi       v23.8b, #255                   \n"

     MEMACCESS(1)

     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32      \n"

     "b.gt       1b                             \n"

     : "+r"(src_yuy2),  // %0

       "+r"(dst_argb),  // %1

-      "+r"(width64)    // %2

-    : [kUVBiasBGR]"r"(&kUVBiasBGR),

-      [kYToRgb]"r"(&kYToRgb)

+      "+r"(width)      // %2

+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),

+      [kUVToG]"r"(&yuvconstants->kUVToG),

+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),

+      [kYToRgb]"r"(&yuvconstants->kYToRgb)

     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",

       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"

);

-#endif  // HAS_YUY2TOARGBROW_NEON

-#ifdef HAS_UYVYTOARGBROW_NEON

 void UYVYToARGBRow_NEON(const uint8* src_uyvy,

                         uint8* dst_argb,

+                        const struct YuvConstants* yuvconstants,

                         int width) {

-  int64 width64 = (int64)(width);

   asm volatile (

-    YUV422TORGB_SETUP_REG

+    YUVTORGB_SETUP

+    "movi       v23.8b, #255                   \n"

   "1:                                          \n"

     READUYVY

-    YUV422TORGB(v22, v21, v20)

+    YUVTORGB(v22, v21, v20)

     "subs       %w2, %w2, #8                   \n"

-    "movi       v23.8b, #255                   \n"

     MEMACCESS(1)

     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32      \n"

     "b.gt       1b                             \n"

     : "+r"(src_uyvy),  // %0

       "+r"(dst_argb),  // %1

-      "+r"(width64)    // %2

-    : [kUVBiasBGR]"r"(&kUVBiasBGR),

-      [kYToRgb]"r"(&kYToRgb)

+      "+r"(width)      // %2

+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),

+      [kUVToG]"r"(&yuvconstants->kUVToG),

+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),

+      [kYToRgb]"r"(&yuvconstants->kYToRgb)

     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",

       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"

);

-#endif  // HAS_UYVYTOARGBROW_NEON

 // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.

-#ifdef HAS_SPLITUVROW_NEON

 void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,

                      int width) {

   asm volatile (

@@ -745,10 +632,8 @@

     : "cc", "memory", "v0", "v1"  // Clobber List

);

-#endif  // HAS_SPLITUVROW_NEON

 // Reads 16 U's and V's and writes out 16 pairs of UV.

-#ifdef HAS_MERGEUVROW_NEON

 void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,

                      int width) {

   asm volatile (

@@ -770,10 +655,8 @@

     : "cc", "memory", "v0", "v1"  // Clobber List

);

-#endif  // HAS_MERGEUVROW_NEON

 // Copy multiple of 32.  vld4.8  allow unaligned and is fastest on a15.

-#ifdef HAS_COPYROW_NEON

 void CopyRow_NEON(const uint8* src, uint8* dst, int count) {

   asm volatile (

   "1:                                          \n"

@@ -790,7 +673,6 @@

   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List

);

-#endif  // HAS_COPYROW_NEON

 // SetRow writes 'count' bytes using an 8 bit value repeated.

 void SetRow_NEON(uint8* dst, uint8 v8, int count) {

@@ -797,10 +679,10 @@

   asm volatile (

     "dup        v0.16b, %w2                    \n"  // duplicate 16 bytes

   "1:                                          \n"

-    "subs      %w1, %w1, #16                   \n"  // 16 bytes per loop

+    "subs       %w1, %w1, #16                  \n"  // 16 bytes per loop

     MEMACCESS(0)

     "st1        {v0.16b}, [%0], #16            \n"  // store

-    "b.gt      1b                              \n"

+    "b.gt       1b                             \n"

   : "+r"(dst),   // %0

     "+r"(count)  // %1

   : "r"(v8)      // %2

@@ -812,10 +694,10 @@

   asm volatile (

     "dup        v0.4s, %w2                     \n"  // duplicate 4 ints

   "1:                                          \n"

-    "subs      %w1, %w1, #4                    \n"  // 4 ints per loop

+    "subs       %w1, %w1, #4                   \n"  // 4 ints per loop

     MEMACCESS(0)

     "st1        {v0.16b}, [%0], #16            \n"  // store

-    "b.gt      1b                              \n"

+    "b.gt       1b                             \n"

   : "+r"(dst),   // %0

     "+r"(count)  // %1

   : "r"(v32)     // %2

@@ -823,18 +705,15 @@

);

-#ifdef HAS_MIRRORROW_NEON

 void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {

-  int64 width64 = (int64) width;

   asm volatile (

     // Start at end of source row.

-    "add        %0, %0, %2                     \n"

+    "add        %0, %0, %w2, sxtw              \n"

     "sub        %0, %0, #16                    \n"

   "1:                                          \n"

     MEMACCESS(0)

     "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16

-    "subs       %2, %2, #16                   \n"  // 16 pixels per loop.

+    "subs       %w2, %w2, #16                  \n"  // 16 pixels per loop.

     "rev64      v0.16b, v0.16b                 \n"

     MEMACCESS(1)

     "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16

@@ -843,26 +722,22 @@

     "b.gt       1b                             \n"

   : "+r"(src),   // %0

     "+r"(dst),   // %1

-    "+r"(width64)  // %2

+    "+r"(width)  // %2

   : "r"((ptrdiff_t)-16)    // %3

   : "cc", "memory", "v0"

);

-#endif  // HAS_MIRRORROW_NEON

-#ifdef HAS_MIRRORUVROW_NEON

 void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,

                       int width) {

-  int64 width64 = (int64) width;

   asm volatile (

     // Start at end of source row.

-    "add        %0, %0, %3, lsl #1             \n"

+    "add        %0, %0, %w3, sxtw #1           \n"

     "sub        %0, %0, #16                    \n"

   "1:                                          \n"

     MEMACCESS(0)

     "ld2        {v0.8b, v1.8b}, [%0], %4       \n"  // src -= 16

-    "subs       %3, %3, #8                     \n"  // 8 pixels per loop.

+    "subs       %w3, %w3, #8                   \n"  // 8 pixels per loop.

     "rev64      v0.8b, v0.8b                   \n"

     "rev64      v1.8b, v1.8b                   \n"

     MEMACCESS(1)

@@ -873,25 +748,21 @@

   : "+r"(src_uv),  // %0

     "+r"(dst_u),   // %1

     "+r"(dst_v),   // %2

-    "+r"(width64)    // %3

+    "+r"(width)    // %3

   : "r"((ptrdiff_t)-16)      // %4

   : "cc", "memory", "v0", "v1"

);

-#endif  // HAS_MIRRORUVROW_NEON

-#ifdef HAS_ARGBMIRRORROW_NEON

 void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {

-  int64 width64 = (int64) width;

   asm volatile (

-    // Start at end of source row.

-    "add        %0, %0, %2, lsl #2             \n"

+  // Start at end of source row.

+    "add        %0, %0, %w2, sxtw #2           \n"

     "sub        %0, %0, #16                    \n"

   "1:                                          \n"

     MEMACCESS(0)

     "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16

-    "subs       %2, %2, #4                     \n"  // 4 pixels per loop.

+    "subs       %w2, %w2, #4                   \n"  // 4 pixels per loop.

     "rev64      v0.4s, v0.4s                   \n"

     MEMACCESS(1)

     "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16

@@ -900,15 +771,13 @@

     "b.gt       1b                             \n"

   : "+r"(src),   // %0

     "+r"(dst),   // %1

-    "+r"(width64)  // %2

+    "+r"(width)  // %2

   : "r"((ptrdiff_t)-16)    // %3

   : "cc", "memory", "v0"

);

-#endif  // HAS_ARGBMIRRORROW_NEON

-#ifdef HAS_RGB24TOARGBROW_NEON

-void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {

+void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {

   asm volatile (

     "movi       v4.8b, #255                    \n"  // Alpha

   "1:                                          \n"

@@ -920,15 +789,13 @@

     "b.gt       1b                             \n"

   : "+r"(src_rgb24),  // %0

     "+r"(dst_argb),   // %1

-    "+r"(pix)         // %2

+    "+r"(width)       // %2

   : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List

);

-#endif  // HAS_RGB24TOARGBROW_NEON

-#ifdef HAS_RAWTOARGBROW_NEON

-void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {

+void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {

   asm volatile (

     "movi       v5.8b, #255                    \n"  // Alpha

   "1:                                          \n"

@@ -942,13 +809,31 @@

     "b.gt       1b                             \n"

   : "+r"(src_raw),   // %0

     "+r"(dst_argb),  // %1

-    "+r"(pix)        // %2

+    "+r"(width)      // %2

   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"  // Clobber List

);

-#endif  // HAS_RAWTOARGBROW_NEON

+void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {

+  asm volatile (  // Copies 3-byte pixels, swapping the 1st and 3rd byte of each.

+  "1:                                          \n"  // 8 pixels (24 bytes) per pass.

+    MEMACCESS(0)

+    "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b

+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.

+    "orr        v3.8b, v1.8b, v1.8b            \n"  // move g

+    "orr        v4.8b, v0.8b, v0.8b            \n"  // move r

+    MEMACCESS(1)

+    "st3        {v2.8b,v3.8b,v4.8b}, [%1], #24 \n"  // store b g r

+    "b.gt       1b                             \n"  // repeat while width > 0 (tested after each pass).

+  : "+r"(src_raw),    // %0

+    "+r"(dst_rgb24),  // %1

+    "+r"(width)       // %2

+  :  // no input-only operands

+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List

+  );

+}

 #define RGB565TOARGB                                                           \

     "shrn       v6.8b, v0.8h, #5               \n"  /* G xxGGGGGG           */ \

     "shl        v6.8b, v6.8b, #2               \n"  /* G GGGGGG00 upper 6   */ \

@@ -962,8 +847,7 @@

     "orr        v0.16b, v0.16b, v2.16b         \n"  /* R,B                  */ \

     "dup        v2.2D, v0.D[1]                 \n"  /* R                    */

-#ifdef HAS_RGB565TOARGBROW_NEON

-void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {

+void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {

   asm volatile (

     "movi       v3.8b, #255                    \n"  // Alpha

   "1:                                          \n"

@@ -976,12 +860,11 @@

     "b.gt       1b                             \n"

   : "+r"(src_rgb565),  // %0

     "+r"(dst_argb),    // %1

-    "+r"(pix)          // %2

+    "+r"(width)          // %2

   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6"  // Clobber List

);

-#endif  // HAS_RGB565TOARGBROW_NEON

 #define ARGB1555TOARGB                                                         \

     "ushr       v2.8h, v0.8h, #10              \n"  /* R xxxRRRRR           */ \

@@ -1020,9 +903,8 @@

     "orr        v2.16b, v1.16b, v3.16b         \n"  /* R                    */ \

     "dup        v1.2D, v0.D[1]                 \n"  /* G */                    \

-#ifdef HAS_ARGB1555TOARGBROW_NEON

 void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,

-                            int pix) {

+                            int width) {

   asm volatile (

     "movi       v3.8b, #255                    \n"  // Alpha

   "1:                                          \n"

@@ -1035,12 +917,11 @@

     "b.gt       1b                             \n"

   : "+r"(src_argb1555),  // %0

     "+r"(dst_argb),    // %1

-    "+r"(pix)          // %2

+    "+r"(width)          // %2

   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List

);

-#endif  // HAS_ARGB1555TOARGBROW_NEON

 #define ARGB4444TOARGB                                                         \

     "shrn       v1.8b,  v0.8h, #8              \n"  /* v1(l) AR             */ \

@@ -1054,9 +935,8 @@

     "dup        v0.2D, v2.D[1]                 \n"                             \

     "dup        v1.2D, v3.D[1]                 \n"

-#ifdef HAS_ARGB4444TOARGBROW_NEON

 void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,

-                            int pix) {

+                            int width) {

   asm volatile (

   "1:                                          \n"

     MEMACCESS(0)

@@ -1068,15 +948,13 @@

     "b.gt       1b                             \n"

   : "+r"(src_argb4444),  // %0

     "+r"(dst_argb),    // %1

-    "+r"(pix)          // %2

+    "+r"(width)          // %2

   : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List

);

-#endif  // HAS_ARGB4444TOARGBROW_NEON

-#ifdef HAS_ARGBTORGB24ROW_NEON

-void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {

+void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {

   asm volatile (

   "1:                                          \n"

     MEMACCESS(0)

@@ -1087,15 +965,13 @@

     "b.gt       1b                             \n"

   : "+r"(src_argb),   // %0

     "+r"(dst_rgb24),  // %1

-    "+r"(pix)         // %2

+    "+r"(width)         // %2

   : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List

);

-#endif  // HAS_ARGBTORGB24ROW_NEON

-#ifdef HAS_ARGBTORAWROW_NEON

-void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {

+void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {

   asm volatile (

   "1:                                          \n"

     MEMACCESS(0)

@@ -1108,15 +984,13 @@

     "b.gt       1b                             \n"

   : "+r"(src_argb),  // %0

     "+r"(dst_raw),   // %1

-    "+r"(pix)        // %2

+    "+r"(width)        // %2

   : "cc", "memory", "v1", "v2", "v3", "v4", "v5"  // Clobber List

);

-#endif  // HAS_ARGBTORAWROW_NEON

-#ifdef HAS_YUY2TOYROW_NEON

-void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {

+void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {

   asm volatile (

   "1:                                          \n"

     MEMACCESS(0)

@@ -1127,15 +1001,13 @@

     "b.gt       1b                             \n"

   : "+r"(src_yuy2),  // %0

     "+r"(dst_y),     // %1

-    "+r"(pix)        // %2

+    "+r"(width)        // %2

   : "cc", "memory", "v0", "v1"  // Clobber List

);

-#endif  // HAS_YUY2TOYROW_NEON

-#ifdef HAS_UYVYTOYROW_NEON

-void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {

+void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {

   asm volatile (

   "1:                                          \n"

     MEMACCESS(0)

@@ -1146,16 +1018,14 @@

     "b.gt       1b                             \n"

   : "+r"(src_uyvy),  // %0

     "+r"(dst_y),     // %1

-    "+r"(pix)        // %2

+    "+r"(width)        // %2

   : "cc", "memory", "v0", "v1"  // Clobber List

);

-#endif  // HAS_UYVYTOYROW_NEON

-#ifdef HAS_YUY2TOUV422ROW_NEON

 void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,

-                         int pix) {

+                         int width) {

   asm volatile (

   "1:                                          \n"

     MEMACCESS(0)

@@ -1169,16 +1039,14 @@

   : "+r"(src_yuy2),  // %0

     "+r"(dst_u),     // %1

     "+r"(dst_v),     // %2

-    "+r"(pix)        // %3

+    "+r"(width)        // %3

   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List

);

-#endif  // HAS_YUY2TOUV422ROW_NEON

-#ifdef HAS_UYVYTOUV422ROW_NEON

 void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,

-                         int pix) {

+                         int width) {

   asm volatile (

   "1:                                          \n"

     MEMACCESS(0)

@@ -1192,16 +1060,14 @@

   : "+r"(src_uyvy),  // %0

     "+r"(dst_u),     // %1

     "+r"(dst_v),     // %2

-    "+r"(pix)        // %3

+    "+r"(width)        // %3

   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List

);

-#endif  // HAS_UYVYTOUV422ROW_NEON

-#ifdef HAS_YUY2TOUVROW_NEON

 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,

-                      uint8* dst_u, uint8* dst_v, int pix) {

+                      uint8* dst_u, uint8* dst_v, int width) {

   const uint8* src_yuy2b = src_yuy2 + stride_yuy2;

   asm volatile (

   "1:                                          \n"

@@ -1221,17 +1087,15 @@

     "+r"(src_yuy2b),    // %1

     "+r"(dst_u),        // %2

     "+r"(dst_v),        // %3

-    "+r"(pix)           // %4

+    "+r"(width)           // %4

   : "cc", "memory", "v0", "v1", "v2", "v3", "v4",

     "v5", "v6", "v7"  // Clobber List

);

-#endif  // HAS_YUY2TOUVROW_NEON

-#ifdef HAS_UYVYTOUVROW_NEON

 void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,

-                      uint8* dst_u, uint8* dst_v, int pix) {

+                      uint8* dst_u, uint8* dst_v, int width) {

   const uint8* src_uyvyb = src_uyvy + stride_uyvy;

   asm volatile (

   "1:                                          \n"

@@ -1251,18 +1115,16 @@

     "+r"(src_uyvyb),    // %1

     "+r"(dst_u),        // %2

     "+r"(dst_v),        // %3

-    "+r"(pix)           // %4

+    "+r"(width)           // %4

   : "cc", "memory", "v0", "v1", "v2", "v3", "v4",

     "v5", "v6", "v7"  // Clobber List

);

-#endif  // HAS_UYVYTOUVROW_NEON

 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.

-#ifdef HAS_ARGBSHUFFLEROW_NEON

 void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,

-                         const uint8* shuffler, int pix) {

+                         const uint8* shuffler, int width) {

   asm volatile (

     MEMACCESS(3)

     "ld1        {v2.16b}, [%3]                 \n"  // shuffler

@@ -1276,14 +1138,12 @@

     "b.gt       1b                             \n"

   : "+r"(src_argb),  // %0

     "+r"(dst_argb),  // %1

-    "+r"(pix)        // %2

+    "+r"(width)        // %2

   : "r"(shuffler)    // %3

   : "cc", "memory", "v0", "v1", "v2"  // Clobber List

);

-#endif  // HAS_ARGBSHUFFLEROW_NEON

-#ifdef HAS_I422TOYUY2ROW_NEON

 void I422ToYUY2Row_NEON(const uint8* src_y,

                         const uint8* src_u,

                         const uint8* src_v,

@@ -1310,9 +1170,7 @@

   : "cc", "memory", "v0", "v1", "v2", "v3"

);

-#endif  // HAS_I422TOYUY2ROW_NEON

-#ifdef HAS_I422TOUYVYROW_NEON

 void I422ToUYVYRow_NEON(const uint8* src_y,

                         const uint8* src_u,

                         const uint8* src_v,

@@ -1339,10 +1197,8 @@

   : "cc", "memory", "v0", "v1", "v2", "v3"

);

-#endif  // HAS_I422TOUYVYROW_NEON

-#ifdef HAS_ARGBTORGB565ROW_NEON

-void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {

+void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {

   asm volatile (

   "1:                                          \n"

     MEMACCESS(0)

@@ -1354,14 +1210,12 @@

     "b.gt       1b                             \n"

   : "+r"(src_argb),  // %0

     "+r"(dst_rgb565),  // %1

-    "+r"(pix)        // %2

+    "+r"(width)        // %2

   : "cc", "memory", "v0", "v20", "v21", "v22", "v23"

);

-#endif  // HAS_ARGBTORGB565ROW_NEON

-#ifdef HAS_ARGBTORGB565DITHERROW_NEON

 void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,

                                 const uint32 dither4, int width) {

   asm volatile (

@@ -1384,11 +1238,9 @@

   : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"

);

-#endif  // HAS_ARGBTORGB565ROW_NEON

-#ifdef HAS_ARGBTOARGB1555ROW_NEON

 void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,

-                            int pix) {

+                            int width) {

   asm volatile (

   "1:                                          \n"

     MEMACCESS(0)

@@ -1400,16 +1252,14 @@

     "b.gt       1b                             \n"

   : "+r"(src_argb),  // %0

     "+r"(dst_argb1555),  // %1

-    "+r"(pix)        // %2

+    "+r"(width)        // %2

   : "cc", "memory", "v0", "v20", "v21", "v22", "v23"

);

-#endif  // HAS_ARGBTOARGB1555ROW_NEON

-#ifdef HAS_ARGBTOARGB4444ROW_NEON

 void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,

-                            int pix) {

+                            int width) {

   asm volatile (

     "movi       v4.16b, #0x0f                  \n"  // bits to clear with vbic.

   "1:                                          \n"

@@ -1422,15 +1272,13 @@

     "b.gt       1b                             \n"

   : "+r"(src_argb),      // %0

     "+r"(dst_argb4444),  // %1

-    "+r"(pix)            // %2

+    "+r"(width)            // %2

   : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23"

);

-#endif  // HAS_ARGBTOARGB4444ROW_NEON

-#ifdef HAS_ARGBTOYROW_NEON

-void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {

+void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {

   asm volatile (

     "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient

     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient

@@ -1450,16 +1298,31 @@

     "b.gt       1b                             \n"

   : "+r"(src_argb),  // %0

     "+r"(dst_y),     // %1

-    "+r"(pix)        // %2

+    "+r"(width)        // %2

   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"

);

-#endif  // HAS_ARGBTOYROW_NEON

-#ifdef HAS_ARGBTOYJROW_NEON

-void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {

+void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) {

   asm volatile (

+  "1:                                          \n"  // each pass keeps only the 4th byte of 16 4-byte pixels.

+    MEMACCESS(0)

+    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load row 16 pixels

+    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop

+    MEMACCESS(1)

+    "st1        {v3.16b}, [%1], #16            \n"  // store 16 A's.

+    "b.gt       1b                             \n"  // repeat while width > 0 (tested after each pass).

+  : "+r"(src_argb),   // %0

+    "+r"(dst_a),      // %1

+    "+r"(width)       // %2

+  :  // no input-only operands

+  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List

+  );

+}

+void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {

+  asm volatile (

     "movi       v4.8b, #15                     \n"  // B * 0.11400 coefficient

     "movi       v5.8b, #75                     \n"  // G * 0.58700 coefficient

     "movi       v6.8b, #38                     \n"  // R * 0.29900 coefficient

@@ -1476,17 +1339,15 @@

     "b.gt       1b                             \n"

   : "+r"(src_argb),  // %0

     "+r"(dst_y),     // %1

-    "+r"(pix)        // %2

+    "+r"(width)        // %2

   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"

);

-#endif  // HAS_ARGBTOYJROW_NEON

 // 8x1 pixels.

-#ifdef HAS_ARGBTOUV444ROW_NEON

 void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,

-                         int pix) {

+                         int width) {

   asm volatile (

     "movi       v24.8b, #112                   \n"  // UB / VR 0.875 coefficient

     "movi       v25.8b, #74                    \n"  // UG -0.5781 coefficient

@@ -1519,62 +1380,24 @@

   : "+r"(src_argb),  // %0

     "+r"(dst_u),     // %1

     "+r"(dst_v),     // %2

-    "+r"(pix)        // %3

+    "+r"(width)        // %3

   : "cc", "memory", "v0", "v1", "v2", "v3", "v4",

     "v24", "v25", "v26", "v27", "v28", "v29"

);

-#endif  // HAS_ARGBTOUV444ROW_NEON

-// 16x1 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.

-#ifdef HAS_ARGBTOUV422ROW_NEON

-void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,

-                         int pix) {

-  asm volatile (

-    RGBTOUV_SETUP_REG

-  "1:                                          \n"

-    MEMACCESS(0)

-    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.

+#define RGBTOUV_SETUP_REG                                                      \

+    "movi       v20.8h, #56, lsl #0  \n"  /* UB/VR coefficient (0.875) / 2 */  \

+    "movi       v21.8h, #37, lsl #0  \n"  /* UG coefficient (-0.5781) / 2  */  \

+    "movi       v22.8h, #19, lsl #0  \n"  /* UR coefficient (-0.2969) / 2  */  \

+    "movi       v23.8h, #9,  lsl #0  \n"  /* VB coefficient (-0.1406) / 2  */  \

+    "movi       v24.8h, #47, lsl #0  \n"  /* VG coefficient (-0.7344) / 2  */  \

+    "movi       v25.16b, #0x80       \n"  /* 128.5 (0x8080 in 16-bit)      */

-    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.

-    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.

-    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.

-    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop.

-    "mul        v3.8h, v0.8h, v20.8h           \n"  // B

-    "mls        v3.8h, v1.8h, v21.8h           \n"  // G

-    "mls        v3.8h, v2.8h, v22.8h           \n"  // R

-    "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned

-    "mul        v4.8h, v2.8h, v20.8h           \n"  // R

-    "mls        v4.8h, v1.8h, v24.8h           \n"  // G

-    "mls        v4.8h, v0.8h, v23.8h           \n"  // B

-    "add        v4.8h, v4.8h, v25.8h           \n"  // +128 -> unsigned

-    "uqshrn     v0.8b, v3.8h, #8               \n"  // 16 bit to 8 bit U

-    "uqshrn     v1.8b, v4.8h, #8               \n"  // 16 bit to 8 bit V

-    MEMACCESS(1)

-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels U.

-    MEMACCESS(2)

-    "st1        {v1.8b}, [%2], #8              \n"  // store 8 pixels V.

-    "b.gt       1b                             \n"

-  : "+r"(src_argb),  // %0

-    "+r"(dst_u),     // %1

-    "+r"(dst_v),     // %2

-    "+r"(pix)        // %3

-  :

-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",

-    "v20", "v21", "v22", "v23", "v24", "v25"

-  );

-}

-#endif  // HAS_ARGBTOUV422ROW_NEON

-// 32x1 pixels -> 8x1.  pix is number of argb pixels. e.g. 32.

-#ifdef HAS_ARGBTOUV411ROW_NEON

+// 32x1 pixels -> 8x1.  width is number of argb pixels. e.g. 32.

 void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,

-                         int pix) {

+                         int width) {

   asm volatile (

     RGBTOUV_SETUP_REG

   "1:                                          \n"

@@ -1616,15 +1439,14 @@

   : "+r"(src_argb),  // %0

     "+r"(dst_u),     // %1

     "+r"(dst_v),     // %2

-    "+r"(pix)        // %3

+    "+r"(width)        // %3

   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",

     "v20", "v21", "v22", "v23", "v24", "v25"

);

-#endif  // HAS_ARGBTOUV411ROW_NEON

-// 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.

+// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.

 #define RGBTOUV(QB, QG, QR) \

     "mul        v3.8h, " #QB ",v20.8h          \n"  /* B                    */ \

     "mul        v4.8h, " #QR ",v20.8h          \n"  /* R                    */ \

@@ -1640,9 +1462,8 @@

 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.

 // TODO(fbarchard): consider ptrdiff_t for all strides.

-#ifdef HAS_ARGBTOUVROW_NEON

 void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,

-                      uint8* dst_u, uint8* dst_v, int pix) {

+                      uint8* dst_u, uint8* dst_v, int width) {

   const uint8* src_argb_1 = src_argb + src_stride_argb;

   asm volatile (

     RGBTOUV_SETUP_REG

@@ -1674,18 +1495,16 @@

     "+r"(src_argb_1),  // %1

     "+r"(dst_u),     // %2

     "+r"(dst_v),     // %3

-    "+r"(pix)        // %4

+    "+r"(width)        // %4

   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",

     "v20", "v21", "v22", "v23", "v24", "v25"

);

-#endif  // HAS_ARGBTOUVROW_NEON

 // TODO(fbarchard): Subsample match C code.

-#ifdef HAS_ARGBTOUVJROW_NEON

 void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,

-                       uint8* dst_u, uint8* dst_v, int pix) {

+                       uint8* dst_u, uint8* dst_v, int width) {

   const uint8* src_argb_1 = src_argb + src_stride_argb;

   asm volatile (

     "movi       v20.8h, #63, lsl #0            \n"  // UB/VR coeff (0.500) / 2

@@ -1721,17 +1540,15 @@

     "+r"(src_argb_1),  // %1

     "+r"(dst_u),     // %2

     "+r"(dst_v),     // %3

-    "+r"(pix)        // %4

+    "+r"(width)        // %4

   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",

     "v20", "v21", "v22", "v23", "v24", "v25"

);

-#endif  // HAS_ARGBTOUVJROW_NEON

-#ifdef HAS_BGRATOUVROW_NEON

 void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,

-                      uint8* dst_u, uint8* dst_v, int pix) {

+                      uint8* dst_u, uint8* dst_v, int width) {

   const uint8* src_bgra_1 = src_bgra + src_stride_bgra;

   asm volatile (

     RGBTOUV_SETUP_REG

@@ -1762,17 +1579,15 @@

     "+r"(src_bgra_1),  // %1

     "+r"(dst_u),     // %2

     "+r"(dst_v),     // %3

-    "+r"(pix)        // %4

+    "+r"(width)        // %4

   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",

     "v20", "v21", "v22", "v23", "v24", "v25"

);

-#endif  // HAS_BGRATOUVROW_NEON

-#ifdef HAS_ABGRTOUVROW_NEON

 void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,

-                      uint8* dst_u, uint8* dst_v, int pix) {

+                      uint8* dst_u, uint8* dst_v, int width) {

   const uint8* src_abgr_1 = src_abgr + src_stride_abgr;

   asm volatile (

     RGBTOUV_SETUP_REG

@@ -1803,17 +1618,15 @@

     "+r"(src_abgr_1),  // %1

     "+r"(dst_u),     // %2

     "+r"(dst_v),     // %3

-    "+r"(pix)        // %4

+    "+r"(width)        // %4

   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",

     "v20", "v21", "v22", "v23", "v24", "v25"

);

-#endif  // HAS_ABGRTOUVROW_NEON

-#ifdef HAS_RGBATOUVROW_NEON

 void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,

-                      uint8* dst_u, uint8* dst_v, int pix) {

+                      uint8* dst_u, uint8* dst_v, int width) {

   const uint8* src_rgba_1 = src_rgba + src_stride_rgba;

   asm volatile (

     RGBTOUV_SETUP_REG

@@ -1844,17 +1657,15 @@

     "+r"(src_rgba_1),  // %1

     "+r"(dst_u),     // %2

     "+r"(dst_v),     // %3

-    "+r"(pix)        // %4

+    "+r"(width)        // %4

   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",

     "v20", "v21", "v22", "v23", "v24", "v25"

);

-#endif  // HAS_RGBATOUVROW_NEON

-#ifdef HAS_RGB24TOUVROW_NEON

 void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,

-                       uint8* dst_u, uint8* dst_v, int pix) {

+                       uint8* dst_u, uint8* dst_v, int width) {

   const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24;

   asm volatile (

     RGBTOUV_SETUP_REG

@@ -1885,17 +1696,15 @@

     "+r"(src_rgb24_1),  // %1

     "+r"(dst_u),     // %2

     "+r"(dst_v),     // %3

-    "+r"(pix)        // %4

+    "+r"(width)        // %4

   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",

     "v20", "v21", "v22", "v23", "v24", "v25"

);

-#endif  // HAS_RGB24TOUVROW_NEON

-#ifdef HAS_RAWTOUVROW_NEON

 void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,

-                     uint8* dst_u, uint8* dst_v, int pix) {

+                     uint8* dst_u, uint8* dst_v, int width) {

   const uint8* src_raw_1 = src_raw + src_stride_raw;

   asm volatile (

     RGBTOUV_SETUP_REG

@@ -1926,18 +1735,16 @@

     "+r"(src_raw_1),  // %1

     "+r"(dst_u),     // %2

     "+r"(dst_v),     // %3

-    "+r"(pix)        // %4

+    "+r"(width)        // %4

   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",

     "v20", "v21", "v22", "v23", "v24", "v25"

);

-#endif  // HAS_RAWTOUVROW_NEON

-// 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.

-#ifdef HAS_RGB565TOUVROW_NEON

+// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.

 void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,

-                        uint8* dst_u, uint8* dst_v, int pix) {

+                        uint8* dst_u, uint8* dst_v, int width) {

   const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565;

   asm volatile (

     "movi       v22.8h, #56, lsl #0            \n"  // UB / VR coeff (0.875) / 2

@@ -2001,7 +1808,7 @@

     "+r"(src_rgb565_1),  // %1

     "+r"(dst_u),     // %2

     "+r"(dst_v),     // %3

-    "+r"(pix)        // %4

+    "+r"(width)        // %4

   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",

     "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",

@@ -2008,12 +1815,10 @@

     "v25", "v26", "v27"

);

-#endif  // HAS_RGB565TOUVROW_NEON

-// 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.

-#ifdef HAS_ARGB1555TOUVROW_NEON

+// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.

 void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,

-                        uint8* dst_u, uint8* dst_v, int pix) {

+                        uint8* dst_u, uint8* dst_v, int width) {

   const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;

   asm volatile (

     RGBTOUV_SETUP_REG

@@ -2072,7 +1877,7 @@

     "+r"(src_argb1555_1),  // %1

     "+r"(dst_u),     // %2

     "+r"(dst_v),     // %3

-    "+r"(pix)        // %4

+    "+r"(width)        // %4

   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",

     "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",

@@ -2079,12 +1884,10 @@

     "v26", "v27", "v28"

);

-#endif  // HAS_ARGB1555TOUVROW_NEON

-// 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.

-#ifdef HAS_ARGB4444TOUVROW_NEON

+// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.

 void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,

-                          uint8* dst_u, uint8* dst_v, int pix) {

+                          uint8* dst_u, uint8* dst_v, int width) {

   const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;

   asm volatile (

     RGBTOUV_SETUP_REG

@@ -2143,7 +1946,7 @@

     "+r"(src_argb4444_1),  // %1

     "+r"(dst_u),     // %2

     "+r"(dst_v),     // %3

-    "+r"(pix)        // %4

+    "+r"(width)        // %4

   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",

     "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",

@@ -2151,10 +1954,8 @@

);

-#endif  // HAS_ARGB4444TOUVROW_NEON

-#ifdef HAS_RGB565TOYROW_NEON

-void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {

+void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {

   asm volatile (

     "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient

     "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient

@@ -2175,16 +1976,14 @@

     "b.gt       1b                             \n"

   : "+r"(src_rgb565),  // %0

     "+r"(dst_y),       // %1

-    "+r"(pix)          // %2

+    "+r"(width)          // %2

   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6",

     "v24", "v25", "v26", "v27"

);

-#endif  // HAS_RGB565TOYROW_NEON

-#ifdef HAS_ARGB1555TOYROW_NEON

-void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {

+void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {

   asm volatile (

     "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient

     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient

@@ -2205,15 +2004,13 @@

     "b.gt       1b                             \n"

   : "+r"(src_argb1555),  // %0

     "+r"(dst_y),         // %1

-    "+r"(pix)            // %2

+    "+r"(width)            // %2

   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"

);

-#endif  // HAS_ARGB1555TOYROW_NEON

-#ifdef HAS_ARGB4444TOYROW_NEON

-void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {

+void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {

   asm volatile (

     "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient

     "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient

@@ -2234,15 +2031,13 @@

     "b.gt       1b                             \n"

   : "+r"(src_argb4444),  // %0

     "+r"(dst_y),         // %1

-    "+r"(pix)            // %2

+    "+r"(width)            // %2

   : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"

);

-#endif  // HAS_ARGB4444TOYROW_NEON

-#ifdef HAS_BGRATOYROW_NEON

-void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {

+void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {

   asm volatile (

     "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient

     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient

@@ -2262,15 +2057,13 @@

     "b.gt       1b                             \n"

   : "+r"(src_bgra),  // %0

     "+r"(dst_y),     // %1

-    "+r"(pix)        // %2

+    "+r"(width)        // %2

   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"

);

-#endif  // HAS_BGRATOYROW_NEON

-#ifdef HAS_ABGRTOYROW_NEON

-void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {

+void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {

   asm volatile (

     "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient

     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient

@@ -2290,15 +2083,13 @@

     "b.gt       1b                             \n"

   : "+r"(src_abgr),  // %0

     "+r"(dst_y),     // %1

-    "+r"(pix)        // %2

+    "+r"(width)        // %2

   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"

);

-#endif  // HAS_ABGRTOYROW_NEON

-#ifdef HAS_RGBATOYROW_NEON

-void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {

+void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {

   asm volatile (

     "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient

     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient

@@ -2318,15 +2109,13 @@

     "b.gt       1b                             \n"

   : "+r"(src_rgba),  // %0

     "+r"(dst_y),     // %1

-    "+r"(pix)        // %2

+    "+r"(width)        // %2

   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"

);

-#endif  // HAS_RGBATOYROW_NEON

-#ifdef HAS_RGB24TOYROW_NEON

-void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {

+void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {

   asm volatile (

     "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient

     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient

@@ -2346,15 +2135,13 @@

     "b.gt       1b                             \n"

   : "+r"(src_rgb24),  // %0

     "+r"(dst_y),      // %1

-    "+r"(pix)         // %2

+    "+r"(width)         // %2

   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"

);

-#endif  // HAS_RGB24TOYROW_NEON

-#ifdef HAS_RAWTOYROW_NEON

-void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {

+void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {

   asm volatile (

     "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient

     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient

@@ -2374,15 +2161,13 @@

     "b.gt       1b                             \n"

   : "+r"(src_raw),  // %0

     "+r"(dst_y),    // %1

-    "+r"(pix)       // %2

+    "+r"(width)       // %2

   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"

);

-#endif  // HAS_RAWTOYROW_NEON

 // Bilinear filter 16x2 -> 16x1

-#ifdef HAS_INTERPOLATEROW_NEON

 void InterpolateRow_NEON(uint8* dst_ptr,

                          const uint8* src_ptr, ptrdiff_t src_stride,

                          int dst_width, int source_y_fraction) {

@@ -2392,12 +2177,8 @@

   asm volatile (

     "cmp        %w4, #0                        \n"

     "b.eq       100f                           \n"

-    "cmp        %w4, #64                       \n"

-    "b.eq       75f                            \n"

     "cmp        %w4, #128                      \n"

     "b.eq       50f                            \n"

-    "cmp        %w4, #192                      \n"

-    "b.eq       25f                            \n"

     "dup        v5.16b, %w4                    \n"

     "dup        v4.16b, %w5                    \n"

@@ -2419,20 +2200,6 @@

     "b.gt       1b                             \n"

     "b          99f                            \n"

-    // Blend 25 / 75.

-  "25:                                         \n"

-    MEMACCESS(1)

-    "ld1        {v0.16b}, [%1], #16            \n"

-    MEMACCESS(2)

-    "ld1        {v1.16b}, [%2], #16            \n"

-    "subs       %w3, %w3, #16                  \n"

-    "urhadd     v0.16b, v0.16b, v1.16b         \n"

-    "urhadd     v0.16b, v0.16b, v1.16b         \n"

-    MEMACCESS(0)

-    "st1        {v0.16b}, [%0], #16            \n"

-    "b.gt       25b                            \n"

-    "b          99f                            \n"

     // Blend 50 / 50.

   "50:                                         \n"

     MEMACCESS(1)

@@ -2446,20 +2213,6 @@

     "b.gt       50b                            \n"

     "b          99f                            \n"

-    // Blend 75 / 25.

-  "75:                                         \n"

-    MEMACCESS(1)

-    "ld1        {v1.16b}, [%1], #16            \n"

-    MEMACCESS(2)

-    "ld1        {v0.16b}, [%2], #16            \n"

-    "subs       %w3, %w3, #16                  \n"

-    "urhadd     v0.16b, v0.16b, v1.16b         \n"

-    "urhadd     v0.16b, v0.16b, v1.16b         \n"

-    MEMACCESS(0)

-    "st1        {v0.16b}, [%0], #16            \n"

-    "b.gt       75b                            \n"

-    "b          99f                            \n"

     // Blend 100 / 0 - Copy row unchanged.

   "100:                                        \n"

     MEMACCESS(1)

@@ -2480,10 +2233,8 @@

   : "cc", "memory", "v0", "v1", "v3", "v4", "v5"

);

-#endif  // HAS_INTERPOLATEROW_NEON

 // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr

-#ifdef HAS_ARGBBLENDROW_NEON

 void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,

                        uint8* dst_argb, int width) {

   asm volatile (

@@ -2552,10 +2303,8 @@

     "v16", "v17", "v18"

);

-#endif  // HAS_ARGBBLENDROW_NEON

 // Attenuate 8 pixels at a time.

-#ifdef HAS_ARGBATTENUATEROW_NEON

 void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {

   asm volatile (

     // Attenuate 8 pixels.

@@ -2579,11 +2328,9 @@

   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"

);

-#endif  // HAS_ARGBATTENUATEROW_NEON

 // Quantize 8 ARGB pixels (32 bytes).

 // dst = (dst * scale >> 16) * interval_size + interval_offset;

-#ifdef HAS_ARGBQUANTIZEROW_NEON

 void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,

                           int interval_offset, int width) {

   asm volatile (

@@ -2623,12 +2370,10 @@

   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"

);

-#endif  // HAS_ARGBQUANTIZEROW_NEON

 // Shade 8 pixels at a time by specified value.

 // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8.

 // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.

-#ifdef HAS_ARGBSHADEROW_NEON

 void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,

                        uint32 value) {

   asm volatile (

@@ -2663,12 +2408,10 @@

   : "cc", "memory", "v0", "v4", "v5", "v6", "v7"

);

-#endif  // HAS_ARGBSHADEROW_NEON

 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels

 // Similar to ARGBToYJ but stores ARGB.

 // C code is (15 * b + 75 * g + 38 * r + 64) >> 7;

-#ifdef HAS_ARGBGRAYROW_NEON

 void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {

   asm volatile (

     "movi       v24.8b, #15                    \n"  // B * 0.11400 coefficient

@@ -2694,7 +2437,6 @@

   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"

);

-#endif  // HAS_ARGBGRAYROW_NEON

 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.

 //    b = (r * 35 + g * 68 + b * 17) >> 7

@@ -2701,7 +2443,6 @@

 //    g = (r * 45 + g * 88 + b * 22) >> 7

 //    r = (r * 50 + g * 98 + b * 24) >> 7

-#ifdef HAS_ARGBSEPIAROW_NEON

 void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {

   asm volatile (

     "movi       v20.8b, #17                    \n"  // BB coefficient

@@ -2739,12 +2480,10 @@

     "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"

);

-#endif  // HAS_ARGBSEPIAROW_NEON

 // Tranform 8 ARGB pixels (32 bytes) with color matrix.

 // TODO(fbarchard): Was same as Sepia except matrix is provided.  This function

 // needs to saturate.  Consider doing a non-saturating version.

-#ifdef HAS_ARGBCOLORMATRIXROW_NEON

 void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,

                              const int8* matrix_argb, int width) {

   asm volatile (

@@ -2804,11 +2543,9 @@

     "v18", "v19", "v22", "v23", "v24", "v25"

);

-#endif  // HAS_ARGBCOLORMATRIXROW_NEON

 // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.

 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.

-#ifdef HAS_ARGBMULTIPLYROW_NEON

 void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,

                           uint8* dst_argb, int width) {

   asm volatile (

@@ -2839,10 +2576,8 @@

   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"

);

-#endif  // HAS_ARGBMULTIPLYROW_NEON

 // Add 2 rows of ARGB pixels together, 8 pixels at a time.

-#ifdef HAS_ARGBADDROW_NEON

 void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,

                      uint8* dst_argb, int width) {

   asm volatile (

@@ -2869,10 +2604,8 @@

   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"

);

-#endif  // HAS_ARGBADDROW_NEON

 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.

-#ifdef HAS_ARGBSUBTRACTROW_NEON

 void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,

                           uint8* dst_argb, int width) {

   asm volatile (

@@ -2899,7 +2632,6 @@

   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"

);

-#endif  // HAS_ARGBSUBTRACTROW_NEON

 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.

 // A = 255

@@ -2906,7 +2638,6 @@

 // R = Sobel

 // G = Sobel

 // B = Sobel

-#ifdef HAS_SOBELROW_NEON

 void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,

                      uint8* dst_argb, int width) {

   asm volatile (

@@ -2932,10 +2663,8 @@

   : "cc", "memory", "v0", "v1", "v2", "v3"

);

-#endif  // HAS_SOBELROW_NEON

 // Adds Sobel X and Sobel Y and stores Sobel into plane.

-#ifdef HAS_SOBELTOPLANEROW_NEON

 void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,

                           uint8* dst_y, int width) {

   asm volatile (

@@ -2958,7 +2687,6 @@

   : "cc", "memory", "v0", "v1"

);

-#endif  // HAS_SOBELTOPLANEROW_NEON

 // Mixes Sobel X, Sobel Y and Sobel into ARGB.

 // A = 255

@@ -2965,7 +2693,6 @@

 // R = Sobel X

 // G = Sobel

 // B = Sobel Y

-#ifdef HAS_SOBELXYROW_NEON

 void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,

                      uint8* dst_argb, int width) {

   asm volatile (

@@ -2989,13 +2716,11 @@

   : "cc", "memory", "v0", "v1", "v2", "v3"

);

-#endif  // HAS_SOBELXYROW_NEON

 // SobelX as a matrix is

 // -1  0  1

 // -2  0  2

 // -1  0  1

-#ifdef HAS_SOBELXROW_NEON

 void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,

                     const uint8* src_y2, uint8* dst_sobelx, int width) {

   asm volatile (

@@ -3034,13 +2759,11 @@

   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List

);

-#endif  // HAS_SOBELXROW_NEON

 // SobelY as a matrix is

 // -1 -2 -1

 //  0  0  0

 //  1  2  1

-#ifdef HAS_SOBELYROW_NEON

 void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,

                     uint8* dst_sobely, int width) {

   asm volatile (

@@ -3078,7 +2801,6 @@

   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List

);

-#endif  // HAS_SOBELYROW_NEON

 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)

 #ifdef __cplusplus

--- a/third_party/libyuv/source/row_win.cc

+++ b/third_party/libyuv/source/row_win.cc

@@ -21,183 +21,108 @@

 extern "C" {

 #endif

-// This module is for Visual C.

-#if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) && \

-    defined(_MSC_VER) && !defined(__clang__)

+// This module is for Visual C 32/64 bit and clangcl 32 bit

+#if !defined(LIBYUV_DISABLE_X86) && \

+    (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__)))

-struct YuvConstants {

-  lvec8 kUVToB;     // 0

-  lvec8 kUVToG;     // 32

-  lvec8 kUVToR;     // 64

-  lvec16 kUVBiasB;  // 96

-  lvec16 kUVBiasG;  // 128

-  lvec16 kUVBiasR;  // 160

-  lvec16 kYToRgb;   // 192

-};

+// 64 bit

+#if defined(_M_X64)

-// BT.601 YUV to RGB reference

-//  R = (Y - 16) * 1.164              - V * -1.596

-//  G = (Y - 16) * 1.164 - U *  0.391 - V *  0.813

-//  B = (Y - 16) * 1.164 - U * -2.018

+// Read 4 UV from 422, upsample to 8 UV.

+#define READYUV422                                                             \

+    xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);                                 \

+    xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));                      \

+    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                                      \

+    xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);                                     \

+    u_buf += 4;                                                                \

+    xmm4 = _mm_loadl_epi64((__m128i*)y_buf);                                   \

+    xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                                      \

+    y_buf += 8;

-// Y contribution to R,G,B.  Scale and bias.

-// TODO(fbarchard): Consider moving constants into a common header.

-#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */

-#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */

+// Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.

+#define READYUVA422                                                            \

+    xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);                                 \

+    xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));                      \

+    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                                      \

+    xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);                                     \

+    u_buf += 4;                                                                \

+    xmm4 = _mm_loadl_epi64((__m128i*)y_buf);                                   \

+    xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                                      \

+    y_buf += 8;                                                                \

+    xmm5 = _mm_loadl_epi64((__m128i*)a_buf);                                   \

+    a_buf += 8;

-// U and V contributions to R,G,B.

-#define UB -128 /* max(-128, round(-2.018 * 64)) */

-#define UG 25 /* round(0.391 * 64) */

-#define VG 52 /* round(0.813 * 64) */

-#define VR -102 /* round(-1.596 * 64) */

+// Convert 8 pixels: 8 UV and 8 Y.

+#define YUVTORGB(yuvconstants)                                                 \

+    xmm1 = _mm_loadu_si128(&xmm0);                                             \

+    xmm2 = _mm_loadu_si128(&xmm0);                                             \

+    xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB);           \

+    xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG);           \

+    xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR);           \

+    xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0);             \

+    xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1);             \

+    xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2);             \

+    xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb);            \

+    xmm0 = _mm_adds_epi16(xmm0, xmm4);                                         \

+    xmm1 = _mm_adds_epi16(xmm1, xmm4);                                         \

+    xmm2 = _mm_adds_epi16(xmm2, xmm4);                                         \

+    xmm0 = _mm_srai_epi16(xmm0, 6);                                            \

+    xmm1 = _mm_srai_epi16(xmm1, 6);                                            \

+    xmm2 = _mm_srai_epi16(xmm2, 6);                                            \

+    xmm0 = _mm_packus_epi16(xmm0, xmm0);                                       \

+    xmm1 = _mm_packus_epi16(xmm1, xmm1);                                       \

+    xmm2 = _mm_packus_epi16(xmm2, xmm2);

-// Bias values to subtract 16 from Y and 128 from U and V.

-#define BB (UB * 128            + YGB)

-#define BG (UG * 128 + VG * 128 + YGB)

-#define BR            (VR * 128 + YGB)

+// Store 8 ARGB values.

+#define STOREARGB                                                              \

+    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                                      \

+    xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);                                      \

+    xmm1 = _mm_loadu_si128(&xmm0);                                             \

+    xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);                                     \

+    xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);                                     \

+    _mm_storeu_si128((__m128i *)dst_argb, xmm0);                               \

+    _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1);                        \

+    dst_argb += 32;

-// BT601 constants for YUV to RGB.

-static YuvConstants SIMD_ALIGNED(kYuvConstants) = {

-  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,

-    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },

-  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,

-    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },

-  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,

-    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },

-  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },

-  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },

-  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },

-  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }

-};

-// BT601 constants for NV21 where chroma plane is VU instead of UV.

-static YuvConstants SIMD_ALIGNED(kYvuConstants) = {

-  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,

-    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },

-  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,

-    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },

-  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,

-    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },

-  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },

-  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },

-  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },

-  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }

-};

-#undef YG

-#undef YGB

-#undef UB

-#undef UG

-#undef VG

-#undef VR

-#undef BB

-#undef BG

-#undef BR

-// JPEG YUV to RGB reference

-// *  R = Y                - V * -1.40200

-// *  G = Y - U *  0.34414 - V *  0.71414

-// *  B = Y - U * -1.77200

-// Y contribution to R,G,B.  Scale and bias.

-// TODO(fbarchard): Consider moving constants into a common header.

-#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */

-#define YGBJ 32  /* 64 / 2 */

-// U and V contributions to R,G,B.

-#define UBJ -113 /* round(-1.77200 * 64) */

-#define UGJ 22 /* round(0.34414 * 64) */

-#define VGJ 46 /* round(0.71414  * 64) */

-#define VRJ -90 /* round(-1.40200 * 64) */

-// Bias values to subtract 16 from Y and 128 from U and V.

-#define BBJ (UBJ * 128             + YGBJ)

-#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)

-#define BRJ             (VRJ * 128 + YGBJ)

-// JPEG constants for YUV to RGB.

-static YuvConstants SIMD_ALIGNED(kYuvJConstants) = {

-  { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0,

-    UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 },

-  { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,

-    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,

-    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,

-    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ },

-  { 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ,

-    0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ },

-  { BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ,

-    BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ },

-  { BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ,

-    BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ },

-  { BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ,

-    BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ },

-  { YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ,

-    YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ }

-};

-#undef YGJ

-#undef YGBJ

-#undef UBJ

-#undef UGJ

-#undef VGJ

-#undef VRJ

-#undef BBJ

-#undef BGJ

-#undef BRJ

-// 64 bit

-#if defined(_M_X64)

 #if defined(HAS_I422TOARGBROW_SSSE3)

 void I422ToARGBRow_SSSE3(const uint8* y_buf,

                          const uint8* u_buf,

                          const uint8* v_buf,

                          uint8* dst_argb,

+                         const struct YuvConstants* yuvconstants,

                          int width) {

-  __m128i xmm0, xmm1, xmm2, xmm3;

+  __m128i xmm0, xmm1, xmm2, xmm4;

   const __m128i xmm5 = _mm_set1_epi8(-1);

   const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;

   while (width > 0) {

-    xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);

-    xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));

-    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);

-    xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);

-    xmm1 = _mm_loadu_si128(&xmm0);

-    xmm2 = _mm_loadu_si128(&xmm0);

-    xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kYuvConstants.kUVToB);

-    xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kYuvConstants.kUVToG);

-    xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kYuvConstants.kUVToR);

-    xmm0 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasB, xmm0);

-    xmm1 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasG, xmm1);

-    xmm2 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasR, xmm2);

-    xmm3 = _mm_loadl_epi64((__m128i*)y_buf);

-    xmm3 = _mm_unpacklo_epi8(xmm3, xmm3);

-    xmm3 = _mm_mulhi_epu16(xmm3, *(__m128i*)kYuvConstants.kYToRgb);

-    xmm0 = _mm_adds_epi16(xmm0, xmm3);

-    xmm1 = _mm_adds_epi16(xmm1, xmm3);

-    xmm2 = _mm_adds_epi16(xmm2, xmm3);

-    xmm0 = _mm_srai_epi16(xmm0, 6);

-    xmm1 = _mm_srai_epi16(xmm1, 6);

-    xmm2 = _mm_srai_epi16(xmm2, 6);

-    xmm0 = _mm_packus_epi16(xmm0, xmm0);

-    xmm1 = _mm_packus_epi16(xmm1, xmm1);

-    xmm2 = _mm_packus_epi16(xmm2, xmm2);

-    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);

-    xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);

-    xmm1 = _mm_loadu_si128(&xmm0);

-    xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);

-    xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);

+    READYUV422

+    YUVTORGB(yuvconstants)

+    STOREARGB

+    width -= 8;

+  }

+}

+#endif

-    _mm_storeu_si128((__m128i *)dst_argb, xmm0);

-    _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1);

-    y_buf += 8;

-    u_buf += 4;

-    dst_argb += 32;

+#if defined(HAS_I422ALPHATOARGBROW_SSSE3)

+void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,

+                              const uint8* u_buf,

+                              const uint8* v_buf,

+                              const uint8* a_buf,

+                              uint8* dst_argb,

+                              const struct YuvConstants* yuvconstants,

+                              int width) {

+  __m128i xmm0, xmm1, xmm2, xmm4, xmm5;

+  const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;

+  while (width > 0) {

+    READYUVA422

+    YUVTORGB(yuvconstants)

+    STOREARGB

     width -= 8;

 #endif

 // 32 bit

 #else  // defined(_M_X64)

 #ifdef HAS_ARGBTOYROW_SSSE3

@@ -301,6 +226,24 @@

   2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u

};

+// Shuffle table for converting RAW to RGB24.  First 8.

+static const uvec8 kShuffleMaskRAWToRGB24_0 = {

+  2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,

+  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u

+};

+// Shuffle table for converting RAW to RGB24.  Middle 8.

+static const uvec8 kShuffleMaskRAWToRGB24_1 = {

+  2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,

+  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u

+};

+// Shuffle table for converting RAW to RGB24.  Last 8.

+static const uvec8 kShuffleMaskRAWToRGB24_2 = {

+  8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,

+  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u

+};

 // Shuffle table for converting ARGB to RGB24.

 static const uvec8 kShuffleMaskARGBToRGB24 = {

   0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u

@@ -316,18 +259,43 @@

   0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u

};

-// Shuffle table for converting ARGB to RAW.

-static const uvec8 kShuffleMaskARGBToRAW_0 = {

-  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u

+// YUY2 shuf 16 Y to 32 Y.  Duplicates every even byte of each 16 byte lane.

+static const lvec8 kShuffleYUY2Y = {

+  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,

+  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14

};

+// YUY2 shuf 8 UV to 16 UV.  Repeats each pair of odd bytes (1,3 then 5,7 ...) per lane.

+static const lvec8 kShuffleYUY2UV = {

+  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15,

+  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15

+};

+// UYVY shuf 16 Y to 32 Y.  Duplicates every odd byte of each 16 byte lane.

+static const lvec8 kShuffleUYVYY = {

+  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15,

+  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15

+};

+// UYVY shuf 8 UV to 16 UV.  Repeats each pair of even bytes (0,2 then 4,6 ...) per lane.

+static const lvec8 kShuffleUYVYUV = {

+  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,

+  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14

+};

+// NV21 shuf 8 VU to 16 UV.  Swaps each byte pair and repeats it; uses bytes 0..7 of each lane.

+static const lvec8 kShuffleNV21 = {

+  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,

+  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,

+};

 // Duplicates gray value 3 times and fills in alpha opaque.

 __declspec(naked)

-void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {

+void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {

   __asm {

     mov        eax, [esp + 4]        // src_y

     mov        edx, [esp + 8]        // dst_argb

-    mov        ecx, [esp + 12]       // pix

+    mov        ecx, [esp + 12]       // width

     pcmpeqb    xmm5, xmm5            // generate mask 0xff000000

     pslld      xmm5, 24

@@ -352,11 +320,11 @@

 #ifdef HAS_J400TOARGBROW_AVX2

 // Duplicates gray value 3 times and fills in alpha opaque.

 __declspec(naked)

-void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix) {

+void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width) {

   __asm {

     mov         eax, [esp + 4]        // src_y

     mov         edx, [esp + 8]        // dst_argb

-    mov         ecx, [esp + 12]       // pix

+    mov         ecx, [esp + 12]       // width

     vpcmpeqb    ymm5, ymm5, ymm5      // generate mask 0xff000000

     vpslld      ymm5, ymm5, 24

@@ -382,14 +350,14 @@

 #endif  // HAS_J400TOARGBROW_AVX2

 __declspec(naked)

-void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {

+void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {

   __asm {

     mov       eax, [esp + 4]   // src_rgb24

     mov       edx, [esp + 8]   // dst_argb

-    mov       ecx, [esp + 12]  // pix

+    mov       ecx, [esp + 12]  // width

     pcmpeqb   xmm5, xmm5       // generate mask 0xff000000

     pslld     xmm5, 24

-    movdqa    xmm4, kShuffleMaskRGB24ToARGB

+    movdqa    xmm4, xmmword ptr kShuffleMaskRGB24ToARGB

  convertloop:

     movdqu    xmm0, [eax]

@@ -421,14 +389,14 @@

 __declspec(naked)

 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,

-                        int pix) {

+                        int width) {

   __asm {

     mov       eax, [esp + 4]   // src_raw

     mov       edx, [esp + 8]   // dst_argb

-    mov       ecx, [esp + 12]  // pix

+    mov       ecx, [esp + 12]  // width

     pcmpeqb   xmm5, xmm5       // generate mask 0xff000000

     pslld     xmm5, 24

-    movdqa    xmm4, kShuffleMaskRAWToARGB

+    movdqa    xmm4, xmmword ptr kShuffleMaskRAWToARGB

  convertloop:

     movdqu    xmm0, [eax]

@@ -458,6 +426,34 @@

+__declspec(naked)  // Converts 8 pixels (24 bytes) per loop by swapping bytes 0 and 2 of each pixel.

+void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {

+  __asm {

+    mov       eax, [esp + 4]   // src_raw

+    mov       edx, [esp + 8]   // dst_rgb24

+    mov       ecx, [esp + 12]  // width

+    movdqa    xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0  // mask for output bytes 0..7

+    movdqa    xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1  // mask for output bytes 8..15

+    movdqa    xmm5, xmmword ptr kShuffleMaskRAWToRGB24_2  // mask for output bytes 16..23

+ convertloop:

+    movdqu    xmm0, [eax]       // source bytes 0..15

+    movdqu    xmm1, [eax + 4]   // source bytes 4..19 (overlapping window)

+    movdqu    xmm2, [eax + 8]   // source bytes 8..23 (overlapping window)

+    lea       eax, [eax + 24]   // advance source by 8 pixels

+    pshufb    xmm0, xmm3        // low 8 lanes hold output bytes 0..7

+    pshufb    xmm1, xmm4        // low 8 lanes hold output bytes 8..15

+    pshufb    xmm2, xmm5        // low 8 lanes hold output bytes 16..23

+    movq      qword ptr [edx], xmm0       // only the low 8 bytes of each result are stored

+    movq      qword ptr [edx + 8], xmm1

+    movq      qword ptr [edx + 16], xmm2

+    lea       edx, [edx + 24]   // advance destination by 8 pixels

+    sub       ecx, 8            // body runs before the count is tested

+    jg        convertloop       // presumably width is a multiple of 8 - confirm with callers

+    ret

+  }

+}

 // pmul method to replicate bits.

 // Math to replicate bits:

 // (v << 8) | (v << 3)

@@ -467,7 +463,7 @@

 // 20 instructions.

 __declspec(naked)

 void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,

-                          int pix) {

+                          int width) {

   __asm {

     mov       eax, 0x01080108  // generate multiplier to repeat 5 bits

     movd      xmm5, eax

@@ -485,7 +481,7 @@

     mov       eax, [esp + 4]   // src_rgb565

     mov       edx, [esp + 8]   // dst_argb

-    mov       ecx, [esp + 12]  // pix

+    mov       ecx, [esp + 12]  // width

     sub       edx, eax

     sub       edx, eax

@@ -523,13 +519,13 @@

 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3

 __declspec(naked)

 void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,

-                          int pix) {

+                          int width) {

   __asm {

     mov        eax, 0x01080108  // generate multiplier to repeat 5 bits

     vmovd      xmm5, eax

     vbroadcastss ymm5, xmm5

     mov        eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits

-    movd       xmm6, eax

+    vmovd      xmm6, eax

     vbroadcastss ymm6, xmm6

     vpcmpeqb   ymm3, ymm3, ymm3       // generate mask 0xf800f800 for Red

     vpsllw     ymm3, ymm3, 11

@@ -541,7 +537,7 @@

     mov        eax, [esp + 4]   // src_rgb565

     mov        edx, [esp + 8]   // dst_argb

-    mov        ecx, [esp + 12]  // pix

+    mov        ecx, [esp + 12]  // width

     sub        edx, eax

     sub        edx, eax

@@ -574,13 +570,13 @@

 #ifdef HAS_ARGB1555TOARGBROW_AVX2

 __declspec(naked)

 void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,

-                            int pix) {

+                            int width) {

   __asm {

     mov        eax, 0x01080108  // generate multiplier to repeat 5 bits

     vmovd      xmm5, eax

     vbroadcastss ymm5, xmm5

     mov        eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits

-    movd       xmm6, eax

+    vmovd      xmm6, eax

     vbroadcastss ymm6, xmm6

     vpcmpeqb   ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red

     vpsllw     ymm3, ymm3, 11

@@ -590,7 +586,7 @@

     mov        eax,  [esp + 4]   // src_argb1555

     mov        edx,  [esp + 8]   // dst_argb

-    mov        ecx,  [esp + 12]  // pix

+    mov        ecx,  [esp + 12]  // width

     sub        edx,  eax

     sub        edx,  eax

@@ -626,7 +622,7 @@

 #ifdef HAS_ARGB4444TOARGBROW_AVX2

 __declspec(naked)

 void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,

-                            int pix) {

+                            int width) {

   __asm {

     mov       eax,  0x0f0f0f0f  // generate mask 0x0f0f0f0f

     vmovd     xmm4, eax

@@ -634,7 +630,7 @@

     vpslld    ymm5, ymm4, 4     // 0xf0f0f0f0 for high nibbles

     mov       eax,  [esp + 4]   // src_argb4444

     mov       edx,  [esp + 8]   // dst_argb

-    mov       ecx,  [esp + 12]  // pix

+    mov       ecx,  [esp + 12]  // width

     sub       edx,  eax

     sub       edx,  eax

@@ -664,7 +660,7 @@

 // 24 instructions

 __declspec(naked)

 void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,

-                            int pix) {

+                            int width) {

   __asm {

     mov       eax, 0x01080108  // generate multiplier to repeat 5 bits

     movd      xmm5, eax

@@ -681,7 +677,7 @@

     mov       eax, [esp + 4]   // src_argb1555

     mov       edx, [esp + 8]   // dst_argb

-    mov       ecx, [esp + 12]  // pix

+    mov       ecx, [esp + 12]  // width

     sub       edx, eax

     sub       edx, eax

@@ -717,7 +713,7 @@

 // 18 instructions.

 __declspec(naked)

 void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,

-                            int pix) {

+                            int width) {

   __asm {

     mov       eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f

     movd      xmm4, eax

@@ -726,7 +722,7 @@

     pslld     xmm5, 4

     mov       eax, [esp + 4]   // src_argb4444

     mov       edx, [esp + 8]   // dst_argb

-    mov       ecx, [esp + 12]  // pix

+    mov       ecx, [esp + 12]  // width

     sub       edx, eax

     sub       edx, eax

@@ -754,12 +750,12 @@

 __declspec(naked)

-void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {

+void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) {

   __asm {

     mov       eax, [esp + 4]   // src_argb

     mov       edx, [esp + 8]   // dst_rgb

-    mov       ecx, [esp + 12]  // pix

-    movdqa    xmm6, kShuffleMaskARGBToRGB24

+    mov       ecx, [esp + 12]  // width

+    movdqa    xmm6, xmmword ptr kShuffleMaskARGBToRGB24

  convertloop:

     movdqu    xmm0, [eax]   // fetch 16 pixels of argb

@@ -792,12 +788,12 @@

 __declspec(naked)

-void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {

+void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) {

   __asm {

     mov       eax, [esp + 4]   // src_argb

     mov       edx, [esp + 8]   // dst_rgb

-    mov       ecx, [esp + 12]  // pix

-    movdqa    xmm6, kShuffleMaskARGBToRAW

+    mov       ecx, [esp + 12]  // width

+    movdqa    xmm6, xmmword ptr kShuffleMaskARGBToRAW

  convertloop:

     movdqu    xmm0, [eax]   // fetch 16 pixels of argb

@@ -829,13 +825,12 @@

-// 4 pixels

 __declspec(naked)

-void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {

+void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {

   __asm {

     mov       eax, [esp + 4]   // src_argb

     mov       edx, [esp + 8]   // dst_rgb

-    mov       ecx, [esp + 12]  // pix

+    mov       ecx, [esp + 12]  // width

     pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f

     psrld     xmm3, 27

     pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0

@@ -867,16 +862,15 @@

-// 8 pixels

 __declspec(naked)

 void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,

-                                const uint32 dither4, int pix) {

+                                const uint32 dither4, int width) {

   __asm {

     mov       eax, [esp + 4]   // src_argb

     mov       edx, [esp + 8]   // dst_rgb

     movd      xmm6, [esp + 12] // dither4

-    mov       ecx, [esp + 16]  // pix

+    mov       ecx, [esp + 16]  // width

     punpcklbw xmm6, xmm6       // make dither 16 bytes

     movdqa    xmm7, xmm6

     punpcklwd xmm6, xmm6

@@ -916,12 +910,12 @@

 #ifdef HAS_ARGBTORGB565DITHERROW_AVX2

 __declspec(naked)

 void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,

-                                const uint32 dither4, int pix) {

+                                const uint32 dither4, int width) {

   __asm {

     mov        eax, [esp + 4]      // src_argb

     mov        edx, [esp + 8]      // dst_rgb

     vbroadcastss xmm6, [esp + 12]  // dither4

-    mov        ecx, [esp + 16]     // pix

+    mov        ecx, [esp + 16]     // width

     vpunpcklbw xmm6, xmm6, xmm6    // make dither 32 bytes

     vpermq     ymm6, ymm6, 0xd8

     vpunpcklwd ymm6, ymm6, ymm6

@@ -958,11 +952,11 @@

 // TODO(fbarchard): Improve sign extension/packing.

 __declspec(naked)

-void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {

+void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {

   __asm {

     mov       eax, [esp + 4]   // src_argb

     mov       edx, [esp + 8]   // dst_rgb

-    mov       ecx, [esp + 12]  // pix

+    mov       ecx, [esp + 12]  // width

     pcmpeqb   xmm4, xmm4       // generate mask 0x0000001f

     psrld     xmm4, 27

     movdqa    xmm5, xmm4       // generate mask 0x000003e0

@@ -999,11 +993,11 @@

 __declspec(naked)

-void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {

+void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {

   __asm {

     mov       eax, [esp + 4]   // src_argb

     mov       edx, [esp + 8]   // dst_rgb

-    mov       ecx, [esp + 12]  // pix

+    mov       ecx, [esp + 12]  // width

     pcmpeqb   xmm4, xmm4       // generate mask 0xf000f000

     psllw     xmm4, 12

     movdqa    xmm3, xmm4       // generate mask 0x00f000f0

@@ -1029,11 +1023,11 @@

 #ifdef HAS_ARGBTORGB565ROW_AVX2

 __declspec(naked)

-void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {

+void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {

   __asm {

     mov        eax, [esp + 4]      // src_argb

     mov        edx, [esp + 8]      // dst_rgb

-    mov        ecx, [esp + 12]     // pix

+    mov        ecx, [esp + 12]     // width

     vpcmpeqb   ymm3, ymm3, ymm3    // generate mask 0x0000001f

     vpsrld     ymm3, ymm3, 27

     vpcmpeqb   ymm4, ymm4, ymm4    // generate mask 0x000007e0

@@ -1066,11 +1060,11 @@

 #ifdef HAS_ARGBTOARGB1555ROW_AVX2

 __declspec(naked)

-void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {

+void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {

   __asm {

     mov        eax, [esp + 4]      // src_argb

     mov        edx, [esp + 8]      // dst_rgb

-    mov        ecx, [esp + 12]     // pix

+    mov        ecx, [esp + 12]     // width

     vpcmpeqb   ymm4, ymm4, ymm4

     vpsrld     ymm4, ymm4, 27      // generate mask 0x0000001f

     vpslld     ymm5, ymm4, 5       // generate mask 0x000003e0

@@ -1106,11 +1100,11 @@

 #ifdef HAS_ARGBTOARGB4444ROW_AVX2

 __declspec(naked)

-void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {

+void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {

   __asm {

     mov        eax, [esp + 4]   // src_argb

     mov        edx, [esp + 8]   // dst_rgb

-    mov        ecx, [esp + 12]  // pix

+    mov        ecx, [esp + 12]  // width

     vpcmpeqb   ymm4, ymm4, ymm4   // generate mask 0xf000f000

     vpsllw     ymm4, ymm4, 12

     vpsrlw     ymm3, ymm4, 8      // generate mask 0x00f000f0

@@ -1137,13 +1131,13 @@

 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.

 __declspec(naked)

-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {

+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {

   __asm {

     mov        eax, [esp + 4]   /* src_argb */

     mov        edx, [esp + 8]   /* dst_y */

-    mov        ecx, [esp + 12]  /* pix */

-    movdqa     xmm4, kARGBToY

-    movdqa     xmm5, kAddY16

+    mov        ecx, [esp + 12]  /* width */

+    movdqa     xmm4, xmmword ptr kARGBToY

+    movdqa     xmm5, xmmword ptr kAddY16

  convertloop:

     movdqu     xmm0, [eax]

@@ -1172,13 +1166,13 @@

 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.

 // Same as ARGBToYRow but different coefficients, no add 16, but do rounding.

 __declspec(naked)

-void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {

+void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {

   __asm {

     mov        eax, [esp + 4]   /* src_argb */

     mov        edx, [esp + 8]   /* dst_y */

-    mov        ecx, [esp + 12]  /* pix */

-    movdqa     xmm4, kARGBToYJ

-    movdqa     xmm5, kAddYJ64

+    mov        ecx, [esp + 12]  /* width */

+    movdqa     xmm4, xmmword ptr kARGBToYJ

+    movdqa     xmm5, xmmword ptr kAddYJ64

  convertloop:

     movdqu     xmm0, [eax]

@@ -1213,14 +1207,14 @@

 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.

 __declspec(naked)

-void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {

+void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {

   __asm {

     mov        eax, [esp + 4]   /* src_argb */

     mov        edx, [esp + 8]   /* dst_y */

-    mov        ecx, [esp + 12]  /* pix */

-    vbroadcastf128 ymm4, kARGBToY

-    vbroadcastf128 ymm5, kAddY16

-    vmovdqu    ymm6, kPermdARGBToY_AVX

+    mov        ecx, [esp + 12]  /* width */

+    vbroadcastf128 ymm4, xmmword ptr kARGBToY

+    vbroadcastf128 ymm5, xmmword ptr kAddY16

+    vmovdqu    ymm6, ymmword ptr kPermdARGBToY_AVX

  convertloop:

     vmovdqu    ymm0, [eax]

@@ -1252,14 +1246,14 @@

 #ifdef HAS_ARGBTOYJROW_AVX2

 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.

 __declspec(naked)

-void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {

+void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {

   __asm {

     mov        eax, [esp + 4]   /* src_argb */

     mov        edx, [esp + 8]   /* dst_y */

-    mov        ecx, [esp + 12]  /* pix */

-    vbroadcastf128 ymm4, kARGBToYJ

-    vbroadcastf128 ymm5, kAddYJ64

-    vmovdqu    ymm6, kPermdARGBToY_AVX

+    mov        ecx, [esp + 12]  /* width */

+    vbroadcastf128 ymm4, xmmword ptr kARGBToYJ

+    vbroadcastf128 ymm5, xmmword ptr kAddYJ64

+    vmovdqu    ymm6, ymmword ptr kPermdARGBToY_AVX

  convertloop:

     vmovdqu    ymm0, [eax]

@@ -1291,13 +1285,13 @@

 #endif  //  HAS_ARGBTOYJROW_AVX2

 __declspec(naked)

-void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {

+void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {

   __asm {

     mov        eax, [esp + 4]   /* src_argb */

     mov        edx, [esp + 8]   /* dst_y */

-    mov        ecx, [esp + 12]  /* pix */

-    movdqa     xmm4, kBGRAToY

-    movdqa     xmm5, kAddY16

+    mov        ecx, [esp + 12]  /* width */

+    movdqa     xmm4, xmmword ptr kBGRAToY

+    movdqa     xmm5, xmmword ptr kAddY16

  convertloop:

     movdqu     xmm0, [eax]

@@ -1324,13 +1318,13 @@

 __declspec(naked)

-void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {

+void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {

   __asm {

     mov        eax, [esp + 4]   /* src_argb */

     mov        edx, [esp + 8]   /* dst_y */

-    mov        ecx, [esp + 12]  /* pix */

-    movdqa     xmm4, kABGRToY

-    movdqa     xmm5, kAddY16

+    mov        ecx, [esp + 12]  /* width */

+    movdqa     xmm4, xmmword ptr kABGRToY

+    movdqa     xmm5, xmmword ptr kAddY16

  convertloop:

     movdqu     xmm0, [eax]

@@ -1357,13 +1351,13 @@

 __declspec(naked)

-void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {

+void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {

   __asm {

     mov        eax, [esp + 4]   /* src_argb */

     mov        edx, [esp + 8]   /* dst_y */

-    mov        ecx, [esp + 12]  /* pix */

-    movdqa     xmm4, kRGBAToY

-    movdqa     xmm5, kAddY16

+    mov        ecx, [esp + 12]  /* width */

+    movdqa     xmm4, xmmword ptr kRGBAToY

+    movdqa     xmm5, xmmword ptr kAddY16

  convertloop:

     movdqu     xmm0, [eax]

@@ -1399,10 +1393,10 @@

     mov        esi, [esp + 8 + 8]   // src_stride_argb

     mov        edx, [esp + 8 + 12]  // dst_u

     mov        edi, [esp + 8 + 16]  // dst_v

-    mov        ecx, [esp + 8 + 20]  // pix

-    movdqa     xmm5, kAddUV128

-    movdqa     xmm6, kARGBToV

-    movdqa     xmm7, kARGBToU

+    mov        ecx, [esp + 8 + 20]  // width

+    movdqa     xmm5, xmmword ptr kAddUV128

+    movdqa     xmm6, xmmword ptr kARGBToV

+    movdqa     xmm7, xmmword ptr kARGBToU

     sub        edi, edx             // stride from u to v

  convertloop:

@@ -1469,10 +1463,10 @@

     mov        esi, [esp + 8 + 8]   // src_stride_argb

     mov        edx, [esp + 8 + 12]  // dst_u

     mov        edi, [esp + 8 + 16]  // dst_v

-    mov        ecx, [esp + 8 + 20]  // pix

-    movdqa     xmm5, kAddUVJ128

-    movdqa     xmm6, kARGBToVJ

-    movdqa     xmm7, kARGBToUJ

+    mov        ecx, [esp + 8 + 20]  // width

+    movdqa     xmm5, xmmword ptr kAddUVJ128

+    movdqa     xmm6, xmmword ptr kARGBToVJ

+    movdqa     xmm7, xmmword ptr kARGBToUJ

     sub        edi, edx             // stride from u to v

  convertloop:

@@ -1511,7 +1505,7 @@

     pmaddubsw  xmm3, xmm6

     phaddw     xmm0, xmm2

     phaddw     xmm1, xmm3

-    paddw      xmm0, xmm5            // +.5 rounding -> unsigned

+    paddw      xmm0, xmm5  // +.5 rounding -> unsigned

     paddw      xmm1, xmm5

     psraw      xmm0, 8

     psraw      xmm1, 8

@@ -1541,10 +1535,10 @@

     mov        esi, [esp + 8 + 8]   // src_stride_argb

     mov        edx, [esp + 8 + 12]  // dst_u

     mov        edi, [esp + 8 + 16]  // dst_v

-    mov        ecx, [esp + 8 + 20]  // pix

-    vbroadcastf128 ymm5, kAddUV128

-    vbroadcastf128 ymm6, kARGBToV

-    vbroadcastf128 ymm7, kARGBToU

+    mov        ecx, [esp + 8 + 20]  // width

+    vbroadcastf128 ymm5, xmmword ptr kAddUV128

+    vbroadcastf128 ymm6, xmmword ptr kARGBToV

+    vbroadcastf128 ymm7, xmmword ptr kARGBToU

     sub        edi, edx             // stride from u to v

  convertloop:

@@ -1578,7 +1572,7 @@

     vpsraw     ymm0, ymm0, 8

     vpacksswb  ymm0, ymm1, ymm0  // mutates

     vpermq     ymm0, ymm0, 0xd8  // For vpacksswb

-    vpshufb    ymm0, ymm0, kShufARGBToUV_AVX  // For vshufps + vphaddw

+    vpshufb    ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX  // for vshufps/vphaddw

     vpaddb     ymm0, ymm0, ymm5  // -> unsigned

     // step 3 - store 16 U and 16 V values

@@ -1596,7 +1590,74 @@

 #endif  // HAS_ARGBTOUVROW_AVX2

+#ifdef HAS_ARGBTOUVJROW_AVX2  // Averages 32x2 ARGB pixels into 16 U and 16 V using the J constants.

 __declspec(naked)

+void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,

+                      uint8* dst_u, uint8* dst_v, int width) {

+  __asm {

+    push       esi

+    push       edi

+    mov        eax, [esp + 8 + 4]   // src_argb

+    mov        esi, [esp + 8 + 8]   // src_stride_argb

+    mov        edx, [esp + 8 + 12]  // dst_u

+    mov        edi, [esp + 8 + 16]  // dst_v

+    mov        ecx, [esp + 8 + 20]  // width

+    vbroadcastf128 ymm5, xmmword ptr kAddUVJ128  // J constants, matching the SSSE3 UVJ row

+    vbroadcastf128 ymm6, xmmword ptr kARGBToVJ

+    vbroadcastf128 ymm7, xmmword ptr kARGBToUJ

+    sub        edi, edx             // stride from u to v

+ convertloop:

+    /* step 1 - subsample 32x2 argb pixels to 16x1 */

+    vmovdqu    ymm0, [eax]

+    vmovdqu    ymm1, [eax + 32]

+    vmovdqu    ymm2, [eax + 64]

+    vmovdqu    ymm3, [eax + 96]

+    vpavgb     ymm0, ymm0, [eax + esi]  // average with the row src_stride_argb bytes away

+    vpavgb     ymm1, ymm1, [eax + esi + 32]

+    vpavgb     ymm2, ymm2, [eax + esi + 64]

+    vpavgb     ymm3, ymm3, [eax + esi + 96]

+    lea        eax,  [eax + 128]  // 32 ARGB pixels consumed per row

+    vshufps    ymm4, ymm0, ymm1, 0x88

+    vshufps    ymm0, ymm0, ymm1, 0xdd

+    vpavgb     ymm0, ymm0, ymm4  // mutated by vshufps

+    vshufps    ymm4, ymm2, ymm3, 0x88

+    vshufps    ymm2, ymm2, ymm3, 0xdd

+    vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps

+    // step 2 - convert to U and V

+    // from here down is very similar to Y code except

+    // instead of 32 different pixels, it's 16 pixels of U and 16 of V

+    vpmaddubsw ymm1, ymm0, ymm7  // U

+    vpmaddubsw ymm3, ymm2, ymm7

+    vpmaddubsw ymm0, ymm0, ymm6  // V

+    vpmaddubsw ymm2, ymm2, ymm6

+    vphaddw    ymm1, ymm1, ymm3  // mutates

+    vphaddw    ymm0, ymm0, ymm2

+    vpaddw     ymm1, ymm1, ymm5  // +.5 rounding -> unsigned

+    vpaddw     ymm0, ymm0, ymm5

+    vpsraw     ymm1, ymm1, 8

+    vpsraw     ymm0, ymm0, 8

+    vpacksswb  ymm0, ymm1, ymm0  // mutates

+    vpermq     ymm0, ymm0, 0xd8  // For vpacksswb

+    vpshufb    ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX  // for vshufps/vphaddw

+    // step 3 - store 16 U and 16 V values

+    vextractf128 [edx], ymm0, 0 // U

+    vextractf128 [edx + edi], ymm0, 1 // V

+    lea        edx, [edx + 16]

+    sub        ecx, 32           // 32 source pixels handled per iteration

+    jg         convertloop

+    pop        edi

+    pop        esi

+    vzeroupper

+    ret

+  }

+}

+#endif  // HAS_ARGBTOUVJROW_AVX2

+__declspec(naked)

 void ARGBToUV444Row_SSSE3(const uint8* src_argb0,

                           uint8* dst_u, uint8* dst_v, int width) {

   __asm {

@@ -1604,10 +1665,10 @@

     mov        eax, [esp + 4 + 4]   // src_argb

     mov        edx, [esp + 4 + 8]   // dst_u

     mov        edi, [esp + 4 + 12]  // dst_v

-    mov        ecx, [esp + 4 + 16]  // pix

-    movdqa     xmm5, kAddUV128

-    movdqa     xmm6, kARGBToV

-    movdqa     xmm7, kARGBToU

+    mov        ecx, [esp + 4 + 16]  // width

+    movdqa     xmm5, xmmword ptr kAddUV128

+    movdqa     xmm6, xmmword ptr kARGBToV

+    movdqa     xmm7, xmmword ptr kARGBToU

     sub        edi, edx             // stride from u to v

  convertloop:

@@ -1654,64 +1715,6 @@

 __declspec(naked)

-void ARGBToUV422Row_SSSE3(const uint8* src_argb0,

-                          uint8* dst_u, uint8* dst_v, int width) {

-  __asm {

-    push       edi

-    mov        eax, [esp + 4 + 4]   // src_argb

-    mov        edx, [esp + 4 + 8]   // dst_u

-    mov        edi, [esp + 4 + 12]  // dst_v

-    mov        ecx, [esp + 4 + 16]  // pix

-    movdqa     xmm5, kAddUV128

-    movdqa     xmm6, kARGBToV

-    movdqa     xmm7, kARGBToU

-    sub        edi, edx             // stride from u to v

- convertloop:

-    /* step 1 - subsample 16x2 argb pixels to 8x1 */

-    movdqu     xmm0, [eax]

-    movdqu     xmm1, [eax + 16]

-    movdqu     xmm2, [eax + 32]

-    movdqu     xmm3, [eax + 48]

-    lea        eax,  [eax + 64]

-    movdqa     xmm4, xmm0

-    shufps     xmm0, xmm1, 0x88

-    shufps     xmm4, xmm1, 0xdd

-    pavgb      xmm0, xmm4

-    movdqa     xmm4, xmm2

-    shufps     xmm2, xmm3, 0x88

-    shufps     xmm4, xmm3, 0xdd

-    pavgb      xmm2, xmm4

-    // step 2 - convert to U and V

-    // from here down is very similar to Y code except

-    // instead of 16 different pixels, its 8 pixels of U and 8 of V

-    movdqa     xmm1, xmm0

-    movdqa     xmm3, xmm2

-    pmaddubsw  xmm0, xmm7  // U

-    pmaddubsw  xmm2, xmm7

-    pmaddubsw  xmm1, xmm6  // V

-    pmaddubsw  xmm3, xmm6

-    phaddw     xmm0, xmm2

-    phaddw     xmm1, xmm3

-    psraw      xmm0, 8

-    psraw      xmm1, 8

-    packsswb   xmm0, xmm1

-    paddb      xmm0, xmm5            // -> unsigned

-    // step 3 - store 8 U and 8 V values

-    movlps     qword ptr [edx], xmm0 // U

-    movhps     qword ptr [edx + edi], xmm0 // V

-    lea        edx, [edx + 8]

-    sub        ecx, 16

-    jg         convertloop

-    pop        edi

-    ret

-  }

-}

-__declspec(naked)

 void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,

                        uint8* dst_u, uint8* dst_v, int width) {

   __asm {

@@ -1721,10 +1724,10 @@

     mov        esi, [esp + 8 + 8]   // src_stride_argb

     mov        edx, [esp + 8 + 12]  // dst_u

     mov        edi, [esp + 8 + 16]  // dst_v

-    mov        ecx, [esp + 8 + 20]  // pix

-    movdqa     xmm5, kAddUV128

-    movdqa     xmm6, kBGRAToV

-    movdqa     xmm7, kBGRAToU

+    mov        ecx, [esp + 8 + 20]  // width

+    movdqa     xmm5, xmmword ptr kAddUV128

+    movdqa     xmm6, xmmword ptr kBGRAToV

+    movdqa     xmm7, xmmword ptr kBGRAToU

     sub        edi, edx             // stride from u to v

  convertloop:

@@ -1791,10 +1794,10 @@

     mov        esi, [esp + 8 + 8]   // src_stride_argb

     mov        edx, [esp + 8 + 12]  // dst_u

     mov        edi, [esp + 8 + 16]  // dst_v

-    mov        ecx, [esp + 8 + 20]  // pix

-    movdqa     xmm5, kAddUV128

-    movdqa     xmm6, kABGRToV

-    movdqa     xmm7, kABGRToU

+    mov        ecx, [esp + 8 + 20]  // width

+    movdqa     xmm5, xmmword ptr kAddUV128

+    movdqa     xmm6, xmmword ptr kABGRToV

+    movdqa     xmm7, xmmword ptr kABGRToU

     sub        edi, edx             // stride from u to v

  convertloop:

@@ -1861,10 +1864,10 @@

     mov        esi, [esp + 8 + 8]   // src_stride_argb

     mov        edx, [esp + 8 + 12]  // dst_u

     mov        edi, [esp + 8 + 16]  // dst_v

-    mov        ecx, [esp + 8 + 20]  // pix

-    movdqa     xmm5, kAddUV128

-    movdqa     xmm6, kRGBAToV

-    movdqa     xmm7, kRGBAToU

+    mov        ecx, [esp + 8 + 20]  // width

+    movdqa     xmm5, xmmword ptr kAddUV128

+    movdqa     xmm6, xmmword ptr kRGBAToV

+    movdqa     xmm7, xmmword ptr kRGBAToU

     sub        edi, edx             // stride from u to v

  convertloop:

@@ -1924,33 +1927,62 @@

 // Read 16 UV from 444

 #define READYUV444_AVX2 __asm {                                                \

-    __asm vmovdqu    xmm0, [esi]                  /* U */         /* NOLINT */ \

-    __asm vmovdqu    xmm1, [esi + edi]            /* V */         /* NOLINT */ \

+    __asm vmovdqu    xmm0, [esi]                  /* U */                      \

+    __asm vmovdqu    xmm1, [esi + edi]            /* V */                      \

     __asm lea        esi,  [esi + 16]                                          \

     __asm vpermq     ymm0, ymm0, 0xd8                                          \

     __asm vpermq     ymm1, ymm1, 0xd8                                          \

     __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \

+    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \

+    __asm vpermq     ymm4, ymm4, 0xd8                                          \

+    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \

+    __asm lea        eax, [eax + 16]                                           \

 // Read 8 UV from 422, upsample to 16 UV.

 #define READYUV422_AVX2 __asm {                                                \

-    __asm vmovq      xmm0, qword ptr [esi]        /* U */         /* NOLINT */ \

-    __asm vmovq      xmm1, qword ptr [esi + edi]  /* V */         /* NOLINT */ \

+    __asm vmovq      xmm0, qword ptr [esi]        /* U */                      \

+    __asm vmovq      xmm1, qword ptr [esi + edi]  /* V */                      \

     __asm lea        esi,  [esi + 8]                                           \

     __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \

     __asm vpermq     ymm0, ymm0, 0xd8                                          \

     __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \

+    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \

+    __asm vpermq     ymm4, ymm4, 0xd8                                          \

+    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \

+    __asm lea        eax, [eax + 16]                                           \

+// Read 8 U and 8 V from 422 and upsample to 16 UV, with 16 Y (eax) and 16 Alpha (ebp).

+#define READYUVA422_AVX2 __asm {                                               \

+    __asm vmovq      xmm0, qword ptr [esi]        /* U */                      \

+    __asm vmovq      xmm1, qword ptr [esi + edi]  /* V */                      \

+    __asm lea        esi,  [esi + 8]                                           \

+    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \

+    __asm vpermq     ymm0, ymm0, 0xd8                                          \

+    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \

+    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \

+    __asm vpermq     ymm4, ymm4, 0xd8                                          \

+    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \

+    __asm lea        eax, [eax + 16]                                           \

+    __asm vmovdqu    xmm5, [ebp]                  /* A */                      \

+    __asm vpermq     ymm5, ymm5, 0xd8                                          \

+    __asm lea        ebp, [ebp + 16]                                           \

+  }

 // Read 4 UV from 411, upsample to 16 UV.

 #define READYUV411_AVX2 __asm {                                                \

-    __asm vmovd      xmm0, dword ptr [esi]        /* U */         /* NOLINT */ \

-    __asm vmovd      xmm1, dword ptr [esi + edi]  /* V */         /* NOLINT */ \

+    __asm vmovd      xmm0, dword ptr [esi]        /* U */                      \

+    __asm vmovd      xmm1, dword ptr [esi + edi]  /* V */                      \

     __asm lea        esi,  [esi + 4]                                           \

     __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \

     __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \

     __asm vpermq     ymm0, ymm0, 0xd8                                          \

     __asm vpunpckldq ymm0, ymm0, ymm0             /* UVUVUVUV (upsample) */    \

+    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \

+    __asm vpermq     ymm4, ymm4, 0xd8                                          \

+    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \

+    __asm lea        eax, [eax + 16]                                           \

 // Read 8 UV from NV12, upsample to 16 UV.

@@ -1959,29 +1991,58 @@

     __asm lea        esi,  [esi + 16]                                          \

     __asm vpermq     ymm0, ymm0, 0xd8                                          \

     __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \

+    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \

+    __asm vpermq     ymm4, ymm4, 0xd8                                          \

+    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \

+    __asm lea        eax, [eax + 16]                                           \

+// Read 8 VU from NV21, upsample to 16 UV.

+#define READNV21_AVX2 __asm {                                                  \

+    __asm vmovdqu    xmm0, [esi]                  /* VU */                     \

+    __asm lea        esi,  [esi + 16]                                          \

+    __asm vpermq     ymm0, ymm0, 0xd8                                          \

+    __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleNV21                      \

+    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \

+    __asm vpermq     ymm4, ymm4, 0xd8                                          \

+    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \

+    __asm lea        eax, [eax + 16]                                           \

+  }

+// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.

+#define READYUY2_AVX2 __asm {                                                  \

+    __asm vmovdqu    ymm4, [eax]          /* YUY2 */                           \

+    __asm vpshufb    ymm4, ymm4, ymmword ptr kShuffleYUY2Y                     \

+    __asm vmovdqu    ymm0, [eax]          /* UV */                             \

+    __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleYUY2UV                    \

+    __asm lea        eax, [eax + 32]                                           \

+  }

+// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.

+#define READUYVY_AVX2 __asm {                                                  \

+    __asm vmovdqu    ymm4, [eax]          /* UYVY */                           \

+    __asm vpshufb    ymm4, ymm4, ymmword ptr kShuffleUYVYY                     \

+    __asm vmovdqu    ymm0, [eax]          /* UV */                             \

+    __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleUYVYUV                    \

+    __asm lea        eax, [eax + 32]                                           \

+  }

 // Convert 16 pixels: 16 UV and 16 Y.

 #define YUVTORGB_AVX2(YuvConstants) __asm {                                    \

-    /* Step 1: Find 8 UV contributions to 16 R,G,B values */                   \

-    __asm vpmaddubsw ymm2, ymm0, YuvConstants.kUVToR        /* scale R UV */   \

-    __asm vpmaddubsw ymm1, ymm0, YuvConstants.kUVToG        /* scale G UV */   \

-    __asm vpmaddubsw ymm0, ymm0, YuvConstants.kUVToB        /* scale B UV */   \

-    __asm vmovdqu    ymm3, YuvConstants.kUVBiasR                               \

+    __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\

+    __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\

+    __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\

+    __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASR]               \

     __asm vpsubw     ymm2, ymm3, ymm2                                          \

-    __asm vmovdqu    ymm3, YuvConstants.kUVBiasG                               \

+    __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASG]               \

     __asm vpsubw     ymm1, ymm3, ymm1                                          \

-    __asm vmovdqu    ymm3, YuvConstants.kUVBiasB                               \

+    __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASB]               \

     __asm vpsubw     ymm0, ymm3, ymm0                                          \

     /* Step 2: Find Y contribution to 16 R,G,B values */                       \

-    __asm vmovdqu    xmm3, [eax]                  /* NOLINT */                 \

-    __asm lea        eax, [eax + 16]                                           \

-    __asm vpermq     ymm3, ymm3, 0xd8                                          \

-    __asm vpunpcklbw ymm3, ymm3, ymm3                                          \

-    __asm vpmulhuw   ymm3, ymm3, YuvConstants.kYToRgb                          \

-    __asm vpaddsw    ymm0, ymm0, ymm3           /* B += Y */                   \

-    __asm vpaddsw    ymm1, ymm1, ymm3           /* G += Y */                   \

-    __asm vpaddsw    ymm2, ymm2, ymm3           /* R += Y */                   \

+    __asm vpmulhuw   ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB]          \

+    __asm vpaddsw    ymm0, ymm0, ymm4           /* B += Y */                   \

+    __asm vpaddsw    ymm1, ymm1, ymm4           /* G += Y */                   \

+    __asm vpaddsw    ymm2, ymm2, ymm4           /* R += Y */                   \

     __asm vpsraw     ymm0, ymm0, 6                                             \

     __asm vpsraw     ymm1, ymm1, 6                                             \

     __asm vpsraw     ymm2, ymm2, 6                                             \

@@ -1992,7 +2053,6 @@

 // Store 16 ARGB values.

 #define STOREARGB_AVX2 __asm {                                                 \

-    /* Step 3: Weave into ARGB */                                              \

     __asm vpunpcklbw ymm0, ymm0, ymm1           /* BG */                       \

     __asm vpermq     ymm0, ymm0, 0xd8                                          \

     __asm vpunpcklbw ymm2, ymm2, ymm5           /* RA */                       \

@@ -2004,6 +2064,19 @@

     __asm lea        edx,  [edx + 64]                                          \

+// Store 16 RGBA values.

+#define STORERGBA_AVX2 __asm {                                                 \

+    __asm vpunpcklbw ymm1, ymm1, ymm2           /* GR */                       \

+    __asm vpermq     ymm1, ymm1, 0xd8                                          \

+    __asm vpunpcklbw ymm2, ymm5, ymm0           /* AB */                       \

+    __asm vpermq     ymm2, ymm2, 0xd8                                          \

+    __asm vpunpcklwd ymm0, ymm2, ymm1           /* ABGR first 8 pixels */      \

+    __asm vpunpckhwd ymm1, ymm2, ymm1           /* ABGR next 8 pixels */       \

+    __asm vmovdqu    [edx], ymm0                                               \

+    __asm vmovdqu    [edx + 32], ymm1                                          \

+    __asm lea        edx,  [edx + 64]                                          \

+  }

 #ifdef HAS_I422TOARGBROW_AVX2

 // 16 pixels

 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).

@@ -2012,26 +2085,30 @@

                         const uint8* u_buf,

                         const uint8* v_buf,

                         uint8* dst_argb,

+                        const struct YuvConstants* yuvconstants,

                         int width) {

   __asm {

     push       esi

     push       edi

-    mov        eax, [esp + 8 + 4]   // Y

-    mov        esi, [esp + 8 + 8]   // U

-    mov        edi, [esp + 8 + 12]  // V

-    mov        edx, [esp + 8 + 16]  // argb

-    mov        ecx, [esp + 8 + 20]  // width

+    push       ebx

+    mov        eax, [esp + 12 + 4]   // Y

+    mov        esi, [esp + 12 + 8]   // U

+    mov        edi, [esp + 12 + 12]  // V

+    mov        edx, [esp + 12 + 16]  // argb

+    mov        ebx, [esp + 12 + 20]  // yuvconstants

+    mov        ecx, [esp + 12 + 24]  // width

     sub        edi, esi

     vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha

  convertloop:

     READYUV422_AVX2

-    YUVTORGB_AVX2(kYuvConstants)

+    YUVTORGB_AVX2(ebx)

     STOREARGB_AVX2

     sub        ecx, 16

     jg         convertloop

+    pop        ebx

     pop        edi

     pop        esi

     vzeroupper

@@ -2040,34 +2117,41 @@

 #endif  // HAS_I422TOARGBROW_AVX2

-#ifdef HAS_J422TOARGBROW_AVX2

+#ifdef HAS_I422ALPHATOARGBROW_AVX2

 // 16 pixels

-// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).

+// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.

 __declspec(naked)

-void J422ToARGBRow_AVX2(const uint8* y_buf,

-                        const uint8* u_buf,

-                        const uint8* v_buf,

-                        uint8* dst_argb,

-                        int width) {

+void I422AlphaToARGBRow_AVX2(const uint8* y_buf,

+                             const uint8* u_buf,

+                             const uint8* v_buf,

+                             const uint8* a_buf,

+                             uint8* dst_argb,

+                             const struct YuvConstants* yuvconstants,

+                             int width) {

   __asm {

     push       esi

     push       edi

-    mov        eax, [esp + 8 + 4]   // Y

-    mov        esi, [esp + 8 + 8]   // U

-    mov        edi, [esp + 8 + 12]  // V

-    mov        edx, [esp + 8 + 16]  // argb

-    mov        ecx, [esp + 8 + 20]  // width

+    push       ebx

+    push       ebp

+    mov        eax, [esp + 16 + 4]   // Y

+    mov        esi, [esp + 16 + 8]   // U

+    mov        edi, [esp + 16 + 12]  // V

+    mov        ebp, [esp + 16 + 16]  // A

+    mov        edx, [esp + 16 + 20]  // argb

+    mov        ebx, [esp + 16 + 24]  // yuvconstants

+    mov        ecx, [esp + 16 + 28]  // width

     sub        edi, esi

-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha

  convertloop:

-    READYUV422_AVX2

-    YUVTORGB_AVX2(kYuvJConstants)

+    READYUVA422_AVX2

+    YUVTORGB_AVX2(ebx)

     STOREARGB_AVX2

     sub        ecx, 16

     jg         convertloop

+    pop        ebp

+    pop        ebx

     pop        edi

     pop        esi

     vzeroupper

@@ -2074,7 +2158,7 @@

ret

-#endif  // HAS_J422TOARGBROW_AVX2

+#endif  // HAS_I422ALPHATOARGBROW_AVX2

 #ifdef HAS_I444TOARGBROW_AVX2

 // 16 pixels

@@ -2084,26 +2168,29 @@

                         const uint8* u_buf,

                         const uint8* v_buf,

                         uint8* dst_argb,

+                        const struct YuvConstants* yuvconstants,

                         int width) {

   __asm {

     push       esi

     push       edi

-    mov        eax, [esp + 8 + 4]   // Y

-    mov        esi, [esp + 8 + 8]   // U

-    mov        edi, [esp + 8 + 12]  // V

-    mov        edx, [esp + 8 + 16]  // argb

-    mov        ecx, [esp + 8 + 20]  // width

+    push       ebx

+    mov        eax, [esp + 12 + 4]   // Y

+    mov        esi, [esp + 12 + 8]   // U

+    mov        edi, [esp + 12 + 12]  // V

+    mov        edx, [esp + 12 + 16]  // argb

+    mov        ebx, [esp + 12 + 20]  // yuvconstants

+    mov        ecx, [esp + 12 + 24]  // width

     sub        edi, esi

     vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha

  convertloop:

     READYUV444_AVX2

-    YUVTORGB_AVX2(kYuvConstants)

+    YUVTORGB_AVX2(ebx)

     STOREARGB_AVX2

     sub        ecx, 16

     jg         convertloop

+    pop        ebx

     pop        edi

     pop        esi

     vzeroupper

@@ -2120,26 +2207,30 @@

                         const uint8* u_buf,

                         const uint8* v_buf,

                         uint8* dst_argb,

+                        const struct YuvConstants* yuvconstants,

                         int width) {

   __asm {

     push       esi

     push       edi

-    mov        eax, [esp + 8 + 4]   // Y

-    mov        esi, [esp + 8 + 8]   // U

-    mov        edi, [esp + 8 + 12]  // V

-    mov        edx, [esp + 8 + 16]  // argb

-    mov        ecx, [esp + 8 + 20]  // width

+    push       ebx

+    mov        eax, [esp + 12 + 4]   // Y

+    mov        esi, [esp + 12 + 8]   // U

+    mov        edi, [esp + 12 + 12]  // V

+    mov        edx, [esp + 12 + 16]  // argb

+    mov        ebx, [esp + 12 + 20]  // yuvconstants

+    mov        ecx, [esp + 12 + 24]  // width

     sub        edi, esi

     vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha

  convertloop:

     READYUV411_AVX2

-    YUVTORGB_AVX2(kYuvConstants)

+    YUVTORGB_AVX2(ebx)

     STOREARGB_AVX2

     sub        ecx, 16

     jg         convertloop

+    pop        ebx

     pop        edi

     pop        esi

     vzeroupper

@@ -2155,23 +2246,27 @@

 void NV12ToARGBRow_AVX2(const uint8* y_buf,

                         const uint8* uv_buf,

                         uint8* dst_argb,

+                        const struct YuvConstants* yuvconstants,

                         int width) {

   __asm {

     push       esi

-    mov        eax, [esp + 4 + 4]   // Y

-    mov        esi, [esp + 4 + 8]   // UV

-    mov        edx, [esp + 4 + 12]  // argb

-    mov        ecx, [esp + 4 + 16]  // width

+    push       ebx

+    mov        eax, [esp + 8 + 4]   // Y

+    mov        esi, [esp + 8 + 8]   // UV

+    mov        edx, [esp + 8 + 12]  // argb

+    mov        ebx, [esp + 8 + 16]  // yuvconstants

+    mov        ecx, [esp + 8 + 20]  // width

     vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha

  convertloop:

     READNV12_AVX2

-    YUVTORGB_AVX2(kYuvConstants)

+    YUVTORGB_AVX2(ebx)

     STOREARGB_AVX2

     sub        ecx, 16

     jg         convertloop

+    pop        ebx

     pop        esi

     vzeroupper

ret

@@ -2181,28 +2276,32 @@

 #ifdef HAS_NV21TOARGBROW_AVX2

 // 16 pixels.

-// 8 VU values upsampled to 16 VU, mixed with 16 Y producing 16 ARGB (64 bytes).

+// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).

 __declspec(naked)

 void NV21ToARGBRow_AVX2(const uint8* y_buf,

-                        const uint8* uv_buf,

+                        const uint8* vu_buf,

                         uint8* dst_argb,

+                        const struct YuvConstants* yuvconstants,

                         int width) {

   __asm {

     push       esi

-    mov        eax, [esp + 4 + 4]   // Y

-    mov        esi, [esp + 4 + 8]   // UV

-    mov        edx, [esp + 4 + 12]  // argb

-    mov        ecx, [esp + 4 + 16]  // width

+    push       ebx

+    mov        eax, [esp + 8 + 4]   // Y

+    mov        esi, [esp + 8 + 8]   // VU

+    mov        edx, [esp + 8 + 12]  // argb

+    mov        ebx, [esp + 8 + 16]  // yuvconstants

+    mov        ecx, [esp + 8 + 20]  // width

     vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha

  convertloop:

-    READNV12_AVX2

-    YUVTORGB_AVX2(kYvuConstants)

+    READNV21_AVX2

+    YUVTORGB_AVX2(ebx)

     STOREARGB_AVX2

     sub        ecx, 16

     jg         convertloop

+    pop        ebx

     pop        esi

     vzeroupper

ret

@@ -2210,136 +2309,100 @@

 #endif  // HAS_NV21TOARGBROW_AVX2

-#ifdef HAS_I422TOBGRAROW_AVX2

-// 16 pixels

-// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).

-// TODO(fbarchard): Use macros to reduce duplicate code.  See SSSE3.

+#ifdef HAS_YUY2TOARGBROW_AVX2

+// 16 pixels.

+// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).

 __declspec(naked)

-void I422ToBGRARow_AVX2(const uint8* y_buf,

-                        const uint8* u_buf,

-                        const uint8* v_buf,

+void YUY2ToARGBRow_AVX2(const uint8* src_yuy2,

                         uint8* dst_argb,

+                        const struct YuvConstants* yuvconstants,

                         int width) {

   __asm {

-    push       esi

-    push       edi

-    mov        eax, [esp + 8 + 4]   // Y

-    mov        esi, [esp + 8 + 8]   // U

-    mov        edi, [esp + 8 + 12]  // V

-    mov        edx, [esp + 8 + 16]  // argb

-    mov        ecx, [esp + 8 + 20]  // width

-    sub        edi, esi

+    push       ebx

+    mov        eax, [esp + 4 + 4]   // yuy2

+    mov        edx, [esp + 4 + 8]   // argb

+    mov        ebx, [esp + 4 + 12]  // yuvconstants

+    mov        ecx, [esp + 4 + 16]  // width

     vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha

  convertloop:

-    READYUV422_AVX2

-    YUVTORGB_AVX2(kYuvConstants)

+    READYUY2_AVX2

+    YUVTORGB_AVX2(ebx)

+    STOREARGB_AVX2

-    // Step 3: Weave into BGRA

-    vpunpcklbw ymm1, ymm1, ymm0           // GB

-    vpermq     ymm1, ymm1, 0xd8

-    vpunpcklbw ymm2, ymm5, ymm2           // AR

-    vpermq     ymm2, ymm2, 0xd8

-    vpunpcklwd ymm0, ymm2, ymm1           // ARGB first 8 pixels

-    vpunpckhwd ymm2, ymm2, ymm1           // ARGB next 8 pixels

-    vmovdqu    [edx], ymm0

-    vmovdqu    [edx + 32], ymm2

-    lea        edx,  [edx + 64]

     sub        ecx, 16

     jg         convertloop

-    pop        edi

-    pop        esi

+    pop        ebx

     vzeroupper

ret

-#endif  // HAS_I422TOBGRAROW_AVX2

+#endif  // HAS_YUY2TOARGBROW_AVX2

-#ifdef HAS_I422TORGBAROW_AVX2

-// 16 pixels

-// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).

-// TODO(fbarchard): Use macros to reduce duplicate code.  See SSSE3.

+#ifdef HAS_UYVYTOARGBROW_AVX2

+// 16 pixels.

+// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).

 __declspec(naked)

-void I422ToRGBARow_AVX2(const uint8* y_buf,

-                        const uint8* u_buf,

-                        const uint8* v_buf,

+void UYVYToARGBRow_AVX2(const uint8* src_uyvy,

                         uint8* dst_argb,

+                        const struct YuvConstants* yuvconstants,

                         int width) {

   __asm {

-    push       esi

-    push       edi

-    mov        eax, [esp + 8 + 4]   // Y

-    mov        esi, [esp + 8 + 8]   // U

-    mov        edi, [esp + 8 + 12]  // V

-    mov        edx, [esp + 8 + 16]  // argb

-    mov        ecx, [esp + 8 + 20]  // width

-    sub        edi, esi

+    push       ebx

+    mov        eax, [esp + 4 + 4]   // uyvy

+    mov        edx, [esp + 4 + 8]   // argb

+    mov        ebx, [esp + 4 + 12]  // yuvconstants

+    mov        ecx, [esp + 4 + 16]  // width

     vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha

  convertloop:

-    READYUV422_AVX2

-    YUVTORGB_AVX2(kYuvConstants)

+    READUYVY_AVX2

+    YUVTORGB_AVX2(ebx)

+    STOREARGB_AVX2

-    // Step 3: Weave into RGBA

-    vpunpcklbw ymm1, ymm1, ymm2           // GR

-    vpermq     ymm1, ymm1, 0xd8

-    vpunpcklbw ymm2, ymm5, ymm0           // AB

-    vpermq     ymm2, ymm2, 0xd8

-    vpunpcklwd ymm0, ymm2, ymm1           // ABGR first 8 pixels

-    vpunpckhwd ymm1, ymm2, ymm1           // ABGR next 8 pixels

-    vmovdqu    [edx], ymm0

-    vmovdqu    [edx + 32], ymm1

-    lea        edx,  [edx + 64]

     sub        ecx, 16

     jg         convertloop

-    pop        edi

-    pop        esi

+    pop        ebx

     vzeroupper

ret

-#endif  // HAS_I422TORGBAROW_AVX2

+#endif  // HAS_UYVYTOARGBROW_AVX2

-#ifdef HAS_I422TOABGRROW_AVX2

+#ifdef HAS_I422TORGBAROW_AVX2

 // 16 pixels

-// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).

-// TODO(fbarchard): Use macros to reduce duplicate code.  See SSSE3.

+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).

 __declspec(naked)

-void I422ToABGRRow_AVX2(const uint8* y_buf,

+void I422ToRGBARow_AVX2(const uint8* y_buf,

                         const uint8* u_buf,

                         const uint8* v_buf,

                         uint8* dst_argb,

+                        const struct YuvConstants* yuvconstants,

                         int width) {

   __asm {

     push       esi

     push       edi

-    mov        eax, [esp + 8 + 4]   // Y

-    mov        esi, [esp + 8 + 8]   // U

-    mov        edi, [esp + 8 + 12]  // V

-    mov        edx, [esp + 8 + 16]  // argb

-    mov        ecx, [esp + 8 + 20]  // width

+    push       ebx

+    mov        eax, [esp + 12 + 4]   // Y

+    mov        esi, [esp + 12 + 8]   // U

+    mov        edi, [esp + 12 + 12]  // V

+    mov        edx, [esp + 12 + 16]  // abgr

+    mov        ebx, [esp + 12 + 20]  // yuvconstants

+    mov        ecx, [esp + 12 + 24]  // width

     sub        edi, esi

     vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha

  convertloop:

     READYUV422_AVX2

-    YUVTORGB_AVX2(kYuvConstants)

+    YUVTORGB_AVX2(ebx)

+    STORERGBA_AVX2

-    // Step 3: Weave into ABGR

-    vpunpcklbw ymm1, ymm2, ymm1           // RG

-    vpermq     ymm1, ymm1, 0xd8

-    vpunpcklbw ymm2, ymm0, ymm5           // BA

-    vpermq     ymm2, ymm2, 0xd8

-    vpunpcklwd ymm0, ymm1, ymm2           // RGBA first 8 pixels

-    vpunpckhwd ymm1, ymm1, ymm2           // RGBA next 8 pixels

-    vmovdqu    [edx], ymm0

-    vmovdqu    [edx + 32], ymm1

-    lea        edx,  [edx + 64]

     sub        ecx, 16

     jg         convertloop

+    pop        ebx

     pop        edi

     pop        esi

     vzeroupper

@@ -2346,17 +2409,21 @@

ret

-#endif  // HAS_I422TOABGRROW_AVX2

+#endif  // HAS_I422TORGBAROW_AVX2

 #if defined(HAS_I422TOARGBROW_SSSE3)

 // TODO(fbarchard): Read that does half size on Y and treats 420 as 444.

+// Allows a conversion with half size scaling.

 // Read 8 UV from 444.

 #define READYUV444 __asm {                                                     \

-    __asm movq       xmm0, qword ptr [esi] /* U */                /* NOLINT */ \

-    __asm movq       xmm1, qword ptr [esi + edi] /* V */          /* NOLINT */ \

+    __asm movq       xmm0, qword ptr [esi] /* U */                             \

+    __asm movq       xmm1, qword ptr [esi + edi] /* V */                       \

     __asm lea        esi,  [esi + 8]                                           \

     __asm punpcklbw  xmm0, xmm1           /* UV */                             \

+    __asm movq       xmm4, qword ptr [eax]                                     \

+    __asm punpcklbw  xmm4, xmm4                                                \

+    __asm lea        eax, [eax + 8]                                            \

 // Read 4 UV from 422, upsample to 8 UV.

@@ -2366,50 +2433,99 @@

     __asm lea        esi,  [esi + 4]                                           \

     __asm punpcklbw  xmm0, xmm1           /* UV */                             \

     __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \

+    __asm movq       xmm4, qword ptr [eax]                                     \

+    __asm punpcklbw  xmm4, xmm4                                                \

+    __asm lea        eax, [eax + 8]                                            \

+// Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.

+#define READYUVA422 __asm {                                                    \

+    __asm movd       xmm0, [esi]          /* U */                              \

+    __asm movd       xmm1, [esi + edi]    /* V */                              \

+    __asm lea        esi,  [esi + 4]                                           \

+    __asm punpcklbw  xmm0, xmm1           /* UV */                             \

+    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \

+    __asm movq       xmm4, qword ptr [eax]   /* Y */                           \

+    __asm punpcklbw  xmm4, xmm4                                                \

+    __asm lea        eax, [eax + 8]                                            \

+    __asm movq       xmm5, qword ptr [ebp]   /* A */                           \

+    __asm lea        ebp, [ebp + 8]                                            \

+  }

 // Read 2 UV from 411, upsample to 8 UV.

-#define READYUV411 __asm {                                                     \

-    __asm movzx      ebx, word ptr [esi]        /* U */           /* NOLINT */ \

+// drmemory fails with memory fault if pinsrw used. libyuv bug: 525

+//  __asm pinsrw     xmm0, [esi], 0        /* U */

+//  __asm pinsrw     xmm1, [esi + edi], 0  /* V */

+#define READYUV411_EBX __asm {                                                 \

+    __asm movzx      ebx, word ptr [esi]        /* U */                        \

     __asm movd       xmm0, ebx                                                 \

-    __asm movzx      ebx, word ptr [esi + edi]  /* V */           /* NOLINT */ \

+    __asm movzx      ebx, word ptr [esi + edi]  /* V */                        \

     __asm movd       xmm1, ebx                                                 \

     __asm lea        esi,  [esi + 2]                                           \

-    __asm punpcklbw  xmm0, xmm1           /* UV */                             \

-    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \

-    __asm punpckldq  xmm0, xmm0           /* UVUVUVUV (upsample) */            \

+    __asm punpcklbw  xmm0, xmm1            /* UV */                            \

+    __asm punpcklwd  xmm0, xmm0            /* UVUV (upsample) */               \

+    __asm punpckldq  xmm0, xmm0            /* UVUVUVUV (upsample) */           \

+    __asm movq       xmm4, qword ptr [eax]                                     \

+    __asm punpcklbw  xmm4, xmm4                                                \

+    __asm lea        eax, [eax + 8]                                            \

 // Read 4 UV from NV12, upsample to 8 UV.

 #define READNV12 __asm {                                                       \

-    __asm movq       xmm0, qword ptr [esi] /* UV */               /* NOLINT */ \

+    __asm movq       xmm0, qword ptr [esi] /* UV */                            \

     __asm lea        esi,  [esi + 8]                                           \

     __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \

+    __asm movq       xmm4, qword ptr [eax]                                     \

+    __asm punpcklbw  xmm4, xmm4                                                \

+    __asm lea        eax, [eax + 8]                                            \

+// Read 4 VU from NV21, upsample to 8 UV.

+#define READNV21 __asm {                                                       \

+    __asm movq       xmm0, qword ptr [esi] /* VU */                            \

+    __asm lea        esi,  [esi + 8]                                           \

+    __asm pshufb     xmm0, xmmword ptr kShuffleNV21                            \

+    __asm movq       xmm4, qword ptr [eax]                                     \

+    __asm punpcklbw  xmm4, xmm4                                                \

+    __asm lea        eax, [eax + 8]                                            \

+  }

+// Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.

+#define READYUY2 __asm {                                                       \

+    __asm movdqu     xmm4, [eax]          /* YUY2 */                           \

+    __asm pshufb     xmm4, xmmword ptr kShuffleYUY2Y                           \

+    __asm movdqu     xmm0, [eax]          /* UV */                             \

+    __asm pshufb     xmm0, xmmword ptr kShuffleYUY2UV                          \

+    __asm lea        eax, [eax + 16]                                           \

+  }

+// Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.

+#define READUYVY __asm {                                                       \

+    __asm movdqu     xmm4, [eax]          /* UYVY */                           \

+    __asm pshufb     xmm4, xmmword ptr kShuffleUYVYY                           \

+    __asm movdqu     xmm0, [eax]          /* UV */                             \

+    __asm pshufb     xmm0, xmmword ptr kShuffleUYVYUV                          \

+    __asm lea        eax, [eax + 16]                                           \

+  }

 // Convert 8 pixels: 8 UV and 8 Y.

 #define YUVTORGB(YuvConstants) __asm {                                         \

-    /* Step 1: Find 4 UV contributions to 8 R,G,B values */                    \

     __asm movdqa     xmm1, xmm0                                                \

     __asm movdqa     xmm2, xmm0                                                \

     __asm movdqa     xmm3, xmm0                                                \

-    __asm movdqa     xmm0, YuvConstants.kUVBiasB /* unbias back to signed */   \

-    __asm pmaddubsw  xmm1, YuvConstants.kUVToB   /* scale B UV */              \

+    __asm movdqa     xmm0, xmmword ptr [YuvConstants + KUVBIASB]               \

+    __asm pmaddubsw  xmm1, xmmword ptr [YuvConstants + KUVTOB]                 \

     __asm psubw      xmm0, xmm1                                                \

-    __asm movdqa     xmm1, YuvConstants.kUVBiasG                               \

-    __asm pmaddubsw  xmm2, YuvConstants.kUVToG   /* scale G UV */              \

+    __asm movdqa     xmm1, xmmword ptr [YuvConstants + KUVBIASG]               \

+    __asm pmaddubsw  xmm2, xmmword ptr [YuvConstants + KUVTOG]                 \

     __asm psubw      xmm1, xmm2                                                \

-    __asm movdqa     xmm2, YuvConstants.kUVBiasR                               \

-    __asm pmaddubsw  xmm3, YuvConstants.kUVToR   /* scale R UV */              \

+    __asm movdqa     xmm2, xmmword ptr [YuvConstants + KUVBIASR]               \

+    __asm pmaddubsw  xmm3, xmmword ptr [YuvConstants + KUVTOR]                 \

     __asm psubw      xmm2, xmm3                                                \

-    /* Step 2: Find Y contribution to 8 R,G,B values */                        \

-    __asm movq       xmm3, qword ptr [eax]                        /* NOLINT */ \

-    __asm lea        eax, [eax + 8]                                            \

-    __asm punpcklbw  xmm3, xmm3                                                \

-    __asm pmulhuw    xmm3, YuvConstants.kYToRgb                                \

-    __asm paddsw     xmm0, xmm3           /* B += Y */                         \

-    __asm paddsw     xmm1, xmm3           /* G += Y */                         \

-    __asm paddsw     xmm2, xmm3           /* R += Y */                         \

+    __asm pmulhuw    xmm4, xmmword ptr [YuvConstants + KYTORGB]                \

+    __asm paddsw     xmm0, xmm4           /* B += Y */                         \

+    __asm paddsw     xmm1, xmm4           /* G += Y */                         \

+    __asm paddsw     xmm2, xmm4           /* R += Y */                         \

     __asm psraw      xmm0, 6                                                   \

     __asm psraw      xmm1, 6                                                   \

     __asm psraw      xmm2, 6                                                   \

@@ -2420,7 +2536,6 @@

 // Store 8 ARGB values.

 #define STOREARGB __asm {                                                      \

-    /* Step 3: Weave into ARGB */                                              \

     __asm punpcklbw  xmm0, xmm1           /* BG */                             \

     __asm punpcklbw  xmm2, xmm5           /* RA */                             \

     __asm movdqa     xmm1, xmm0                                                \

@@ -2433,7 +2548,6 @@

 // Store 8 BGRA values.

 #define STOREBGRA __asm {                                                      \

-    /* Step 3: Weave into BGRA */                                              \

     __asm pcmpeqb    xmm5, xmm5           /* generate 0xffffffff for alpha */  \

     __asm punpcklbw  xmm1, xmm0           /* GB */                             \

     __asm punpcklbw  xmm5, xmm2           /* AR */                             \

@@ -2445,22 +2559,8 @@

     __asm lea        edx,  [edx + 32]                                          \

-// Store 8 ABGR values.

-#define STOREABGR __asm {                                                      \

-    /* Step 3: Weave into ABGR */                                              \

-    __asm punpcklbw  xmm2, xmm1           /* RG */                             \

-    __asm punpcklbw  xmm0, xmm5           /* BA */                             \

-    __asm movdqa     xmm1, xmm2                                                \

-    __asm punpcklwd  xmm2, xmm0           /* RGBA first 4 pixels */            \

-    __asm punpckhwd  xmm1, xmm0           /* RGBA next 4 pixels */             \

-    __asm movdqu     0[edx], xmm2                                              \

-    __asm movdqu     16[edx], xmm1                                             \

-    __asm lea        edx,  [edx + 32]                                          \

-  }

 // Store 8 RGBA values.

 #define STORERGBA __asm {                                                      \

-    /* Step 3: Weave into RGBA */                                              \

     __asm pcmpeqb    xmm5, xmm5           /* generate 0xffffffff for alpha */  \

     __asm punpcklbw  xmm1, xmm2           /* GR */                             \

     __asm punpcklbw  xmm5, xmm0           /* AB */                             \

@@ -2474,13 +2574,13 @@

 // Store 8 RGB24 values.

 #define STORERGB24 __asm {                                                     \

-    /* Step 3: Weave into RRGB */                                              \

+    /* Weave into RRGB */                                                      \

     __asm punpcklbw  xmm0, xmm1           /* BG */                             \

     __asm punpcklbw  xmm2, xmm2           /* RR */                             \

     __asm movdqa     xmm1, xmm0                                                \

     __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \

     __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \

-    /* Step 4: RRGB -> RGB24 */                                                \

+    /* RRGB -> RGB24 */                                                        \

     __asm pshufb     xmm0, xmm5           /* Pack first 8 and last 4 bytes. */ \

     __asm pshufb     xmm1, xmm6           /* Pack first 12 bytes. */           \

     __asm palignr    xmm1, xmm0, 12       /* last 4 bytes of xmm0 + 12 xmm1 */ \

@@ -2489,32 +2589,15 @@

     __asm lea        edx,  [edx + 24]                                          \

-// Store 8 RAW values.

-#define STORERAW __asm {                                                       \

-    /* Step 3: Weave into RRGB */                                              \

-    __asm punpcklbw  xmm0, xmm1           /* BG */                             \

-    __asm punpcklbw  xmm2, xmm2           /* RR */                             \

-    __asm movdqa     xmm1, xmm0                                                \

-    __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \

-    __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \

-    /* Step 4: RRGB -> RAW */                                                  \

-    __asm pshufb     xmm0, xmm5           /* Pack first 8 and last 4 bytes. */ \

-    __asm pshufb     xmm1, xmm6           /* Pack first 12 bytes. */           \

-    __asm palignr    xmm1, xmm0, 12       /* last 4 bytes of xmm0 + 12 xmm1 */ \

-    __asm movq       qword ptr 0[edx], xmm0  /* First 8 bytes */               \

-    __asm movdqu     8[edx], xmm1         /* Last 16 bytes */                  \

-    __asm lea        edx,  [edx + 24]                                          \

-  }

 // Store 8 RGB565 values.

 #define STORERGB565 __asm {                                                    \

-    /* Step 3: Weave into RRGB */                                              \

+    /* Weave into RRGB */                                                      \

     __asm punpcklbw  xmm0, xmm1           /* BG */                             \

     __asm punpcklbw  xmm2, xmm2           /* RR */                             \

     __asm movdqa     xmm1, xmm0                                                \

     __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \

     __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \

-    /* Step 4: RRGB -> RGB565 */                                               \

+    /* RRGB -> RGB565 */                                                       \

     __asm movdqa     xmm3, xmm0    /* B  first 4 pixels of argb */             \

     __asm movdqa     xmm2, xmm0    /* G */                                     \

     __asm pslld      xmm0, 8       /* R */                                     \

@@ -2549,26 +2632,30 @@

                          const uint8* u_buf,

                          const uint8* v_buf,

                          uint8* dst_argb,

+                         const struct YuvConstants* yuvconstants,

                          int width) {

   __asm {

     push       esi

     push       edi

-    mov        eax, [esp + 8 + 4]   // Y

-    mov        esi, [esp + 8 + 8]   // U

-    mov        edi, [esp + 8 + 12]  // V

-    mov        edx, [esp + 8 + 16]  // argb

-    mov        ecx, [esp + 8 + 20]  // width

+    push       ebx

+    mov        eax, [esp + 12 + 4]   // Y

+    mov        esi, [esp + 12 + 8]   // U

+    mov        edi, [esp + 12 + 12]  // V

+    mov        edx, [esp + 12 + 16]  // argb

+    mov        ebx, [esp + 12 + 20]  // yuvconstants

+    mov        ecx, [esp + 12 + 24]  // width

     sub        edi, esi

-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha

+    pcmpeqb    xmm5, xmm5            // generate 0xffffffff for alpha

  convertloop:

     READYUV444

-    YUVTORGB(kYuvConstants)

+    YUVTORGB(ebx)

     STOREARGB

     sub        ecx, 8

     jg         convertloop

+    pop        ebx

     pop        edi

     pop        esi

ret

@@ -2582,27 +2669,31 @@

                           const uint8* u_buf,

                           const uint8* v_buf,

                           uint8* dst_rgb24,

+                          const struct YuvConstants* yuvconstants,

                           int width) {

   __asm {

     push       esi

     push       edi

-    mov        eax, [esp + 8 + 4]   // Y

-    mov        esi, [esp + 8 + 8]   // U

-    mov        edi, [esp + 8 + 12]  // V

-    mov        edx, [esp + 8 + 16]  // rgb24

-    mov        ecx, [esp + 8 + 20]  // width

+    push       ebx

+    mov        eax, [esp + 12 + 4]   // Y

+    mov        esi, [esp + 12 + 8]   // U

+    mov        edi, [esp + 12 + 12]  // V

+    mov        edx, [esp + 12 + 16]  // rgb24

+    mov        ebx, [esp + 12 + 20]  // yuvconstants

+    mov        ecx, [esp + 12 + 24]  // width

     sub        edi, esi

-    movdqa     xmm5, kShuffleMaskARGBToRGB24_0

-    movdqa     xmm6, kShuffleMaskARGBToRGB24

+    movdqa     xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0

+    movdqa     xmm6, xmmword ptr kShuffleMaskARGBToRGB24

  convertloop:

     READYUV422

-    YUVTORGB(kYuvConstants)

+    YUVTORGB(ebx)

     STORERGB24

     sub        ecx, 8

     jg         convertloop

+    pop        ebx

     pop        edi

     pop        esi

ret

@@ -2609,40 +2700,6 @@

-// 8 pixels.

-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes).

-__declspec(naked)

-void I422ToRAWRow_SSSE3(const uint8* y_buf,

-                        const uint8* u_buf,

-                        const uint8* v_buf,

-                        uint8* dst_raw,

-                        int width) {

-  __asm {

-    push       esi

-    push       edi

-    mov        eax, [esp + 8 + 4]   // Y

-    mov        esi, [esp + 8 + 8]   // U

-    mov        edi, [esp + 8 + 12]  // V

-    mov        edx, [esp + 8 + 16]  // raw

-    mov        ecx, [esp + 8 + 20]  // width

-    sub        edi, esi

-    movdqa     xmm5, kShuffleMaskARGBToRAW_0

-    movdqa     xmm6, kShuffleMaskARGBToRAW

- convertloop:

-    READYUV422

-    YUVTORGB(kYuvConstants)

-    STORERAW

-    sub        ecx, 8

-    jg         convertloop

-    pop        edi

-    pop        esi

-    ret

-  }

-}

 // 8 pixels

 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).

 __declspec(naked)

@@ -2650,15 +2707,18 @@

                            const uint8* u_buf,

                            const uint8* v_buf,

                            uint8* rgb565_buf,

+                           const struct YuvConstants* yuvconstants,

                            int width) {

   __asm {

     push       esi

     push       edi

-    mov        eax, [esp + 8 + 4]   // Y

-    mov        esi, [esp + 8 + 8]   // U

-    mov        edi, [esp + 8 + 12]  // V

-    mov        edx, [esp + 8 + 16]  // rgb565

-    mov        ecx, [esp + 8 + 20]  // width

+    push       ebx

+    mov        eax, [esp + 12 + 4]   // Y

+    mov        esi, [esp + 12 + 8]   // U

+    mov        edi, [esp + 12 + 12]  // V

+    mov        edx, [esp + 12 + 16]  // rgb565

+    mov        ebx, [esp + 12 + 20]  // yuvconstants

+    mov        ecx, [esp + 12 + 24]  // width

     sub        edi, esi

     pcmpeqb    xmm5, xmm5       // generate mask 0x0000001f

     psrld      xmm5, 27

@@ -2670,12 +2730,13 @@

  convertloop:

     READYUV422

-    YUVTORGB(kYuvConstants)

+    YUVTORGB(ebx)

     STORERGB565

     sub        ecx, 8

     jg         convertloop

+    pop        ebx

     pop        edi

     pop        esi

ret

@@ -2689,26 +2750,30 @@

                          const uint8* u_buf,

                          const uint8* v_buf,

                          uint8* dst_argb,

+                         const struct YuvConstants* yuvconstants,

                          int width) {

   __asm {

     push       esi

     push       edi

-    mov        eax, [esp + 8 + 4]   // Y

-    mov        esi, [esp + 8 + 8]   // U

-    mov        edi, [esp + 8 + 12]  // V

-    mov        edx, [esp + 8 + 16]  // argb

-    mov        ecx, [esp + 8 + 20]  // width

+    push       ebx

+    mov        eax, [esp + 12 + 4]   // Y

+    mov        esi, [esp + 12 + 8]   // U

+    mov        edi, [esp + 12 + 12]  // V

+    mov        edx, [esp + 12 + 16]  // argb

+    mov        ebx, [esp + 12 + 20]  // yuvconstants

+    mov        ecx, [esp + 12 + 24]  // width

     sub        edi, esi

     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha

  convertloop:

     READYUV422

-    YUVTORGB(kYuvConstants)

+    YUVTORGB(ebx)

     STOREARGB

     sub        ecx, 8

     jg         convertloop

+    pop        ebx

     pop        edi

     pop        esi

ret

@@ -2716,33 +2781,39 @@

 // 8 pixels.

-// JPeg color space version of I422ToARGB

-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).

+// 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB.

 __declspec(naked)

-void J422ToARGBRow_SSSE3(const uint8* y_buf,

-                         const uint8* u_buf,

-                         const uint8* v_buf,

-                         uint8* dst_argb,

-                         int width) {

+void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,

+                              const uint8* u_buf,

+                              const uint8* v_buf,

+                              const uint8* a_buf,

+                              uint8* dst_argb,

+                              const struct YuvConstants* yuvconstants,

+                              int width) {

   __asm {

     push       esi

     push       edi

-    mov        eax, [esp + 8 + 4]   // Y

-    mov        esi, [esp + 8 + 8]   // U

-    mov        edi, [esp + 8 + 12]  // V

-    mov        edx, [esp + 8 + 16]  // argb

-    mov        ecx, [esp + 8 + 20]  // width

+    push       ebx

+    push       ebp

+    mov        eax, [esp + 16 + 4]   // Y

+    mov        esi, [esp + 16 + 8]   // U

+    mov        edi, [esp + 16 + 12]  // V

+    mov        ebp, [esp + 16 + 16]  // A

+    mov        edx, [esp + 16 + 20]  // argb

+    mov        ebx, [esp + 16 + 24]  // yuvconstants

+    mov        ecx, [esp + 16 + 28]  // width

     sub        edi, esi

-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha

  convertloop:

-    READYUV422

-    YUVTORGB(kYuvJConstants)

+    READYUVA422

+    YUVTORGB(ebx)

     STOREARGB

     sub        ecx, 8

     jg         convertloop

+    pop        ebp

+    pop        ebx

     pop        edi

     pop        esi

ret

@@ -2757,30 +2828,34 @@

                          const uint8* u_buf,

                          const uint8* v_buf,

                          uint8* dst_argb,

+                         const struct YuvConstants* yuvconstants,

                          int width) {

   __asm {

-    push       ebx

     push       esi

     push       edi

-    mov        eax, [esp + 12 + 4]   // Y

-    mov        esi, [esp + 12 + 8]   // U

-    mov        edi, [esp + 12 + 12]  // V

-    mov        edx, [esp + 12 + 16]  // argb

-    mov        ecx, [esp + 12 + 20]  // width

+    push       ebx

+    push       ebp

+    mov        eax, [esp + 16 + 4]   // Y

+    mov        esi, [esp + 16 + 8]   // U

+    mov        edi, [esp + 16 + 12]  // V

+    mov        edx, [esp + 16 + 16]  // argb

+    mov        ebp, [esp + 16 + 20]  // yuvconstants

+    mov        ecx, [esp + 16 + 24]  // width

     sub        edi, esi

     pcmpeqb    xmm5, xmm5            // generate 0xffffffff for alpha

  convertloop:

-    READYUV411  // modifies EBX

-    YUVTORGB(kYuvConstants)

+    READYUV411_EBX

+    YUVTORGB(ebp)

     STOREARGB

     sub        ecx, 8

     jg         convertloop

+    pop        ebp

+    pop        ebx

     pop        edi

     pop        esi

-    pop        ebx

ret

@@ -2791,23 +2866,27 @@

 void NV12ToARGBRow_SSSE3(const uint8* y_buf,

                          const uint8* uv_buf,

                          uint8* dst_argb,

+                         const struct YuvConstants* yuvconstants,

                          int width) {

   __asm {

     push       esi

-    mov        eax, [esp + 4 + 4]   // Y

-    mov        esi, [esp + 4 + 8]   // UV

-    mov        edx, [esp + 4 + 12]  // argb

-    mov        ecx, [esp + 4 + 16]  // width

+    push       ebx

+    mov        eax, [esp + 8 + 4]   // Y

+    mov        esi, [esp + 8 + 8]   // UV

+    mov        edx, [esp + 8 + 12]  // argb

+    mov        ebx, [esp + 8 + 16]  // yuvconstants

+    mov        ecx, [esp + 8 + 20]  // width

     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha

  convertloop:

     READNV12

-    YUVTORGB(kYuvConstants)

+    YUVTORGB(ebx)

     STOREARGB

     sub        ecx, 8

     jg         convertloop

+    pop        ebx

     pop        esi

ret

@@ -2814,90 +2893,89 @@

 // 8 pixels.

-// 4 VU values upsampled to 8 VU, mixed with 8 Y producing 8 ARGB (32 bytes).

+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).

 __declspec(naked)

 void NV21ToARGBRow_SSSE3(const uint8* y_buf,

-                         const uint8* uv_buf,

+                         const uint8* vu_buf,

                          uint8* dst_argb,

+                         const struct YuvConstants* yuvconstants,

                          int width) {

   __asm {

     push       esi

-    mov        eax, [esp + 4 + 4]   // Y

-    mov        esi, [esp + 4 + 8]   // UV

-    mov        edx, [esp + 4 + 12]  // argb

-    mov        ecx, [esp + 4 + 16]  // width

+    push       ebx

+    mov        eax, [esp + 8 + 4]   // Y

+    mov        esi, [esp + 8 + 8]   // VU

+    mov        edx, [esp + 8 + 12]  // argb

+    mov        ebx, [esp + 8 + 16]  // yuvconstants

+    mov        ecx, [esp + 8 + 20]  // width

     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha

  convertloop:

-    READNV12

-    YUVTORGB(kYvuConstants)

+    READNV21

+    YUVTORGB(ebx)

     STOREARGB

     sub        ecx, 8

     jg         convertloop

+    pop        ebx

     pop        esi

ret

+// 8 pixels.

+// 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes).

 __declspec(naked)

-void I422ToBGRARow_SSSE3(const uint8* y_buf,

-                         const uint8* u_buf,

-                         const uint8* v_buf,

-                         uint8* dst_bgra,

+void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,

+                         uint8* dst_argb,

+                         const struct YuvConstants* yuvconstants,

                          int width) {

   __asm {

-    push       esi

-    push       edi

-    mov        eax, [esp + 8 + 4]   // Y

-    mov        esi, [esp + 8 + 8]   // U

-    mov        edi, [esp + 8 + 12]  // V

-    mov        edx, [esp + 8 + 16]  // bgra

-    mov        ecx, [esp + 8 + 20]  // width

-    sub        edi, esi

+    push       ebx

+    mov        eax, [esp + 4 + 4]   // yuy2

+    mov        edx, [esp + 4 + 8]   // argb

+    mov        ebx, [esp + 4 + 12]  // yuvconstants

+    mov        ecx, [esp + 4 + 16]  // width

+    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha

  convertloop:

-    READYUV422

-    YUVTORGB(kYuvConstants)

-    STOREBGRA

+    READYUY2

+    YUVTORGB(ebx)

+    STOREARGB

     sub        ecx, 8

     jg         convertloop

-    pop        edi

-    pop        esi

+    pop        ebx

ret

+// 8 pixels.

+// 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes).

 __declspec(naked)

-void I422ToABGRRow_SSSE3(const uint8* y_buf,

-                         const uint8* u_buf,

-                         const uint8* v_buf,

-                         uint8* dst_abgr,

+void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,

+                         uint8* dst_argb,

+                         const struct YuvConstants* yuvconstants,

                          int width) {

   __asm {

-    push       esi

-    push       edi

-    mov        eax, [esp + 8 + 4]   // Y

-    mov        esi, [esp + 8 + 8]   // U

-    mov        edi, [esp + 8 + 12]  // V

-    mov        edx, [esp + 8 + 16]  // abgr

-    mov        ecx, [esp + 8 + 20]  // width

-    sub        edi, esi

+    push       ebx

+    mov        eax, [esp + 4 + 4]   // uyvy

+    mov        edx, [esp + 4 + 8]   // argb

+    mov        ebx, [esp + 4 + 12]  // yuvconstants

+    mov        ecx, [esp + 4 + 16]  // width

     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha

  convertloop:

-    READYUV422

-    YUVTORGB(kYuvConstants)

-    STOREABGR

+    READUYVY

+    YUVTORGB(ebx)

+    STOREARGB

     sub        ecx, 8

     jg         convertloop

-    pop        edi

-    pop        esi

+    pop        ebx

ret

@@ -2907,31 +2985,34 @@

                          const uint8* u_buf,

                          const uint8* v_buf,

                          uint8* dst_rgba,

+                         const struct YuvConstants* yuvconstants,

                          int width) {

   __asm {

     push       esi

     push       edi

-    mov        eax, [esp + 8 + 4]   // Y

-    mov        esi, [esp + 8 + 8]   // U

-    mov        edi, [esp + 8 + 12]  // V

-    mov        edx, [esp + 8 + 16]  // rgba

-    mov        ecx, [esp + 8 + 20]  // width

+    push       ebx

+    mov        eax, [esp + 12 + 4]   // Y

+    mov        esi, [esp + 12 + 8]   // U

+    mov        edi, [esp + 12 + 12]  // V

+    mov        edx, [esp + 12 + 16]  // rgba

+    mov        ebx, [esp + 12 + 20]  // yuvconstants

+    mov        ecx, [esp + 12 + 24]  // width

     sub        edi, esi

  convertloop:

     READYUV422

-    YUVTORGB(kYuvConstants)

+    YUVTORGB(ebx)

     STORERGBA

     sub        ecx, 8

     jg         convertloop

+    pop        ebx

     pop        edi

     pop        esi

ret

 #endif  // HAS_I422TOARGBROW_SSSE3

 #ifdef HAS_I400TOARGBROW_SSE2

@@ -3045,7 +3126,7 @@

     mov       eax, [esp + 4]   // src

     mov       edx, [esp + 8]   // dst

     mov       ecx, [esp + 12]  // width

-    movdqa    xmm5, kShuffleMirror

+    movdqa    xmm5, xmmword ptr kShuffleMirror

  convertloop:

     movdqu    xmm0, [eax - 16 + ecx]

@@ -3066,7 +3147,7 @@

     mov       eax, [esp + 4]   // src

     mov       edx, [esp + 8]   // dst

     mov       ecx, [esp + 12]  // width

-    vbroadcastf128 ymm5, kShuffleMirror

+    vbroadcastf128 ymm5, xmmword ptr kShuffleMirror

  convertloop:

     vmovdqu   ymm0, [eax - 32 + ecx]

@@ -3082,33 +3163,7 @@

 #endif  // HAS_MIRRORROW_AVX2

-#ifdef HAS_MIRRORROW_SSE2

-__declspec(naked)

-void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {

-  __asm {

-    mov       eax, [esp + 4]   // src

-    mov       edx, [esp + 8]   // dst

-    mov       ecx, [esp + 12]  // width

- convertloop:

-    movdqu    xmm0, [eax - 16 + ecx]

-    movdqa    xmm1, xmm0        // swap bytes

-    psllw     xmm0, 8

-    psrlw     xmm1, 8

-    por       xmm0, xmm1

-    pshuflw   xmm0, xmm0, 0x1b  // swap words

-    pshufhw   xmm0, xmm0, 0x1b

-    pshufd    xmm0, xmm0, 0x4e  // swap qwords

-    movdqu    [edx], xmm0

-    lea       edx, [edx + 16]

-    sub       ecx, 16

-    jg        convertloop

-    ret

-  }

-}

-#endif  // HAS_MIRRORROW_SSE2

-#ifdef HAS_MIRRORROW_UV_SSSE3

+#ifdef HAS_MIRRORUVROW_SSSE3

 // Shuffle table for reversing the bytes of UV channels.

 static const uvec8 kShuffleMirrorUV = {

   14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u

@@ -3123,7 +3178,7 @@

     mov       edx, [esp + 4 + 8]   // dst_u

     mov       edi, [esp + 4 + 12]  // dst_v

     mov       ecx, [esp + 4 + 16]  // width

-    movdqa    xmm1, kShuffleMirrorUV

+    movdqa    xmm1, xmmword ptr kShuffleMirrorUV

     lea       eax, [eax + ecx * 2 - 16]

     sub       edi, edx

@@ -3141,7 +3196,7 @@

ret

-#endif  // HAS_MIRRORROW_UV_SSSE3

+#endif  // HAS_MIRRORUVROW_SSSE3

 #ifdef HAS_ARGBMIRRORROW_SSE2

 __declspec(naked)

@@ -3177,7 +3232,7 @@

     mov       eax, [esp + 4]   // src

     mov       edx, [esp + 8]   // dst

     mov       ecx, [esp + 12]  // width

-    vmovdqu   ymm5, kARGBShuffleMirror_AVX2

+    vmovdqu   ymm5, ymmword ptr kARGBShuffleMirror_AVX2

  convertloop:

     vpermd    ymm0, ymm5, [eax - 32 + ecx * 4]  // permute dword order

@@ -3193,13 +3248,14 @@

 #ifdef HAS_SPLITUVROW_SSE2

 __declspec(naked)

-void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {

+void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,

+                     int width) {

   __asm {

     push       edi

     mov        eax, [esp + 4 + 4]    // src_uv

     mov        edx, [esp + 4 + 8]    // dst_u

     mov        edi, [esp + 4 + 12]   // dst_v

-    mov        ecx, [esp + 4 + 16]   // pix

+    mov        ecx, [esp + 4 + 16]   // width

     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff

     psrlw      xmm5, 8

     sub        edi, edx

@@ -3231,13 +3287,14 @@

 #ifdef HAS_SPLITUVROW_AVX2

 __declspec(naked)

-void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {

+void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,

+                     int width) {

   __asm {

     push       edi

     mov        eax, [esp + 4 + 4]    // src_uv

     mov        edx, [esp + 4 + 8]    // dst_u

     mov        edi, [esp + 4 + 12]   // dst_v

-    mov        ecx, [esp + 4 + 16]   // pix

+    mov        ecx, [esp + 4 + 16]   // width

     vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff

     vpsrlw     ymm5, ymm5, 8

     sub        edi, edx

@@ -3339,8 +3396,23 @@

     mov        eax, [esp + 4]   // src

     mov        edx, [esp + 8]   // dst

     mov        ecx, [esp + 12]  // count

+    test       eax, 15

+    jne        convertloopu

+    test       edx, 15

+    jne        convertloopu

-  convertloop:

+  convertloopa:

+    movdqa     xmm0, [eax]

+    movdqa     xmm1, [eax + 16]

+    lea        eax, [eax + 32]

+    movdqa     [edx], xmm0

+    movdqa     [edx + 16], xmm1

+    lea        edx, [edx + 32]

+    sub        ecx, 32

+    jg         convertloopa

+    ret

+  convertloopu:

     movdqu     xmm0, [eax]

     movdqu     xmm1, [eax + 16]

     lea        eax, [eax + 32]

@@ -3348,7 +3420,7 @@

     movdqu     [edx + 16], xmm1

     lea        edx, [edx + 32]

     sub        ecx, 32

-    jg         convertloop

+    jg         convertloopu

ret

@@ -3460,6 +3532,33 @@

 #endif  // HAS_ARGBCOPYALPHAROW_AVX2

+#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2

+// Extracts the alpha (top) byte of each ARGB pixel to dst_a. width in pixels.

+__declspec(naked)

+void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {

+  __asm {

+    mov        eax, [esp + 4]   // src_argb

+    mov        edx, [esp + 8]   // dst_a

+    mov        ecx, [esp + 12]  // width

+  extractloop:

+    movdqu     xmm0, [eax]  // first 16 source bytes (4 pixels), unaligned load

+    movdqu     xmm1, [eax + 16]  // next 16 source bytes (4 pixels)

+    lea        eax, [eax + 32]  // advance source by 8 pixels

+    psrld      xmm0, 24  // keep only the top byte of each 32-bit pixel

+    psrld      xmm1, 24

+    packssdw   xmm0, xmm1  // 8 dwords -> 8 words; values <= 255 never saturate

+    packuswb   xmm0, xmm0  // words -> bytes; low 8 bytes hold the 8 alphas

+    movq       qword ptr [edx], xmm0  // store 8 alpha bytes

+    lea        edx, [edx + 8]

+    sub        ecx, 8  // 8 pixels consumed per iteration

+    jg         extractloop  // tested after the body: a full group of 8 always runs

+    ret

+  }

+}

+#endif  // HAS_ARGBEXTRACTALPHAROW_SSE2

 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2

 // width in pixels

 __declspec(naked)

@@ -3579,12 +3678,11 @@

 #ifdef HAS_YUY2TOYROW_AVX2

 __declspec(naked)

-void YUY2ToYRow_AVX2(const uint8* src_yuy2,

-                     uint8* dst_y, int pix) {

+void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {

   __asm {

     mov        eax, [esp + 4]    // src_yuy2

     mov        edx, [esp + 8]    // dst_y

-    mov        ecx, [esp + 12]   // pix

+    mov        ecx, [esp + 12]   // width

     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff

     vpsrlw     ymm5, ymm5, 8

@@ -3607,7 +3705,7 @@

 __declspec(naked)

 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,

-                      uint8* dst_u, uint8* dst_v, int pix) {

+                      uint8* dst_u, uint8* dst_v, int width) {

   __asm {

     push       esi

     push       edi

@@ -3615,7 +3713,7 @@

     mov        esi, [esp + 8 + 8]    // stride_yuy2

     mov        edx, [esp + 8 + 12]   // dst_u

     mov        edi, [esp + 8 + 16]   // dst_v

-    mov        ecx, [esp + 8 + 20]   // pix

+    mov        ecx, [esp + 8 + 20]   // width

     vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff

     vpsrlw     ymm5, ymm5, 8

     sub        edi, edx

@@ -3651,13 +3749,13 @@

 __declspec(naked)

 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,

-                         uint8* dst_u, uint8* dst_v, int pix) {

+                         uint8* dst_u, uint8* dst_v, int width) {

   __asm {

     push       edi

     mov        eax, [esp + 4 + 4]    // src_yuy2

     mov        edx, [esp + 4 + 8]    // dst_u

     mov        edi, [esp + 4 + 12]   // dst_v

-    mov        ecx, [esp + 4 + 16]   // pix

+    mov        ecx, [esp + 4 + 16]   // width

     vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff

     vpsrlw     ymm5, ymm5, 8

     sub        edi, edx

@@ -3690,11 +3788,11 @@

 __declspec(naked)

 void UYVYToYRow_AVX2(const uint8* src_uyvy,

-                     uint8* dst_y, int pix) {

+                     uint8* dst_y, int width) {

   __asm {

     mov        eax, [esp + 4]    // src_uyvy

     mov        edx, [esp + 8]    // dst_y

-    mov        ecx, [esp + 12]   // pix

+    mov        ecx, [esp + 12]   // width

   convertloop:

     vmovdqu    ymm0, [eax]

@@ -3715,7 +3813,7 @@

 __declspec(naked)

 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,

-                      uint8* dst_u, uint8* dst_v, int pix) {

+                      uint8* dst_u, uint8* dst_v, int width) {

   __asm {

     push       esi

     push       edi

@@ -3723,7 +3821,7 @@

     mov        esi, [esp + 8 + 8]    // stride_yuy2

     mov        edx, [esp + 8 + 12]   // dst_u

     mov        edi, [esp + 8 + 16]   // dst_v

-    mov        ecx, [esp + 8 + 20]   // pix

+    mov        ecx, [esp + 8 + 20]   // width

     vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff

     vpsrlw     ymm5, ymm5, 8

     sub        edi, edx

@@ -3759,13 +3857,13 @@

 __declspec(naked)

 void UYVYToUV422Row_AVX2(const uint8* src_uyvy,

-                         uint8* dst_u, uint8* dst_v, int pix) {

+                         uint8* dst_u, uint8* dst_v, int width) {

   __asm {

     push       edi

     mov        eax, [esp + 4 + 4]    // src_yuy2

     mov        edx, [esp + 4 + 8]    // dst_u

     mov        edi, [esp + 4 + 12]   // dst_v

-    mov        ecx, [esp + 4 + 16]   // pix

+    mov        ecx, [esp + 4 + 16]   // width

     vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff

     vpsrlw     ymm5, ymm5, 8

     sub        edi, edx

@@ -3800,11 +3898,11 @@

 #ifdef HAS_YUY2TOYROW_SSE2

 __declspec(naked)

 void YUY2ToYRow_SSE2(const uint8* src_yuy2,

-                     uint8* dst_y, int pix) {

+                     uint8* dst_y, int width) {

   __asm {

     mov        eax, [esp + 4]    // src_yuy2

     mov        edx, [esp + 8]    // dst_y

-    mov        ecx, [esp + 12]   // pix

+    mov        ecx, [esp + 12]   // width

     pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff

     psrlw      xmm5, 8

@@ -3825,7 +3923,7 @@

 __declspec(naked)

 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,

-                      uint8* dst_u, uint8* dst_v, int pix) {

+                      uint8* dst_u, uint8* dst_v, int width) {

   __asm {

     push       esi

     push       edi

@@ -3833,7 +3931,7 @@

     mov        esi, [esp + 8 + 8]    // stride_yuy2

     mov        edx, [esp + 8 + 12]   // dst_u

     mov        edi, [esp + 8 + 16]   // dst_v

-    mov        ecx, [esp + 8 + 20]   // pix

+    mov        ecx, [esp + 8 + 20]   // width

     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff

     psrlw      xmm5, 8

     sub        edi, edx

@@ -3868,13 +3966,13 @@

 __declspec(naked)

 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,

-                         uint8* dst_u, uint8* dst_v, int pix) {

+                         uint8* dst_u, uint8* dst_v, int width) {

   __asm {

     push       edi

     mov        eax, [esp + 4 + 4]    // src_yuy2

     mov        edx, [esp + 4 + 8]    // dst_u

     mov        edi, [esp + 4 + 12]   // dst_v

-    mov        ecx, [esp + 4 + 16]   // pix

+    mov        ecx, [esp + 4 + 16]   // width

     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff

     psrlw      xmm5, 8

     sub        edi, edx

@@ -3904,11 +4002,11 @@

 __declspec(naked)

 void UYVYToYRow_SSE2(const uint8* src_uyvy,

-                     uint8* dst_y, int pix) {

+                     uint8* dst_y, int width) {

   __asm {

     mov        eax, [esp + 4]    // src_uyvy

     mov        edx, [esp + 8]    // dst_y

-    mov        ecx, [esp + 12]   // pix

+    mov        ecx, [esp + 12]   // width

   convertloop:

     movdqu     xmm0, [eax]

@@ -3927,7 +4025,7 @@

 __declspec(naked)

 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,

-                      uint8* dst_u, uint8* dst_v, int pix) {

+                      uint8* dst_u, uint8* dst_v, int width) {

   __asm {

     push       esi

     push       edi

@@ -3935,7 +4033,7 @@

     mov        esi, [esp + 8 + 8]    // stride_yuy2

     mov        edx, [esp + 8 + 12]   // dst_u

     mov        edi, [esp + 8 + 16]   // dst_v

-    mov        ecx, [esp + 8 + 20]   // pix

+    mov        ecx, [esp + 8 + 20]   // width

     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff

     psrlw      xmm5, 8

     sub        edi, edx

@@ -3970,13 +4068,13 @@

 __declspec(naked)

 void UYVYToUV422Row_SSE2(const uint8* src_uyvy,

-                         uint8* dst_u, uint8* dst_v, int pix) {

+                         uint8* dst_u, uint8* dst_v, int width) {

   __asm {

     push       edi

     mov        eax, [esp + 4 + 4]    // src_yuy2

     mov        edx, [esp + 4 + 8]    // dst_u

     mov        edi, [esp + 4 + 12]   // dst_v

-    mov        ecx, [esp + 4 + 16]   // pix

+    mov        ecx, [esp + 4 + 16]   // width

     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff

     psrlw      xmm5, 8

     sub        edi, edx

@@ -4005,93 +4103,123 @@

 #endif  // HAS_YUY2TOYROW_SSE2

-#ifdef HAS_ARGBBLENDROW_SSE2

+#ifdef HAS_BLENDPLANEROW_SSSE3

 // Blend 8 pixels at a time.

+// unsigned version of math

+// =((A2*C2)+(B2*(255-C2))+255)/256

+// signed version of math

+// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256

 __declspec(naked)

-void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,

-                       uint8* dst_argb, int width) {

+void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,

+                         const uint8* alpha, uint8* dst, int width) {

   __asm {

     push       esi

-    mov        eax, [esp + 4 + 4]   // src_argb0

-    mov        esi, [esp + 4 + 8]   // src_argb1

-    mov        edx, [esp + 4 + 12]  // dst_argb

-    mov        ecx, [esp + 4 + 16]  // width

-    pcmpeqb    xmm7, xmm7       // generate constant 1

-    psrlw      xmm7, 15

-    pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff

-    psrlw      xmm6, 8

+    push       edi

     pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00

     psllw      xmm5, 8

-    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000

-    pslld      xmm4, 24

-    sub        ecx, 4

-    jl         convertloop4b    // less than 4 pixels?

+    mov        eax, 0x80808080  // 128 for biasing image to signed.

+    movd       xmm6, eax

+    pshufd     xmm6, xmm6, 0x00

-    // 4 pixel loop.

-  convertloop4:

-    movdqu     xmm3, [eax]      // src argb

-    lea        eax, [eax + 16]

-    movdqa     xmm0, xmm3       // src argb

-    pxor       xmm3, xmm4       // ~alpha

-    movdqu     xmm2, [esi]      // _r_b

-    psrlw      xmm3, 8          // alpha

-    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words

-    pshuflw    xmm3, xmm3, 0F5h

-    pand       xmm2, xmm6       // _r_b

-    paddw      xmm3, xmm7       // 256 - alpha

-    pmullw     xmm2, xmm3       // _r_b * alpha

-    movdqu     xmm1, [esi]      // _a_g

-    lea        esi, [esi + 16]

-    psrlw      xmm1, 8          // _a_g

-    por        xmm0, xmm4       // set alpha to 255

-    pmullw     xmm1, xmm3       // _a_g * alpha

-    psrlw      xmm2, 8          // _r_b convert to 8 bits again

-    paddusb    xmm0, xmm2       // + src argb

-    pand       xmm1, xmm5       // a_g_ convert to 8 bits again

-    paddusb    xmm0, xmm1       // + src argb

-    movdqu     [edx], xmm0

-    lea        edx, [edx + 16]

-    sub        ecx, 4

-    jge        convertloop4

+    mov        eax, 0x807f807f  // 32768 + 127 for unbias and round.

+    movd       xmm7, eax

+    pshufd     xmm7, xmm7, 0x00

+    mov        eax, [esp + 8 + 4]   // src0

+    mov        edx, [esp + 8 + 8]   // src1

+    mov        esi, [esp + 8 + 12]  // alpha

+    mov        edi, [esp + 8 + 16]  // dst

+    mov        ecx, [esp + 8 + 20]  // width

+    sub        eax, esi

+    sub        edx, esi

+    sub        edi, esi

-  convertloop4b:

-    add        ecx, 4 - 1

-    jl         convertloop1b

+    // 8 pixel loop.

+  convertloop8:

+    movq       xmm0, qword ptr [esi]        // alpha

+    punpcklbw  xmm0, xmm0

+    pxor       xmm0, xmm5         // a, 255-a

+    movq       xmm1, qword ptr [eax + esi]  // src0

+    movq       xmm2, qword ptr [edx + esi]  // src1

+    punpcklbw  xmm1, xmm2

+    psubb      xmm1, xmm6         // bias src0/1 - 128

+    pmaddubsw  xmm0, xmm1

+    paddw      xmm0, xmm7         // unbias result - 32768 and round.

+    psrlw      xmm0, 8

+    packuswb   xmm0, xmm0

+    movq       qword ptr [edi + esi], xmm0

+    lea        esi, [esi + 8]

+    sub        ecx, 8

+    jg         convertloop8

-    // 1 pixel loop.

-  convertloop1:

-    movd       xmm3, [eax]      // src argb

-    lea        eax, [eax + 4]

-    movdqa     xmm0, xmm3       // src argb

-    pxor       xmm3, xmm4       // ~alpha

-    movd       xmm2, [esi]      // _r_b

-    psrlw      xmm3, 8          // alpha

-    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words

-    pshuflw    xmm3, xmm3, 0F5h

-    pand       xmm2, xmm6       // _r_b

-    paddw      xmm3, xmm7       // 256 - alpha

-    pmullw     xmm2, xmm3       // _r_b * alpha

-    movd       xmm1, [esi]      // _a_g

-    lea        esi, [esi + 4]

-    psrlw      xmm1, 8          // _a_g

-    por        xmm0, xmm4       // set alpha to 255

-    pmullw     xmm1, xmm3       // _a_g * alpha

-    psrlw      xmm2, 8          // _r_b convert to 8 bits again

-    paddusb    xmm0, xmm2       // + src argb

-    pand       xmm1, xmm5       // a_g_ convert to 8 bits again

-    paddusb    xmm0, xmm1       // + src argb

-    movd       [edx], xmm0

-    lea        edx, [edx + 4]

-    sub        ecx, 1

-    jge        convertloop1

-  convertloop1b:

+    pop        edi

     pop        esi

ret

-#endif  // HAS_ARGBBLENDROW_SSE2

+#endif  // HAS_BLENDPLANEROW_SSSE3

+#ifdef HAS_BLENDPLANEROW_AVX2

+// Blend src0 and src1 planes by a per-pixel alpha plane, 32 pixels at a time.

+// unsigned version of math (A2 = src0, B2 = src1, C2 = alpha)

+// =((A2*C2)+(B2*(255-C2))+255)/256

+// signed version of math (as implemented below with vpmaddubsw)

+// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256

+__declspec(naked)

+void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,

+                         const uint8* alpha, uint8* dst, int width) {

+  __asm {

+    push        esi                    // saved here, restored before ret

+    push        edi                    // two pushes: args start at esp + 8 + 4

+    vpcmpeqb    ymm5, ymm5, ymm5       // generate mask 0xff00ff00

+    vpsllw      ymm5, ymm5, 8

+    mov         eax, 0x80808080  // 128 for biasing image to signed.

+    vmovd       xmm6, eax

+    vbroadcastss ymm6, xmm6      // replicate the 4 bias bytes to all of ymm6

+    mov         eax, 0x807f807f  // 32768 + 127 for unbias and round.

+    vmovd       xmm7, eax

+    vbroadcastss ymm7, xmm7      // replicate the rounding words to all of ymm7

+    mov         eax, [esp + 8 + 4]   // src0

+    mov         edx, [esp + 8 + 8]   // src1

+    mov         esi, [esp + 8 + 12]  // alpha

+    mov         edi, [esp + 8 + 16]  // dst

+    mov         ecx, [esp + 8 + 20]  // width

+    sub         eax, esi             // src0 as offset from alpha

+    sub         edx, esi             // src1 as offset from alpha

+    sub         edi, esi             // dst as offset from alpha

+    // 32 pixel loop.  esi (alpha) is the only pointer advanced.

+  convertloop32:

+    vmovdqu     ymm0, [esi]        // alpha

+    vpunpckhbw  ymm3, ymm0, ymm0   // 8..15, 24..31

+    vpunpcklbw  ymm0, ymm0, ymm0   // 0..7, 16..23

+    vpxor       ymm3, ymm3, ymm5   // a, 255-a

+    vpxor       ymm0, ymm0, ymm5   // a, 255-a

+    vmovdqu     ymm1, [eax + esi]  // src0

+    vmovdqu     ymm2, [edx + esi]  // src1

+    vpunpckhbw  ymm4, ymm1, ymm2   // interleave src0, src1 (8..15, 24..31)

+    vpunpcklbw  ymm1, ymm1, ymm2   // interleave src0, src1 (0..7, 16..23)

+    vpsubb      ymm4, ymm4, ymm6   // bias src0/1 - 128

+    vpsubb      ymm1, ymm1, ymm6   // bias src0/1 - 128

+    vpmaddubsw  ymm3, ymm3, ymm4   // a*(src0-128) + (255-a)*(src1-128)

+    vpmaddubsw  ymm0, ymm0, ymm1   // a*(src0-128) + (255-a)*(src1-128)

+    vpaddw      ymm3, ymm3, ymm7   // unbias result - 32768 and round.

+    vpaddw      ymm0, ymm0, ymm7   // unbias result - 32768 and round.

+    vpsrlw      ymm3, ymm3, 8      // divide by 256

+    vpsrlw      ymm0, ymm0, 8      // divide by 256

+    vpackuswb   ymm0, ymm0, ymm3   // words to bytes; per-lane pack undoes unpack

+    vmovdqu     [edi + esi], ymm0  // store 32 blended pixels to dst

+    lea         esi, [esi + 32]    // advance alpha (and thus src0/src1/dst)

+    sub         ecx, 32            // 32 pixels consumed

+    jg          convertloop32      // NOTE(review): last pass is a full 32 bytes

+    pop         edi

+    pop         esi

+    vzeroupper                     // zero upper halves of ymm regs before return

+    ret

+  }

+}

+#endif  // HAS_BLENDPLANEROW_AVX2

 #ifdef HAS_ARGBBLENDROW_SSSE3

 // Shuffle table for isolating alpha.

 static const uvec8 kShuffleAlpha = {

@@ -4098,14 +4226,8 @@

   3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,

   11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80

};

-// Same as SSE2, but replaces:

-//    psrlw      xmm3, 8          // alpha

-//    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words

-//    pshuflw    xmm3, xmm3, 0F5h

-// with..

-//    pshufb     xmm3, kShuffleAlpha // alpha

-// Blend 8 pixels at a time.

+// Blend 8 pixels at a time.

 __declspec(naked)

 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,

                         uint8* dst_argb, int width) {

@@ -4133,7 +4255,7 @@

     movdqa     xmm0, xmm3       // src argb

     pxor       xmm3, xmm4       // ~alpha

     movdqu     xmm2, [esi]      // _r_b

-    pshufb     xmm3, kShuffleAlpha // alpha

+    pshufb     xmm3, xmmword ptr kShuffleAlpha // alpha

     pand       xmm2, xmm6       // _r_b

     paddw      xmm3, xmm7       // 256 - alpha

     pmullw     xmm2, xmm3       // _r_b * alpha

@@ -4162,7 +4284,7 @@

     movdqa     xmm0, xmm3       // src argb

     pxor       xmm3, xmm4       // ~alpha

     movd       xmm2, [esi]      // _r_b

-    pshufb     xmm3, kShuffleAlpha // alpha

+    pshufb     xmm3, xmmword ptr kShuffleAlpha // alpha

     pand       xmm2, xmm6       // _r_b

     paddw      xmm3, xmm7       // 256 - alpha

     pmullw     xmm2, xmm3       // _r_b * alpha

@@ -4187,48 +4309,6 @@

 #endif  // HAS_ARGBBLENDROW_SSSE3

-#ifdef HAS_ARGBATTENUATEROW_SSE2

-// Attenuate 4 pixels at a time.

-__declspec(naked)

-void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {

-  __asm {

-    mov        eax, [esp + 4]   // src_argb0

-    mov        edx, [esp + 8]   // dst_argb

-    mov        ecx, [esp + 12]  // width

-    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000

-    pslld      xmm4, 24

-    pcmpeqb    xmm5, xmm5       // generate mask 0x00ffffff

-    psrld      xmm5, 8

- convertloop:

-    movdqu     xmm0, [eax]      // read 4 pixels

-    punpcklbw  xmm0, xmm0       // first 2

-    pshufhw    xmm2, xmm0, 0FFh // 8 alpha words

-    pshuflw    xmm2, xmm2, 0FFh

-    pmulhuw    xmm0, xmm2       // rgb * a

-    movdqu     xmm1, [eax]      // read 4 pixels

-    punpckhbw  xmm1, xmm1       // next 2 pixels

-    pshufhw    xmm2, xmm1, 0FFh // 8 alpha words

-    pshuflw    xmm2, xmm2, 0FFh

-    pmulhuw    xmm1, xmm2       // rgb * a

-    movdqu     xmm2, [eax]      // alphas

-    lea        eax, [eax + 16]

-    psrlw      xmm0, 8

-    pand       xmm2, xmm4

-    psrlw      xmm1, 8

-    packuswb   xmm0, xmm1

-    pand       xmm0, xmm5       // keep original alphas

-    por        xmm0, xmm2

-    movdqu     [edx], xmm0

-    lea        edx, [edx + 16]

-    sub        ecx, 4

-    jg         convertloop

-    ret

-  }

-}

-#endif  // HAS_ARGBATTENUATEROW_SSE2

 #ifdef HAS_ARGBATTENUATEROW_SSSE3

 // Shuffle table duplicating alpha.

 static const uvec8 kShuffleAlpha0 = {

@@ -4246,8 +4326,8 @@

     mov        ecx, [esp + 12]  // width

     pcmpeqb    xmm3, xmm3       // generate mask 0xff000000

     pslld      xmm3, 24

-    movdqa     xmm4, kShuffleAlpha0

-    movdqa     xmm5, kShuffleAlpha1

+    movdqa     xmm4, xmmword ptr kShuffleAlpha0

+    movdqa     xmm5, xmmword ptr kShuffleAlpha1

  convertloop:

     movdqu     xmm0, [eax]      // read 4 pixels

@@ -4289,7 +4369,7 @@

     mov        edx, [esp + 8]   // dst_argb

     mov        ecx, [esp + 12]  // width

     sub        edx, eax

-    vbroadcastf128 ymm4,kShuffleAlpha_AVX2

+    vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2

     vpcmpeqb   ymm5, ymm5, ymm5 // generate mask 0xff000000

     vpslld     ymm5, ymm5, 24

@@ -4323,11 +4403,13 @@

 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,

                              int width) {

   __asm {

+    push       ebx

     push       esi

     push       edi

-    mov        eax, [esp + 8 + 4]   // src_argb0

-    mov        edx, [esp + 8 + 8]   // dst_argb

-    mov        ecx, [esp + 8 + 12]  // width

+    mov        eax, [esp + 12 + 4]   // src_argb

+    mov        edx, [esp + 12 + 8]   // dst_argb

+    mov        ecx, [esp + 12 + 12]  // width

+    lea        ebx, fixed_invtbl8

  convertloop:

     movdqu     xmm0, [eax]      // read 4 pixels

@@ -4334,8 +4416,8 @@

     movzx      esi, byte ptr [eax + 3]  // first alpha

     movzx      edi, byte ptr [eax + 7]  // second alpha

     punpcklbw  xmm0, xmm0       // first 2

-    movd       xmm2, dword ptr fixed_invtbl8[esi * 4]

-    movd       xmm3, dword ptr fixed_invtbl8[edi * 4]

+    movd       xmm2, dword ptr [ebx + esi * 4]

+    movd       xmm3, dword ptr [ebx + edi * 4]

     pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words.  1, a, a, a

     pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words

     movlhps    xmm2, xmm3

@@ -4345,21 +4427,22 @@

     movzx      esi, byte ptr [eax + 11]  // third alpha

     movzx      edi, byte ptr [eax + 15]  // forth alpha

     punpckhbw  xmm1, xmm1       // next 2

-    movd       xmm2, dword ptr fixed_invtbl8[esi * 4]

-    movd       xmm3, dword ptr fixed_invtbl8[edi * 4]

+    movd       xmm2, dword ptr [ebx + esi * 4]

+    movd       xmm3, dword ptr [ebx + edi * 4]

     pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words

     pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words

     movlhps    xmm2, xmm3

     pmulhuw    xmm1, xmm2       // rgb * a

     lea        eax, [eax + 16]

     packuswb   xmm0, xmm1

     movdqu     [edx], xmm0

     lea        edx, [edx + 16]

     sub        ecx, 4

     jg         convertloop

     pop        edi

     pop        esi

+    pop        ebx

ret

@@ -4381,7 +4464,7 @@

     mov        edx, [esp + 8]   // dst_argb

     mov        ecx, [esp + 12]  // width

     sub        edx, eax

-    vbroadcastf128 ymm4, kUnattenShuffleAlpha_AVX2

+    vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2

  convertloop:

     vmovdqu    ymm6, [eax]       // read 8 pixels.

@@ -4412,36 +4495,37 @@

                              int width) {

   __asm {

-    mov        eax, [esp + 4]   // src_argb0

-    mov        edx, [esp + 8]   // dst_argb

-    mov        ecx, [esp + 12]  // width

-    sub        edx, eax

-    vbroadcastf128 ymm5, kUnattenShuffleAlpha_AVX2

+    push       ebx

     push       esi

     push       edi

+    mov        eax, [esp + 12 + 4]   // src_argb

+    mov        edx, [esp + 12 + 8]   // dst_argb

+    mov        ecx, [esp + 12 + 12]  // width

+    sub        edx, eax

+    lea        ebx, fixed_invtbl8

+    vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2

  convertloop:

     // replace VPGATHER

     movzx      esi, byte ptr [eax + 3]                 // alpha0

     movzx      edi, byte ptr [eax + 7]                 // alpha1

-    vmovd      xmm0, dword ptr fixed_invtbl8[esi * 4]  // [1,a0]

-    vmovd      xmm1, dword ptr fixed_invtbl8[edi * 4]  // [1,a1]

+    vmovd      xmm0, dword ptr [ebx + esi * 4]  // [1,a0]

+    vmovd      xmm1, dword ptr [ebx + edi * 4]  // [1,a1]

     movzx      esi, byte ptr [eax + 11]                // alpha2

     movzx      edi, byte ptr [eax + 15]                // alpha3

     vpunpckldq xmm6, xmm0, xmm1                        // [1,a1,1,a0]

-    vmovd      xmm2, dword ptr fixed_invtbl8[esi * 4]  // [1,a2]

-    vmovd      xmm3, dword ptr fixed_invtbl8[edi * 4]  // [1,a3]

-    movzx      esi, byte ptr [eax + 19]                // alpha4

+    vmovd      xmm2, dword ptr [ebx + esi * 4]  // [1,a2]

+    vmovd      xmm3, dword ptr [ebx + edi * 4]  // [1,a3]

+    movzx      esi, byte ptr [eax + 19]                // alpha4

     movzx      edi, byte ptr [eax + 23]                // alpha5

     vpunpckldq xmm7, xmm2, xmm3                        // [1,a3,1,a2]

-    vmovd      xmm0, dword ptr fixed_invtbl8[esi * 4]  // [1,a4]

-    vmovd      xmm1, dword ptr fixed_invtbl8[edi * 4]  // [1,a5]

+    vmovd      xmm0, dword ptr [ebx + esi * 4]  // [1,a4]

+    vmovd      xmm1, dword ptr [ebx + edi * 4]  // [1,a5]

     movzx      esi, byte ptr [eax + 27]                // alpha6

     movzx      edi, byte ptr [eax + 31]                // alpha7

     vpunpckldq xmm0, xmm0, xmm1                        // [1,a5,1,a4]

-    vmovd      xmm2, dword ptr fixed_invtbl8[esi * 4]  // [1,a6]

-    vmovd      xmm3, dword ptr fixed_invtbl8[edi * 4]  // [1,a7]

+    vmovd      xmm2, dword ptr [ebx + esi * 4]  // [1,a6]

+    vmovd      xmm3, dword ptr [ebx + edi * 4]  // [1,a7]

     vpunpckldq xmm2, xmm2, xmm3                        // [1,a7,1,a6]

     vpunpcklqdq xmm3, xmm6, xmm7                       // [1,a3,1,a2,1,a1,1,a0]

     vpunpcklqdq xmm0, xmm0, xmm2                       // [1,a7,1,a6,1,a5,1,a4]

@@ -4465,6 +4549,7 @@

     pop        edi

     pop        esi

+    pop        ebx

     vzeroupper

ret

@@ -4480,8 +4565,8 @@

     mov        eax, [esp + 4]   /* src_argb */

     mov        edx, [esp + 8]   /* dst_argb */

     mov        ecx, [esp + 12]  /* width */

-    movdqa     xmm4, kARGBToYJ

-    movdqa     xmm5, kAddYJ64

+    movdqa     xmm4, xmmword ptr kARGBToYJ

+    movdqa     xmm5, xmmword ptr kAddYJ64

  convertloop:

     movdqu     xmm0, [eax]  // G

@@ -4538,9 +4623,9 @@

   __asm {

     mov        eax, [esp + 4]   /* dst_argb */

     mov        ecx, [esp + 8]   /* width */

-    movdqa     xmm2, kARGBToSepiaB

-    movdqa     xmm3, kARGBToSepiaG

-    movdqa     xmm4, kARGBToSepiaR

+    movdqa     xmm2, xmmword ptr kARGBToSepiaB

+    movdqa     xmm3, xmmword ptr kARGBToSepiaG

+    movdqa     xmm4, xmmword ptr kARGBToSepiaR

  convertloop:

     movdqu     xmm0, [eax]  // B

@@ -5190,6 +5275,7 @@

 // dst points to pixel to store result to.

 // count is number of averaged pixels to produce.

 // Does 4 pixels at a time.

+// This function requires alignment on accumulation buffer pointers.

 void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,

                                     int width, int area, uint8* dst,

                                     int count) {

@@ -5517,36 +5603,38 @@

     mov        edx, [esp + 8 + 12]  // src_stride

     mov        ecx, [esp + 8 + 16]  // dst_width

     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)

-    shr        eax, 1

     // Dispatch to specialized filters if applicable.

     cmp        eax, 0

-    je         xloop100  // 0 / 128.  Blend 100 / 0.

+    je         xloop100  // 0 / 256.  Blend 100 / 0.

     sub        edi, esi

-    cmp        eax, 32

-    je         xloop75   // 32 / 128 is 0.25.  Blend 75 / 25.

-    cmp        eax, 64

-    je         xloop50   // 64 / 128 is 0.50.  Blend 50 / 50.

-    cmp        eax, 96

-    je         xloop25   // 96 / 128 is 0.75.  Blend 25 / 75.

+    cmp        eax, 128

+    je         xloop50   // 128 /256 is 0.50.  Blend 50 / 50.

-    vmovd      xmm0, eax  // high fraction 0..127

+    vmovd      xmm0, eax  // high fraction 0..255

     neg        eax

-    add        eax, 128

-    vmovd      xmm5, eax  // low fraction 128..1

+    add        eax, 256

+    vmovd      xmm5, eax  // low fraction 256..1

     vpunpcklbw xmm5, xmm5, xmm0

     vpunpcklwd xmm5, xmm5, xmm5

-    vpxor      ymm0, ymm0, ymm0

-    vpermd     ymm5, ymm0, ymm5

+    vbroadcastss ymm5, xmm5

+    mov        eax, 0x80808080  // 128b for bias and rounding.

+    vmovd      xmm4, eax

+    vbroadcastss ymm4, xmm4

   xloop:

     vmovdqu    ymm0, [esi]

     vmovdqu    ymm2, [esi + edx]

     vpunpckhbw ymm1, ymm0, ymm2  // mutates

-    vpunpcklbw ymm0, ymm0, ymm2  // mutates

-    vpmaddubsw ymm0, ymm0, ymm5

-    vpmaddubsw ymm1, ymm1, ymm5

-    vpsrlw     ymm0, ymm0, 7

-    vpsrlw     ymm1, ymm1, 7

+    vpunpcklbw ymm0, ymm0, ymm2

+    vpsubb     ymm1, ymm1, ymm4  // bias to signed image

+    vpsubb     ymm0, ymm0, ymm4

+    vpmaddubsw ymm1, ymm5, ymm1

+    vpmaddubsw ymm0, ymm5, ymm0

+    vpaddw     ymm1, ymm1, ymm4  // unbias and round

+    vpaddw     ymm0, ymm0, ymm4

+    vpsrlw     ymm1, ymm1, 8

+    vpsrlw     ymm0, ymm0, 8

     vpackuswb  ymm0, ymm0, ymm1  // unmutates

     vmovdqu    [esi + edi], ymm0

     lea        esi, [esi + 32]

@@ -5554,18 +5642,6 @@

     jg         xloop

     jmp        xloop99

-   // Blend 25 / 75.

- xloop25:

-   vmovdqu    ymm0, [esi]

-   vmovdqu    ymm1, [esi + edx]

-   vpavgb     ymm0, ymm0, ymm1

-   vpavgb     ymm0, ymm0, ymm1

-   vmovdqu    [esi + edi], ymm0

-   lea        esi, [esi + 32]

-   sub        ecx, 32

-   jg         xloop25

-   jmp        xloop99

    // Blend 50 / 50.

  xloop50:

    vmovdqu    ymm0, [esi]

@@ -5576,18 +5652,6 @@

    jg         xloop50

    jmp        xloop99

-   // Blend 75 / 25.

- xloop75:

-   vmovdqu    ymm1, [esi]

-   vmovdqu    ymm0, [esi + edx]

-   vpavgb     ymm0, ymm0, ymm1

-   vpavgb     ymm0, ymm0, ymm1

-   vmovdqu    [esi + edi], ymm0

-   lea        esi, [esi + 32]

-   sub        ecx, 32

-   jg         xloop75

-   jmp        xloop99

    // Blend 100 / 0 - Copy row unchanged.

  xloop100:

    rep movsb

@@ -5602,6 +5666,7 @@

 #endif  // HAS_INTERPOLATEROW_AVX2

 // Bilinear filter 16x2 -> 16x1

+// TODO(fbarchard): Consider allowing 256 using memcpy.

 __declspec(naked)

 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,

                           ptrdiff_t src_stride, int dst_width,

@@ -5609,6 +5674,7 @@

   __asm {

     push       esi

     push       edi

     mov        edi, [esp + 8 + 4]   // dst_ptr

     mov        esi, [esp + 8 + 8]   // src_ptr

     mov        edx, [esp + 8 + 12]  // src_stride

@@ -5615,24 +5681,22 @@

     mov        ecx, [esp + 8 + 16]  // dst_width

     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)

     sub        edi, esi

-    shr        eax, 1

     // Dispatch to specialized filters if applicable.

     cmp        eax, 0

-    je         xloop100  // 0 / 128.  Blend 100 / 0.

-    cmp        eax, 32

-    je         xloop75   // 32 / 128 is 0.25.  Blend 75 / 25.

-    cmp        eax, 64

-    je         xloop50   // 64 / 128 is 0.50.  Blend 50 / 50.

-    cmp        eax, 96

-    je         xloop25   // 96 / 128 is 0.75.  Blend 25 / 75.

+    je         xloop100  // 0 /256.  Blend 100 / 0.

+    cmp        eax, 128

+    je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.

-    movd       xmm0, eax  // high fraction 0..127

+    movd       xmm0, eax  // high fraction 0..255

     neg        eax

-    add        eax, 128

-    movd       xmm5, eax  // low fraction 128..1

+    add        eax, 256

+    movd       xmm5, eax  // low fraction 255..1

     punpcklbw  xmm5, xmm0

     punpcklwd  xmm5, xmm5

     pshufd     xmm5, xmm5, 0

+    mov        eax, 0x80808080  // 128 for biasing image to signed.

+    movd       xmm4, eax

+    pshufd     xmm4, xmm4, 0x00

   xloop:

     movdqu     xmm0, [esi]

@@ -5640,29 +5704,23 @@

     movdqu     xmm1, xmm0

     punpcklbw  xmm0, xmm2

     punpckhbw  xmm1, xmm2

-    pmaddubsw  xmm0, xmm5

-    pmaddubsw  xmm1, xmm5

-    psrlw      xmm0, 7

-    psrlw      xmm1, 7

-    packuswb   xmm0, xmm1

-    movdqu     [esi + edi], xmm0

+    psubb      xmm0, xmm4  // bias image by -128

+    psubb      xmm1, xmm4

+    movdqa     xmm2, xmm5

+    movdqa     xmm3, xmm5

+    pmaddubsw  xmm2, xmm0

+    pmaddubsw  xmm3, xmm1

+    paddw      xmm2, xmm4

+    paddw      xmm3, xmm4

+    psrlw      xmm2, 8

+    psrlw      xmm3, 8

+    packuswb   xmm2, xmm3

+    movdqu     [esi + edi], xmm2

     lea        esi, [esi + 16]

     sub        ecx, 16

     jg         xloop

     jmp        xloop99

-    // Blend 25 / 75.

-  xloop25:

-    movdqu     xmm0, [esi]

-    movdqu     xmm1, [esi + edx]

-    pavgb      xmm0, xmm1

-    pavgb      xmm0, xmm1

-    movdqu     [esi + edi], xmm0

-    lea        esi, [esi + 16]

-    sub        ecx, 16

-    jg         xloop25

-    jmp        xloop99

     // Blend 50 / 50.

   xloop50:

     movdqu     xmm0, [esi]

@@ -5674,18 +5732,6 @@

     jg         xloop50

     jmp        xloop99

-    // Blend 75 / 25.

-  xloop75:

-    movdqu     xmm1, [esi]

-    movdqu     xmm0, [esi + edx]

-    pavgb      xmm0, xmm1

-    pavgb      xmm0, xmm1

-    movdqu     [esi + edi], xmm0

-    lea        esi, [esi + 16]

-    sub        ecx, 16

-    jg         xloop75

-    jmp        xloop99

     // Blend 100 / 0 - Copy row unchanged.

   xloop100:

     movdqu     xmm0, [esi]

@@ -5701,124 +5747,16 @@

-#ifdef HAS_INTERPOLATEROW_SSE2

-// Bilinear filter 16x2 -> 16x1

-__declspec(naked)

-void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,

-                         ptrdiff_t src_stride, int dst_width,

-                         int source_y_fraction) {

-  __asm {

-    push       esi

-    push       edi

-    mov        edi, [esp + 8 + 4]   // dst_ptr

-    mov        esi, [esp + 8 + 8]   // src_ptr

-    mov        edx, [esp + 8 + 12]  // src_stride

-    mov        ecx, [esp + 8 + 16]  // dst_width

-    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)

-    sub        edi, esi

-    // Dispatch to specialized filters if applicable.

-    cmp        eax, 0

-    je         xloop100  // 0 / 256.  Blend 100 / 0.

-    cmp        eax, 64

-    je         xloop75   // 64 / 256 is 0.25.  Blend 75 / 25.

-    cmp        eax, 128

-    je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.

-    cmp        eax, 192

-    je         xloop25   // 192 / 256 is 0.75.  Blend 25 / 75.

-    movd       xmm5, eax            // xmm5 = y fraction

-    punpcklbw  xmm5, xmm5

-    psrlw      xmm5, 1

-    punpcklwd  xmm5, xmm5

-    punpckldq  xmm5, xmm5

-    punpcklqdq xmm5, xmm5

-    pxor       xmm4, xmm4

-  xloop:

-    movdqu     xmm0, [esi]  // row0

-    movdqu     xmm2, [esi + edx]  // row1

-    movdqu     xmm1, xmm0

-    movdqu     xmm3, xmm2

-    punpcklbw  xmm2, xmm4

-    punpckhbw  xmm3, xmm4

-    punpcklbw  xmm0, xmm4

-    punpckhbw  xmm1, xmm4

-    psubw      xmm2, xmm0  // row1 - row0

-    psubw      xmm3, xmm1

-    paddw      xmm2, xmm2  // 9 bits * 15 bits = 8.16

-    paddw      xmm3, xmm3

-    pmulhw     xmm2, xmm5  // scale diff

-    pmulhw     xmm3, xmm5

-    paddw      xmm0, xmm2  // sum rows

-    paddw      xmm1, xmm3

-    packuswb   xmm0, xmm1

-    movdqu     [esi + edi], xmm0

-    lea        esi, [esi + 16]

-    sub        ecx, 16

-    jg         xloop

-    jmp        xloop99

-    // Blend 25 / 75.

-  xloop25:

-    movdqu     xmm0, [esi]

-    movdqu     xmm1, [esi + edx]

-    pavgb      xmm0, xmm1

-    pavgb      xmm0, xmm1

-    movdqu     [esi + edi], xmm0

-    lea        esi, [esi + 16]

-    sub        ecx, 16

-    jg         xloop25

-    jmp        xloop99

-    // Blend 50 / 50.

-  xloop50:

-    movdqu     xmm0, [esi]

-    movdqu     xmm1, [esi + edx]

-    pavgb      xmm0, xmm1

-    movdqu     [esi + edi], xmm0

-    lea        esi, [esi + 16]

-    sub        ecx, 16

-    jg         xloop50

-    jmp        xloop99

-    // Blend 75 / 25.

-  xloop75:

-    movdqu     xmm1, [esi]

-    movdqu     xmm0, [esi + edx]

-    pavgb      xmm0, xmm1

-    pavgb      xmm0, xmm1

-    movdqu     [esi + edi], xmm0

-    lea        esi, [esi + 16]

-    sub        ecx, 16

-    jg         xloop75

-    jmp        xloop99

-    // Blend 100 / 0 - Copy row unchanged.

-  xloop100:

-    movdqu     xmm0, [esi]

-    movdqu     [esi + edi], xmm0

-    lea        esi, [esi + 16]

-    sub        ecx, 16

-    jg         xloop100

-  xloop99:

-    pop        edi

-    pop        esi

-    ret

-  }

-}

-#endif  // HAS_INTERPOLATEROW_SSE2

 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.

 __declspec(naked)

 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,

-                          const uint8* shuffler, int pix) {

+                          const uint8* shuffler, int width) {

   __asm {

     mov        eax, [esp + 4]    // src_argb

     mov        edx, [esp + 8]    // dst_argb

     mov        ecx, [esp + 12]   // shuffler

     movdqu     xmm5, [ecx]

-    mov        ecx, [esp + 16]   // pix

+    mov        ecx, [esp + 16]   // width

   wloop:

     movdqu     xmm0, [eax]

@@ -5838,13 +5776,13 @@

 #ifdef HAS_ARGBSHUFFLEROW_AVX2

 __declspec(naked)

 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,

-                         const uint8* shuffler, int pix) {

+                         const uint8* shuffler, int width) {

   __asm {

     mov        eax, [esp + 4]     // src_argb

     mov        edx, [esp + 8]     // dst_argb

     mov        ecx, [esp + 12]    // shuffler

     vbroadcastf128 ymm5, [ecx]    // same shuffle in high as low.

-    mov        ecx, [esp + 16]    // pix

+    mov        ecx, [esp + 16]    // width

   wloop:

     vmovdqu    ymm0, [eax]

@@ -5866,7 +5804,7 @@

 __declspec(naked)

 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,

-                         const uint8* shuffler, int pix) {

+                         const uint8* shuffler, int width) {

   __asm {

     push       ebx

     push       esi

@@ -5873,7 +5811,7 @@

     mov        eax, [esp + 8 + 4]    // src_argb

     mov        edx, [esp + 8 + 8]    // dst_argb

     mov        esi, [esp + 8 + 12]   // shuffler

-    mov        ecx, [esp + 8 + 16]   // pix

+    mov        ecx, [esp + 8 + 16]   // width

     pxor       xmm5, xmm5

     mov        ebx, [esi]   // shuffler

@@ -6245,7 +6183,7 @@

     // 4 pixel loop.

   convertloop:

-    movdqu     xmm0, qword ptr [eax]      // generate luma ptr

+    movdqu     xmm0, xmmword ptr [eax]      // generate luma ptr

     pmaddubsw  xmm0, xmm3

     phaddw     xmm0, xmm0

     pand       xmm0, xmm4  // mask out low bits

--- a/third_party/libyuv/source/row_x86.asm

+++ /dev/null

@@ -1,146 +1,0 @@

-;

-; Copyright 2012 The LibYuv Project Authors. All rights reserved.

-;

-; Use of this source code is governed by a BSD-style license

-; that can be found in the LICENSE file in the root of the source

-; tree. An additional intellectual property rights grant can be found

-; in the file PATENTS. All contributing project authors may

-; be found in the AUTHORS file in the root of the source tree.

-;

-%ifdef __YASM_VERSION_ID__

-%if __YASM_VERSION_ID__ < 01020000h

-%error AVX2 is supported only by yasm 1.2.0 or later.

-%endif

-%endif

-%include "x86inc.asm"

-SECTION .text

-; cglobal numeric constants are parameters, gpr regs, mm regs

-; void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix)

-%macro YUY2TOYROW 2-3

-cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix

-%ifidn %1,YUY2

-    pcmpeqb    m2, m2, m2        ; generate mask 0x00ff00ff

-    psrlw      m2, m2, 8

-%endif

-    ALIGN      4

-.convertloop:

-    mov%2      m0, [src_yuy2q]

-    mov%2      m1, [src_yuy2q + mmsize]

-    lea        src_yuy2q, [src_yuy2q + mmsize * 2]

-%ifidn %1,YUY2

-    pand       m0, m0, m2   ; YUY2 even bytes are Y

-    pand       m1, m1, m2

-%else

-    psrlw      m0, m0, 8    ; UYVY odd bytes are Y

-    psrlw      m1, m1, 8

-%endif

-    packuswb   m0, m0, m1

-%if cpuflag(AVX2)

-    vpermq     m0, m0, 0xd8

-%endif

-    sub        pixd, mmsize

-    mov%2      [dst_yq], m0

-    lea        dst_yq, [dst_yq + mmsize]

-    jg         .convertloop

-    REP_RET

-%endmacro

-; TODO(fbarchard): Remove MMX.  Add SSSE3 pshufb version.

-INIT_MMX MMX

-YUY2TOYROW YUY2,a,

-YUY2TOYROW YUY2,u,_Unaligned

-YUY2TOYROW UYVY,a,

-YUY2TOYROW UYVY,u,_Unaligned

-INIT_XMM SSE2

-YUY2TOYROW YUY2,a,

-YUY2TOYROW YUY2,u,_Unaligned

-YUY2TOYROW UYVY,a,

-YUY2TOYROW UYVY,u,_Unaligned

-INIT_YMM AVX2

-YUY2TOYROW YUY2,a,

-YUY2TOYROW UYVY,a,

-; void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix)

-%macro SplitUVRow 1-2

-cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix

-    pcmpeqb    m4, m4, m4        ; generate mask 0x00ff00ff

-    psrlw      m4, m4, 8

-    sub        dst_vq, dst_uq

-    ALIGN      4

-.convertloop:

-    mov%1      m0, [src_uvq]

-    mov%1      m1, [src_uvq + mmsize]

-    lea        src_uvq, [src_uvq + mmsize * 2]

-    psrlw      m2, m0, 8         ; odd bytes

-    psrlw      m3, m1, 8

-    pand       m0, m0, m4        ; even bytes

-    pand       m1, m1, m4

-    packuswb   m0, m0, m1

-    packuswb   m2, m2, m3

-%if cpuflag(AVX2)

-    vpermq     m0, m0, 0xd8

-    vpermq     m2, m2, 0xd8

-%endif

-    mov%1      [dst_uq], m0

-    mov%1      [dst_uq + dst_vq], m2

-    lea        dst_uq, [dst_uq + mmsize]

-    sub        pixd, mmsize

-    jg         .convertloop

-    REP_RET

-%endmacro

-INIT_MMX MMX

-SplitUVRow a,

-SplitUVRow u,_Unaligned

-INIT_XMM SSE2

-SplitUVRow a,

-SplitUVRow u,_Unaligned

-INIT_YMM AVX2

-SplitUVRow a,

-; void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,

-;                      int width);

-%macro MergeUVRow_ 1-2

-cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix

-    sub        src_vq, src_uq

-    ALIGN      4

-.convertloop:

-    mov%1      m0, [src_uq]

-    mov%1      m1, [src_vq]

-    lea        src_uq, [src_uq + mmsize]

-    punpcklbw  m2, m0, m1       // first 8 UV pairs

-    punpckhbw  m0, m0, m1       // next 8 UV pairs

-%if cpuflag(AVX2)

-    vperm2i128 m1, m2, m0, 0x20  // low 128 of ymm2 and low 128 of ymm0

-    vperm2i128 m2, m2, m0, 0x31  // high 128 of ymm2 and high 128 of ymm0

-    mov%1      [dst_uvq], m1

-    mov%1      [dst_uvq + mmsize], m2

-%else

-    mov%1      [dst_uvq], m2

-    mov%1      [dst_uvq + mmsize], m0

-%endif

-    lea        dst_uvq, [dst_uvq + mmsize * 2]

-    sub        pixd, mmsize

-    jg         .convertloop

-    REP_RET

-%endmacro

-INIT_MMX MMX

-MergeUVRow_ a,

-MergeUVRow_ u,_Unaligned

-INIT_XMM SSE2

-MergeUVRow_ a,

-MergeUVRow_ u,_Unaligned

-INIT_YMM AVX2

-MergeUVRow_ a,

--- a/third_party/libyuv/source/scale.cc

+++ b/third_party/libyuv/source/scale.cc

@@ -61,15 +61,15 @@

 #endif

-#if defined(HAS_SCALEROWDOWN2_SSE2)

-  if (TestCpuFlag(kCpuHasSSE2)) {

-    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSE2 :

-        (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSE2 :

-        ScaleRowDown2Box_Any_SSE2);

+#if defined(HAS_SCALEROWDOWN2_SSSE3)

+  if (TestCpuFlag(kCpuHasSSSE3)) {

+    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSSE3 :

+        (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3 :

+        ScaleRowDown2Box_Any_SSSE3);

     if (IS_ALIGNED(dst_width, 16)) {

-      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 :

-          (filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 :

-          ScaleRowDown2Box_SSE2);

+      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSSE3 :

+          (filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3 :

+          ScaleRowDown2Box_SSSE3);

 #endif

@@ -85,12 +85,12 @@

 #endif

-#if defined(HAS_SCALEROWDOWN2_MIPS_DSPR2)

-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) &&

+#if defined(HAS_SCALEROWDOWN2_DSPR2)

+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_ptr, 4) &&

       IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&

       IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {

     ScaleRowDown2 = filtering ?

-        ScaleRowDown2Box_MIPS_DSPR2 : ScaleRowDown2_MIPS_DSPR2;

+        ScaleRowDown2Box_DSPR2 : ScaleRowDown2_DSPR2;

 #endif

@@ -135,12 +135,12 @@

         ScaleRowDown2Box_16_SSE2);

 #endif

-#if defined(HAS_SCALEROWDOWN2_16_MIPS_DSPR2)

-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) &&

+#if defined(HAS_SCALEROWDOWN2_16_DSPR2)

+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_ptr, 4) &&

       IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&

       IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {

     ScaleRowDown2 = filtering ?

-        ScaleRowDown2Box_16_MIPS_DSPR2 : ScaleRowDown2_16_MIPS_DSPR2;

+        ScaleRowDown2Box_16_DSPR2 : ScaleRowDown2_16_DSPR2;

 #endif

@@ -182,12 +182,12 @@

 #endif

-#if defined(HAS_SCALEROWDOWN4_SSE2)

-  if (TestCpuFlag(kCpuHasSSE2)) {

+#if defined(HAS_SCALEROWDOWN4_SSSE3)

+  if (TestCpuFlag(kCpuHasSSSE3)) {

     ScaleRowDown4 = filtering ?

-        ScaleRowDown4Box_Any_SSE2 : ScaleRowDown4_Any_SSE2;

+        ScaleRowDown4Box_Any_SSSE3 : ScaleRowDown4_Any_SSSE3;

     if (IS_ALIGNED(dst_width, 8)) {

-      ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSE2 : ScaleRowDown4_SSE2;

+      ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSSE3 : ScaleRowDown4_SSSE3;

 #endif

@@ -200,12 +200,12 @@

 #endif

-#if defined(HAS_SCALEROWDOWN4_MIPS_DSPR2)

-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) &&

+#if defined(HAS_SCALEROWDOWN4_DSPR2)

+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(row_stride, 4) &&

       IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&

       IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {

     ScaleRowDown4 = filtering ?

-        ScaleRowDown4Box_MIPS_DSPR2 : ScaleRowDown4_MIPS_DSPR2;

+        ScaleRowDown4Box_DSPR2 : ScaleRowDown4_DSPR2;

 #endif

@@ -245,12 +245,12 @@

         ScaleRowDown4_16_SSE2;

 #endif

-#if defined(HAS_SCALEROWDOWN4_16_MIPS_DSPR2)

-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) &&

+#if defined(HAS_SCALEROWDOWN4_16_DSPR2)

+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(row_stride, 4) &&

       IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&

       IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {

     ScaleRowDown4 = filtering ?

-        ScaleRowDown4Box_16_MIPS_DSPR2 : ScaleRowDown4_16_MIPS_DSPR2;

+        ScaleRowDown4Box_16_DSPR2 : ScaleRowDown4_16_DSPR2;

 #endif

@@ -325,16 +325,16 @@

 #endif

-#if defined(HAS_SCALEROWDOWN34_MIPS_DSPR2)

-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 24 == 0) &&

+#if defined(HAS_SCALEROWDOWN34_DSPR2)

+  if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 24 == 0) &&

       IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&

       IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {

     if (!filtering) {

-      ScaleRowDown34_0 = ScaleRowDown34_MIPS_DSPR2;

-      ScaleRowDown34_1 = ScaleRowDown34_MIPS_DSPR2;

+      ScaleRowDown34_0 = ScaleRowDown34_DSPR2;

+      ScaleRowDown34_1 = ScaleRowDown34_DSPR2;

     } else {

-      ScaleRowDown34_0 = ScaleRowDown34_0_Box_MIPS_DSPR2;

-      ScaleRowDown34_1 = ScaleRowDown34_1_Box_MIPS_DSPR2;

+      ScaleRowDown34_0 = ScaleRowDown34_0_Box_DSPR2;

+      ScaleRowDown34_1 = ScaleRowDown34_1_Box_DSPR2;

 #endif

@@ -404,16 +404,16 @@

 #endif

-#if defined(HAS_SCALEROWDOWN34_16_MIPS_DSPR2)

-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 24 == 0) &&

+#if defined(HAS_SCALEROWDOWN34_16_DSPR2)

+  if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 24 == 0) &&

       IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&

       IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {

     if (!filtering) {

-      ScaleRowDown34_0 = ScaleRowDown34_16_MIPS_DSPR2;

-      ScaleRowDown34_1 = ScaleRowDown34_16_MIPS_DSPR2;

+      ScaleRowDown34_0 = ScaleRowDown34_16_DSPR2;

+      ScaleRowDown34_1 = ScaleRowDown34_16_DSPR2;

     } else {

-      ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_MIPS_DSPR2;

-      ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_MIPS_DSPR2;

+      ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_DSPR2;

+      ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_DSPR2;

 #endif

@@ -517,16 +517,16 @@

 #endif

-#if defined(HAS_SCALEROWDOWN38_MIPS_DSPR2)

-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) &&

+#if defined(HAS_SCALEROWDOWN38_DSPR2)

+  if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 12 == 0) &&

       IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&

       IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {

     if (!filtering) {

-      ScaleRowDown38_3 = ScaleRowDown38_MIPS_DSPR2;

-      ScaleRowDown38_2 = ScaleRowDown38_MIPS_DSPR2;

+      ScaleRowDown38_3 = ScaleRowDown38_DSPR2;

+      ScaleRowDown38_2 = ScaleRowDown38_DSPR2;

     } else {

-      ScaleRowDown38_3 = ScaleRowDown38_3_Box_MIPS_DSPR2;

-      ScaleRowDown38_2 = ScaleRowDown38_2_Box_MIPS_DSPR2;

+      ScaleRowDown38_3 = ScaleRowDown38_3_Box_DSPR2;

+      ScaleRowDown38_2 = ScaleRowDown38_2_Box_DSPR2;

 #endif

@@ -595,16 +595,16 @@

 #endif

-#if defined(HAS_SCALEROWDOWN38_16_MIPS_DSPR2)

-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) &&

+#if defined(HAS_SCALEROWDOWN38_16_DSPR2)

+  if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 12 == 0) &&

       IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&

       IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {

     if (!filtering) {

-      ScaleRowDown38_3 = ScaleRowDown38_16_MIPS_DSPR2;

-      ScaleRowDown38_2 = ScaleRowDown38_16_MIPS_DSPR2;

+      ScaleRowDown38_3 = ScaleRowDown38_16_DSPR2;

+      ScaleRowDown38_2 = ScaleRowDown38_16_DSPR2;

     } else {

-      ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_MIPS_DSPR2;

-      ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_MIPS_DSPR2;

+      ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_DSPR2;

+      ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_DSPR2;

 #endif

@@ -659,7 +659,6 @@

   int i;

   int scaletbl[2];

   int minboxwidth = dx >> 16;

-  int* scaleptr = scaletbl - minboxwidth;

   int boxwidth;

   scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);

   scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);

@@ -667,7 +666,8 @@

     int ix = x >> 16;

     x += dx;

     boxwidth = MIN1((x >> 16) - ix);

-    *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;

+    *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) *

+        scaletbl[boxwidth - minboxwidth] >> 16;

@@ -676,7 +676,6 @@

   int i;

   int scaletbl[2];

   int minboxwidth = dx >> 16;

-  int* scaleptr = scaletbl - minboxwidth;

   int boxwidth;

   scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);

   scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);

@@ -684,8 +683,8 @@

     int ix = x >> 16;

     x += dx;

     boxwidth = MIN1((x >> 16) - ix);

-    *dst_ptr++ =

-        SumPixels_16(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;

+    *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + ix) *

+        scaletbl[boxwidth - minboxwidth]  >> 16;

@@ -875,14 +874,6 @@

              &x, &y, &dx, &dy);

   src_width = Abs(src_width);

-#if defined(HAS_INTERPOLATEROW_SSE2)

-  if (TestCpuFlag(kCpuHasSSE2)) {

-    InterpolateRow = InterpolateRow_Any_SSE2;

-    if (IS_ALIGNED(src_width, 16)) {

-      InterpolateRow = InterpolateRow_SSE2;

-    }

-  }

-#endif

 #if defined(HAS_INTERPOLATEROW_SSSE3)

   if (TestCpuFlag(kCpuHasSSSE3)) {

     InterpolateRow = InterpolateRow_Any_SSSE3;

@@ -907,11 +898,11 @@

 #endif

-#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)

-  if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {

-    InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;

+#if defined(HAS_INTERPOLATEROW_DSPR2)

+  if (TestCpuFlag(kCpuHasDSPR2)) {

+    InterpolateRow = InterpolateRow_Any_DSPR2;

     if (IS_ALIGNED(src_width, 4)) {

-      InterpolateRow = InterpolateRow_MIPS_DSPR2;

+      InterpolateRow = InterpolateRow_DSPR2;

 #endif

@@ -1011,11 +1002,11 @@

 #endif

-#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2)

-  if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {

-    InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2;

+#if defined(HAS_INTERPOLATEROW_16_DSPR2)

+  if (TestCpuFlag(kCpuHasDSPR2)) {

+    InterpolateRow = InterpolateRow_Any_16_DSPR2;

     if (IS_ALIGNED(src_width, 4)) {

-      InterpolateRow = InterpolateRow_16_MIPS_DSPR2;

+      InterpolateRow = InterpolateRow_16_DSPR2;

 #endif

@@ -1072,14 +1063,6 @@

              &x, &y, &dx, &dy);

   src_width = Abs(src_width);

-#if defined(HAS_INTERPOLATEROW_SSE2)

-  if (TestCpuFlag(kCpuHasSSE2)) {

-    InterpolateRow = InterpolateRow_Any_SSE2;

-    if (IS_ALIGNED(dst_width, 16)) {

-      InterpolateRow = InterpolateRow_SSE2;

-    }

-  }

-#endif

 #if defined(HAS_INTERPOLATEROW_SSSE3)

   if (TestCpuFlag(kCpuHasSSSE3)) {

     InterpolateRow = InterpolateRow_Any_SSSE3;

@@ -1104,11 +1087,11 @@

 #endif

-#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)

-  if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {

-    InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;

+#if defined(HAS_INTERPOLATEROW_DSPR2)

+  if (TestCpuFlag(kCpuHasDSPR2)) {

+    InterpolateRow = InterpolateRow_Any_DSPR2;

     if (IS_ALIGNED(dst_width, 4)) {

-      InterpolateRow = InterpolateRow_MIPS_DSPR2;

+      InterpolateRow = InterpolateRow_DSPR2;

 #endif

@@ -1243,11 +1226,11 @@

 #endif

-#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2)

-  if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {

-    InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2;

+#if defined(HAS_INTERPOLATEROW_16_DSPR2)

+  if (TestCpuFlag(kCpuHasDSPR2)) {

+    InterpolateRow = InterpolateRow_Any_16_DSPR2;

     if (IS_ALIGNED(dst_width, 4)) {

-      InterpolateRow = InterpolateRow_16_MIPS_DSPR2;

+      InterpolateRow = InterpolateRow_16_DSPR2;

 #endif

--- a/third_party/libyuv/source/scale_any.cc

+++ b/third_party/libyuv/source/scale_any.cc

@@ -55,12 +55,29 @@

                      dst_ptr + n * BPP, r);                                    \

-#ifdef HAS_SCALEROWDOWN2_SSE2

-SDANY(ScaleRowDown2_Any_SSE2, ScaleRowDown2_SSE2, ScaleRowDown2_C, 2, 1, 15)

-SDANY(ScaleRowDown2Linear_Any_SSE2, ScaleRowDown2Linear_SSE2,

+// Fixed scale down for odd source width.  Used by I420Blend subsampling.

+// Since dst_width is (width + 1) / 2, the last output pixel has no full

+// pair of source columns, so it must never be handed to the SIMD kernel.

+// The SIMD kernel processes n pixels, where n is the largest multiple of

+// (MASK + 1) that fits in the first dst_width - 1 outputs.  The C kernel

+// then produces the remaining r pixels plus the final odd pixel (r + 1 in

+// total); it is expected to treat its last output as the odd one.

+#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK)   \

+    void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride,                   \

+                 uint8* dst_ptr, int dst_width) {                              \

+      int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1));               \

+      int n = (dst_width - 1) - r;                                             \

+      if (n > 0) {                                                             \

+        SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n);                    \

+      }                                                                        \

+      SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride,                 \

+                     dst_ptr + n * BPP, r + 1);                                \

+    }

+#ifdef HAS_SCALEROWDOWN2_SSSE3

+SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15)

+SDANY(ScaleRowDown2Linear_Any_SSSE3, ScaleRowDown2Linear_SSSE3,

       ScaleRowDown2Linear_C, 2, 1, 15)

-SDANY(ScaleRowDown2Box_Any_SSE2, ScaleRowDown2Box_SSE2, ScaleRowDown2Box_C,

+SDANY(ScaleRowDown2Box_Any_SSSE3, ScaleRowDown2Box_SSSE3, ScaleRowDown2Box_C,

       2, 1, 15)

+SDODD(ScaleRowDown2Box_Odd_SSSE3, ScaleRowDown2Box_SSSE3,

+      ScaleRowDown2Box_Odd_C, 2, 1, 15)

 #endif

 #ifdef HAS_SCALEROWDOWN2_AVX2

 SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)

@@ -68,6 +85,8 @@

       ScaleRowDown2Linear_C, 2, 1, 31)

 SDANY(ScaleRowDown2Box_Any_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_C,

       2, 1, 31)

+SDODD(ScaleRowDown2Box_Odd_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_Odd_C,

+      2, 1, 31)

 #endif

 #ifdef HAS_SCALEROWDOWN2_NEON

 SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15)

@@ -75,10 +94,12 @@

       ScaleRowDown2Linear_C, 2, 1, 15)

 SDANY(ScaleRowDown2Box_Any_NEON, ScaleRowDown2Box_NEON,

       ScaleRowDown2Box_C, 2, 1, 15)

+SDODD(ScaleRowDown2Box_Odd_NEON, ScaleRowDown2Box_NEON,

+      ScaleRowDown2Box_Odd_C, 2, 1, 15)

 #endif

-#ifdef HAS_SCALEROWDOWN4_SSE2

-SDANY(ScaleRowDown4_Any_SSE2, ScaleRowDown4_SSE2, ScaleRowDown4_C, 4, 1, 7)

-SDANY(ScaleRowDown4Box_Any_SSE2, ScaleRowDown4Box_SSE2, ScaleRowDown4Box_C,

+#ifdef HAS_SCALEROWDOWN4_SSSE3

+SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7)

+SDANY(ScaleRowDown4Box_Any_SSSE3, ScaleRowDown4Box_SSSE3, ScaleRowDown4Box_C,

       4, 1, 7)

 #endif

 #ifdef HAS_SCALEROWDOWN4_AVX2

--- a/third_party/libyuv/source/scale_argb.cc

+++ b/third_party/libyuv/source/scale_argb.cc

@@ -210,14 +210,6 @@

   clip_src_width = (int)(xr - xl) * 4;  // Width aligned to 4.

   src_argb += xl * 4;

   x -= (int)(xl << 16);

-#if defined(HAS_INTERPOLATEROW_SSE2)

-  if (TestCpuFlag(kCpuHasSSE2)) {

-    InterpolateRow = InterpolateRow_Any_SSE2;

-    if (IS_ALIGNED(clip_src_width, 16)) {

-      InterpolateRow = InterpolateRow_SSE2;

-    }

-  }

-#endif

 #if defined(HAS_INTERPOLATEROW_SSSE3)

   if (TestCpuFlag(kCpuHasSSSE3)) {

     InterpolateRow = InterpolateRow_Any_SSSE3;

@@ -242,12 +234,12 @@

 #endif

-#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)

-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&

+#if defined(HAS_INTERPOLATEROW_DSPR2)

+  if (TestCpuFlag(kCpuHasDSPR2) &&

       IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) {

-    InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;

+    InterpolateRow = InterpolateRow_Any_DSPR2;

     if (IS_ALIGNED(clip_src_width, 4)) {

-      InterpolateRow = InterpolateRow_MIPS_DSPR2;

+      InterpolateRow = InterpolateRow_DSPR2;

 #endif

@@ -308,14 +300,6 @@

       int dst_width, int x, int dx) =

       filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;

   const int max_y = (src_height - 1) << 16;

-#if defined(HAS_INTERPOLATEROW_SSE2)

-  if (TestCpuFlag(kCpuHasSSE2)) {

-    InterpolateRow = InterpolateRow_Any_SSE2;

-    if (IS_ALIGNED(dst_width, 4)) {

-      InterpolateRow = InterpolateRow_SSE2;

-    }

-  }

-#endif

 #if defined(HAS_INTERPOLATEROW_SSSE3)

   if (TestCpuFlag(kCpuHasSSSE3)) {

     InterpolateRow = InterpolateRow_Any_SSSE3;

@@ -340,10 +324,10 @@

 #endif

-#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)

-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&

+#if defined(HAS_INTERPOLATEROW_DSPR2)

+  if (TestCpuFlag(kCpuHasDSPR2) &&

       IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {

-    InterpolateRow = InterpolateRow_MIPS_DSPR2;

+    InterpolateRow = InterpolateRow_DSPR2;

 #endif

   if (src_width >= 32768) {

@@ -481,13 +465,13 @@

 #endif

-#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)

-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_width, 4) &&

+#if defined(HAS_I422TOARGBROW_DSPR2)

+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_width, 4) &&

       IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&

       IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&

       IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&

       IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {

-    I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;

+    I422ToARGBRow = I422ToARGBRow_DSPR2;

 #endif

@@ -494,14 +478,6 @@

   void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,

       ptrdiff_t src_stride, int dst_width, int source_y_fraction) =

       InterpolateRow_C;

-#if defined(HAS_INTERPOLATEROW_SSE2)

-  if (TestCpuFlag(kCpuHasSSE2)) {

-    InterpolateRow = InterpolateRow_Any_SSE2;

-    if (IS_ALIGNED(dst_width, 4)) {

-      InterpolateRow = InterpolateRow_SSE2;

-    }

-  }

-#endif

 #if defined(HAS_INTERPOLATEROW_SSSE3)

   if (TestCpuFlag(kCpuHasSSSE3)) {

     InterpolateRow = InterpolateRow_Any_SSSE3;

@@ -526,10 +502,10 @@

 #endif

-#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)

-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&

+#if defined(HAS_INTERPOLATEROW_DSPR2)

+  if (TestCpuFlag(kCpuHasDSPR2) &&

       IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {

-    InterpolateRow = InterpolateRow_MIPS_DSPR2;

+    InterpolateRow = InterpolateRow_DSPR2;

 #endif

@@ -845,6 +821,36 @@

             dst_argb, dst_stride_argb, dst_width, dst_height,

             0, 0, dst_width, dst_height, filtering);

   return 0;

+}

+// Scale with YUV conversion to ARGB and clipping.

+// The source planes are first converted to a temporary ARGB image of

+// src_width x src_height pixels, which is then scaled and clipped into

+// dst_argb by ARGBScaleClip.

+// src_fourcc and dst_fourcc are currently not consulted: the source is

+// always converted with I420ToARGB and the destination written as ARGB.

+// Returns the result of ARGBScaleClip, or a non-zero value if the

+// temporary buffer cannot be allocated.

+LIBYUV_API

+int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,

+                       const uint8* src_u, int src_stride_u,

+                       const uint8* src_v, int src_stride_v,

+                       uint32 src_fourcc,

+                       int src_width, int src_height,

+                       uint8* dst_argb, int dst_stride_argb,

+                       uint32 dst_fourcc,

+                       int dst_width, int dst_height,

+                       int clip_x, int clip_y, int clip_width, int clip_height,

+                       enum FilterMode filtering) {

+  uint8* argb_buffer = (uint8*)malloc(src_width * src_height * 4);

+  int r;

+  (void)src_fourcc;  // Not implemented; input is treated as I420.

+  (void)dst_fourcc;  // Not implemented; output is ARGB.

+  if (!argb_buffer) {

+    return 1;  // Out of memory; do not write through a NULL buffer.

+  }

+  I420ToARGB(src_y, src_stride_y,

+             src_u, src_stride_u,

+             src_v, src_stride_v,

+             argb_buffer, src_width * 4,

+             src_width, src_height);

+  r = ARGBScaleClip(argb_buffer, src_width * 4,

+                    src_width, src_height,

+                    dst_argb, dst_stride_argb,

+                    dst_width, dst_height,

+                    clip_x, clip_y, clip_width, clip_height,

+                    filtering);

+  free(argb_buffer);

+  return r;

 #ifdef __cplusplus

--- a/third_party/libyuv/source/scale_common.cc

+++ b/third_party/libyuv/source/scale_common.cc

@@ -103,6 +103,28 @@

+// 2x2 box filter scale down by 2 for a row pair whose source width is odd.

+// dst_width counts every output pixel, including the final one.  All

+// outputs except the last are the rounded average of a 2x2 block taken from

+// the row at src_ptr and the row at src_ptr + src_stride.  The last output

+// is the rounded average of the single remaining column (one pixel from

+// each row).

+void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride,

+                            uint8* dst, int dst_width) {

+  const uint8* s = src_ptr;

+  const uint8* t = src_ptr + src_stride;

+  int x;

+  // Exclude the final odd pixel; it is handled separately at the end.

+  dst_width -= 1;

+  // Two full 2x2 boxes per iteration.

+  for (x = 0; x < dst_width - 1; x += 2) {

+    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;

+    dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;

+    dst += 2;

+    s += 4;

+    t += 4;

+  }

+  // One leftover full 2x2 box when the remaining count is odd.

+  if (dst_width & 1) {

+    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;

+    dst += 1;

+    s += 2;

+    t += 2;

+  }

+  // Final pixel: only one source column, so average vertically.

+  dst[0] = (s[0] + t[0] + 1) >> 1;

+}

 void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,

                            uint16* dst, int dst_width) {

   const uint16* s = src_ptr;

@@ -395,8 +417,16 @@

 // (1-f)a + fb can be replaced with a + f(b-a)

+#if defined(__arm__)

+// arm uses 16 bit math with truncation.

+// TODO(fbarchard): add rounding.

 #define BLENDER(a, b, f) (uint8)((int)(a) + \

-    ((int)(f) * ((int)(b) - (int)(a)) >> 16))

+    (((int)((f)) * ((int)(b) - (int)(a))) >> 16))

+#else

+// intel uses 7 bit math with rounding.

+#define BLENDER(a, b, f) (uint8)((int)(a) + \

+    (((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7))

+#endif

 void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,

                        int dst_width, int x, int dx) {

@@ -448,8 +478,9 @@

 #undef BLENDER

+// Same as 8 bit arm blender but return is cast to uint16

 #define BLENDER(a, b, f) (uint16)((int)(a) + \

-    ((int)(f) * ((int)(b) - (int)(a)) >> 16))

+    (((int)((f)) * ((int)(b) - (int)(a))) >> 16))

 void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,

                        int dst_width, int x, int dx) {

@@ -787,6 +818,7 @@

+// TODO(fbarchard): Replace 0x7f ^ f with 128-f.  bug=605.

 // Mimics SSSE3 blender

 #define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7

 #define BLENDERC(a, b, f, s) (uint32)( \

@@ -876,14 +908,6 @@

   assert(dst_width > 0);

   assert(dst_height > 0);

   src_argb += (x >> 16) * bpp;

-#if defined(HAS_INTERPOLATEROW_SSE2)

-  if (TestCpuFlag(kCpuHasSSE2)) {

-    InterpolateRow = InterpolateRow_Any_SSE2;

-    if (IS_ALIGNED(dst_width_bytes, 16)) {

-      InterpolateRow = InterpolateRow_SSE2;

-    }

-  }

-#endif

 #if defined(HAS_INTERPOLATEROW_SSSE3)

   if (TestCpuFlag(kCpuHasSSSE3)) {

     InterpolateRow = InterpolateRow_Any_SSSE3;

@@ -908,13 +932,13 @@

 #endif

-#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)

-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&

+#if defined(HAS_INTERPOLATEROW_DSPR2)

+  if (TestCpuFlag(kCpuHasDSPR2) &&

       IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&

       IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {

-    InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;

+    InterpolateRow = InterpolateRow_Any_DSPR2;

     if (IS_ALIGNED(dst_width_bytes, 4)) {

-      InterpolateRow = InterpolateRow_MIPS_DSPR2;

+      InterpolateRow = InterpolateRow_DSPR2;

 #endif

@@ -982,13 +1006,13 @@

 #endif

-#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2)

-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&

+#if defined(HAS_INTERPOLATEROW_16_DSPR2)

+  if (TestCpuFlag(kCpuHasDSPR2) &&

       IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&

       IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {

-    InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2;

+    InterpolateRow = InterpolateRow_Any_16_DSPR2;

     if (IS_ALIGNED(dst_width_bytes, 4)) {

-      InterpolateRow = InterpolateRow_16_MIPS_DSPR2;

+      InterpolateRow = InterpolateRow_16_DSPR2;

 #endif

--- a/third_party/libyuv/source/scale_gcc.cc

+++ b/third_party/libyuv/source/scale_gcc.cc

@@ -9,6 +9,7 @@

*/

 #include "libyuv/row.h"

+#include "libyuv/scale_row.h"

 #ifdef __cplusplus

 namespace libyuv {

@@ -16,7 +17,8 @@

 #endif

 // This module is for GCC x86 and x64.

-#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))

+#if !defined(LIBYUV_DISABLE_X86) && \

+    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))

 // Offsets for source bytes 0 to 9

 static uvec8 kShuf0 =

@@ -96,8 +98,8 @@

 // Generated using gcc disassembly on Visual C object file:

 // objdump -D yuvscaler.obj >yuvscaler.txt

-void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,

-                        uint8* dst_ptr, int dst_width) {

+void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,

+                         uint8* dst_ptr, int dst_width) {

   asm volatile (

     LABELALIGN

   "1:                                          \n"

@@ -118,11 +120,13 @@

);

-void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,

-                              uint8* dst_ptr, int dst_width) {

+void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,

+                               uint8* dst_ptr, int dst_width) {

   asm volatile (

-    "pcmpeqb   %%xmm5,%%xmm5                   \n"

-    "psrlw     $0x8,%%xmm5                     \n"

+    "pcmpeqb    %%xmm4,%%xmm4                  \n"

+    "psrlw      $0xf,%%xmm4                    \n"

+    "packuswb   %%xmm4,%%xmm4                  \n"

+    "pxor       %%xmm5,%%xmm5                  \n"

     LABELALIGN

   "1:                                          \n"

@@ -129,15 +133,11 @@

     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"

     "movdqu    " MEMACCESS2(0x10, 0) ",%%xmm1  \n"

     "lea       " MEMLEA(0x20,0) ",%0           \n"

-    "movdqa    %%xmm0,%%xmm2                   \n"

-    "psrlw     $0x8,%%xmm0                     \n"

-    "movdqa    %%xmm1,%%xmm3                   \n"

-    "psrlw     $0x8,%%xmm1                     \n"

-    "pand      %%xmm5,%%xmm2                   \n"

-    "pand      %%xmm5,%%xmm3                   \n"

-    "pavgw     %%xmm2,%%xmm0                   \n"

-    "pavgw     %%xmm3,%%xmm1                   \n"

-    "packuswb  %%xmm1,%%xmm0                   \n"

+    "pmaddubsw  %%xmm4,%%xmm0                  \n"

+    "pmaddubsw  %%xmm4,%%xmm1                  \n"

+    "pavgw      %%xmm5,%%xmm0                  \n"

+    "pavgw      %%xmm5,%%xmm1                  \n"

+    "packuswb   %%xmm1,%%xmm0                  \n"

     "movdqu    %%xmm0," MEMACCESS(1) "         \n"

     "lea       " MEMLEA(0x10,1) ",%1           \n"

     "sub       $0x10,%2                        \n"

@@ -145,15 +145,17 @@

   : "+r"(src_ptr),    // %0

     "+r"(dst_ptr),    // %1

     "+r"(dst_width)   // %2

-  :: "memory", "cc", "xmm0", "xmm1", "xmm5"

+  :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"

);

-void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,

-                           uint8* dst_ptr, int dst_width) {

+void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,

+                            uint8* dst_ptr, int dst_width) {

   asm volatile (

-    "pcmpeqb   %%xmm5,%%xmm5                   \n"

-    "psrlw     $0x8,%%xmm5                     \n"

+    "pcmpeqb    %%xmm4,%%xmm4                  \n"

+    "psrlw      $0xf,%%xmm4                    \n"

+    "packuswb   %%xmm4,%%xmm4                  \n"

+    "pxor       %%xmm5,%%xmm5                  \n"

     LABELALIGN

   "1:                                          \n"

@@ -162,17 +164,17 @@

     MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu  (%0,%3,1),%%xmm2

     MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu  0x10(%0,%3,1),%%xmm3

     "lea       " MEMLEA(0x20,0) ",%0           \n"

-    "pavgb     %%xmm2,%%xmm0                   \n"

-    "pavgb     %%xmm3,%%xmm1                   \n"

-    "movdqa    %%xmm0,%%xmm2                   \n"

-    "psrlw     $0x8,%%xmm0                     \n"

-    "movdqa    %%xmm1,%%xmm3                   \n"

-    "psrlw     $0x8,%%xmm1                     \n"

-    "pand      %%xmm5,%%xmm2                   \n"

-    "pand      %%xmm5,%%xmm3                   \n"

-    "pavgw     %%xmm2,%%xmm0                   \n"

-    "pavgw     %%xmm3,%%xmm1                   \n"

-    "packuswb  %%xmm1,%%xmm0                   \n"

+    "pmaddubsw  %%xmm4,%%xmm0                  \n"

+    "pmaddubsw  %%xmm4,%%xmm1                  \n"

+    "pmaddubsw  %%xmm4,%%xmm2                  \n"

+    "pmaddubsw  %%xmm4,%%xmm3                  \n"

+    "paddw      %%xmm2,%%xmm0                  \n"

+    "paddw      %%xmm3,%%xmm1                  \n"

+    "psrlw      $0x1,%%xmm0                    \n"

+    "psrlw      $0x1,%%xmm1                    \n"

+    "pavgw      %%xmm5,%%xmm0                  \n"

+    "pavgw      %%xmm5,%%xmm1                  \n"

+    "packuswb   %%xmm1,%%xmm0                  \n"

     "movdqu    %%xmm0," MEMACCESS(1) "         \n"

     "lea       " MEMLEA(0x10,1) ",%1           \n"

     "sub       $0x10,%2                        \n"

@@ -186,9 +188,107 @@

);

-void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,

+#ifdef HAS_SCALEROWDOWN2_AVX2

+// Point-samples a row down by 2 using AVX2: the high (odd-indexed) byte of

+// every 16-bit source pair is kept.  Each loop iteration reads 64 source

+// bytes, stores 32 output bytes and subtracts 32 from dst_width, looping

+// while the result is positive.  src_stride is not used by the asm.

+// NOTE(review): dst_width is presumably a multiple of 32 here, with other

+// widths routed through the _Any_ wrapper -- confirm against callers.

+void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,

                         uint8* dst_ptr, int dst_width) {

   asm volatile (

+    LABELALIGN

+  "1:                                          \n"

+    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"

+    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"

+    "lea        " MEMLEA(0x40,0) ",%0          \n"

+    // Shift each word right by 8 so only the odd byte remains.

+    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"

+    "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"

+    // vpackuswb packs per 128-bit lane; vpermq restores linear order.

+    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"

+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"

+    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"

+    "lea        " MEMLEA(0x20,1) ",%1          \n"

+    "sub        $0x20,%2                       \n"

+    "jg         1b                             \n"

+    "vzeroupper                                \n"

+  : "+r"(src_ptr),    // %0

+    "+r"(dst_ptr),    // %1

+    "+r"(dst_width)   // %2

+  :: "memory", "cc", "xmm0", "xmm1"

+  );

+}

+// Scales a single row down by 2 using AVX2, averaging each horizontal pair

+// of source pixels with rounding: (a + b + 1) >> 1.  Each loop iteration

+// reads 64 source bytes, stores 32 output bytes and subtracts 32 from

+// dst_width, looping while the result is positive.  src_stride is not used

+// by the asm.

+// NOTE(review): dst_width is presumably a multiple of 32 here, with other

+// widths routed through the _Any_ wrapper -- confirm against callers.

+void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,

+                              uint8* dst_ptr, int dst_width) {

+  asm volatile (

+    // ymm4 = 0x01 in every byte (vpmaddubsw multiplier); ymm5 = 0.

+    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"

+    "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"

+    "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"

+    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"

+    LABELALIGN

+  "1:                                          \n"

+    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"

+    "vmovdqu    " MEMACCESS2(0x20, 0) ",%%ymm1 \n"

+    "lea        " MEMLEA(0x40,0) ",%0          \n"

+    // Multiply by 1 and add adjacent bytes: 16-bit sum of each pixel pair.

+    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"

+    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"

+    // Averaging with zero computes (sum + 1) >> 1.

+    "vpavgw     %%ymm5,%%ymm0,%%ymm0           \n"

+    "vpavgw     %%ymm5,%%ymm1,%%ymm1           \n"

+    // vpackuswb packs per 128-bit lane; vpermq restores linear order.

+    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"

+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"

+    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"

+    "lea        " MEMLEA(0x20,1) ",%1          \n"

+    "sub        $0x20,%2                       \n"

+    "jg         1b                             \n"

+    "vzeroupper                                \n"

+  : "+r"(src_ptr),    // %0

+    "+r"(dst_ptr),    // %1

+    "+r"(dst_width)   // %2

+  :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"

+  );

+}

+// Scales two rows down by 2 with a 2x2 box filter using AVX2.

+// Each output byte is the rounded average, (sum + 2) >> 2, of a 2x2 block:

+// two adjacent pixels from the row at src_ptr and the two pixels below

+// them in the row at src_ptr + src_stride.  Each loop iteration reads 64

+// source bytes per row, stores 32 output bytes and subtracts 32 from

+// dst_width, looping while the result is positive.

+// NOTE(review): dst_width is presumably a multiple of 32 here, with other

+// widths routed through the _Any_/_Odd_ wrappers -- confirm against callers.

+void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,

+                           uint8* dst_ptr, int dst_width) {

+  asm volatile (

+    // ymm4 = 0x01 in every byte (vpmaddubsw multiplier); ymm5 = 0.

+    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"

+    "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"

+    "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"

+    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"

+    LABELALIGN

+  "1:                                          \n"

+    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"

+    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"

+    MEMOPREG(vmovdqu,0x00,0,3,1,ymm2)          //  vmovdqu  (%0,%3,1),%%ymm2

+    MEMOPREG(vmovdqu,0x20,0,3,1,ymm3)          //  vmovdqu  0x20(%0,%3,1),%%ymm3

+    "lea        " MEMLEA(0x40,0) ",%0          \n"

+    // 16-bit sums of each horizontal pixel pair, for both rows.

+    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"

+    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"

+    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"

+    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"

+    // Add the two rows: sum of the four pixels in each 2x2 block.

+    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"

+    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"

+    // (sum >> 1) then average with zero is equivalent to (sum + 2) >> 2.

+    "vpsrlw     $0x1,%%ymm0,%%ymm0             \n"

+    "vpsrlw     $0x1,%%ymm1,%%ymm1             \n"

+    "vpavgw     %%ymm5,%%ymm0,%%ymm0           \n"

+    "vpavgw     %%ymm5,%%ymm1,%%ymm1           \n"

+    // vpackuswb packs per 128-bit lane; vpermq restores linear order.

+    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"

+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"

+    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"

+    "lea        " MEMLEA(0x20,1) ",%1          \n"

+    "sub        $0x20,%2                       \n"

+    "jg         1b                             \n"

+    "vzeroupper                                \n"

+  : "+r"(src_ptr),    // %0

+    "+r"(dst_ptr),    // %1

+    "+r"(dst_width)   // %2

+  : "r"((intptr_t)(src_stride))   // %3

+  // ymm4 is written above, so xmm4 must be declared clobbered as well.

+  : "memory", "cc", NACL_R14

+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

+  );

+}

+#endif  // HAS_SCALEROWDOWN2_AVX2

+void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,

+                        uint8* dst_ptr, int dst_width) {

+  asm volatile (

     "pcmpeqb   %%xmm5,%%xmm5                   \n"

     "psrld     $0x18,%%xmm5                    \n"

     "pslld     $0x10,%%xmm5                    \n"

@@ -214,12 +314,15 @@

);

-void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,

+void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,

                            uint8* dst_ptr, int dst_width) {

-  intptr_t stridex3 = 0;

+  intptr_t stridex3;

   asm volatile (

-    "pcmpeqb   %%xmm7,%%xmm7                   \n"

-    "psrlw     $0x8,%%xmm7                     \n"

+    "pcmpeqb    %%xmm4,%%xmm4                  \n"

+    "psrlw      $0xf,%%xmm4                    \n"

+    "movdqa     %%xmm4,%%xmm5                  \n"

+    "packuswb   %%xmm4,%%xmm4                  \n"

+    "psllw      $0x3,%%xmm5                    \n"

     "lea       " MEMLEA4(0x00,4,4,2) ",%3      \n"

     LABELALIGN

@@ -228,31 +331,29 @@

     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"

     MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2

     MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3

-    "pavgb     %%xmm2,%%xmm0                   \n"

-    "pavgb     %%xmm3,%%xmm1                   \n"

+    "pmaddubsw  %%xmm4,%%xmm0                  \n"

+    "pmaddubsw  %%xmm4,%%xmm1                  \n"

+    "pmaddubsw  %%xmm4,%%xmm2                  \n"

+    "pmaddubsw  %%xmm4,%%xmm3                  \n"

+    "paddw      %%xmm2,%%xmm0                  \n"

+    "paddw      %%xmm3,%%xmm1                  \n"

     MEMOPREG(movdqu,0x00,0,4,2,xmm2)           //  movdqu  (%0,%4,2),%%xmm2

     MEMOPREG(movdqu,0x10,0,4,2,xmm3)           //  movdqu  0x10(%0,%4,2),%%xmm3

-    MEMOPREG(movdqu,0x00,0,3,1,xmm4)           //  movdqu  (%0,%3,1),%%xmm4

-    MEMOPREG(movdqu,0x10,0,3,1,xmm5)           //  movdqu  0x10(%0,%3,1),%%xmm5

+    "pmaddubsw  %%xmm4,%%xmm2                  \n"

+    "pmaddubsw  %%xmm4,%%xmm3                  \n"

+    "paddw      %%xmm2,%%xmm0                  \n"

+    "paddw      %%xmm3,%%xmm1                  \n"

+    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu  (%0,%3,1),%%xmm2

+    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu  0x10(%0,%3,1),%%xmm3

     "lea       " MEMLEA(0x20,0) ",%0           \n"

-    "pavgb     %%xmm4,%%xmm2                   \n"

-    "pavgb     %%xmm2,%%xmm0                   \n"

-    "pavgb     %%xmm5,%%xmm3                   \n"

-    "pavgb     %%xmm3,%%xmm1                   \n"

-    "movdqa    %%xmm0,%%xmm2                   \n"

-    "psrlw     $0x8,%%xmm0                     \n"

-    "movdqa    %%xmm1,%%xmm3                   \n"

-    "psrlw     $0x8,%%xmm1                     \n"

-    "pand      %%xmm7,%%xmm2                   \n"

-    "pand      %%xmm7,%%xmm3                   \n"

-    "pavgw     %%xmm2,%%xmm0                   \n"

-    "pavgw     %%xmm3,%%xmm1                   \n"

-    "packuswb  %%xmm1,%%xmm0                   \n"

-    "movdqa    %%xmm0,%%xmm2                   \n"

-    "psrlw     $0x8,%%xmm0                     \n"

-    "pand      %%xmm7,%%xmm2                   \n"

-    "pavgw     %%xmm2,%%xmm0                   \n"

-    "packuswb  %%xmm0,%%xmm0                   \n"

+    "pmaddubsw  %%xmm4,%%xmm2                  \n"

+    "pmaddubsw  %%xmm4,%%xmm3                  \n"

+    "paddw      %%xmm2,%%xmm0                  \n"

+    "paddw      %%xmm3,%%xmm1                  \n"

+    "phaddw     %%xmm1,%%xmm0                  \n"

+    "paddw      %%xmm5,%%xmm0                  \n"

+    "psrlw      $0x4,%%xmm0                    \n"

+    "packuswb   %%xmm0,%%xmm0                  \n"

     "movq      %%xmm0," MEMACCESS(1) "         \n"

     "lea       " MEMLEA(0x8,1) ",%1            \n"

     "sub       $0x8,%2                         \n"

@@ -260,13 +361,100 @@

   : "+r"(src_ptr),     // %0

     "+r"(dst_ptr),     // %1

     "+r"(dst_width),   // %2

-    "+r"(stridex3)     // %3

+    "=&r"(stridex3)    // %3

   : "r"((intptr_t)(src_stride))    // %4

   : "memory", "cc", NACL_R14

-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"

+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

);

+#ifdef HAS_SCALEROWDOWN4_AVX2

+void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,

+                        uint8* dst_ptr, int dst_width) {

+  asm volatile (  // Point-sample 1/4: keeps byte 2 of every 4 source bytes; 64 bytes in, 16 bytes out per pass.

+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"  // ymm5 = all ones

+    "vpsrld     $0x18,%%ymm5,%%ymm5            \n"  // 0x000000ff in each dword

+    "vpslld     $0x10,%%ymm5,%%ymm5            \n"  // 0x00ff0000 in each dword (byte-2 mask)

+    LABELALIGN

+  "1:                                          \n"  // main loop

+    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"  // first 32 source bytes

+    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"  // next 32 source bytes

+    "lea        " MEMLEA(0x40,0) ",%0          \n"  // src_ptr += 64

+    "vpand      %%ymm5,%%ymm0,%%ymm0           \n"  // keep only byte 2 of each dword

+    "vpand      %%ymm5,%%ymm1,%%ymm1           \n"

+    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"  // words -> bytes, packed per 128-bit lane

+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"  // qword order 0,2,1,3: restore source order after in-lane pack

+    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"  // kept byte moves to the low half of each word

+    "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"  // words -> bytes again (8 results per lane)

+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"  // gather both lanes' results into the low 128 bits

+    "vmovdqu    %%xmm0," MEMACCESS(1) "        \n"  // store 16 output bytes

+    "lea        " MEMLEA(0x10,1) ",%1          \n"  // dst_ptr += 16

+    "sub        $0x10,%2                       \n"  // dst_width -= 16

+    "jg         1b                             \n"  // repeat while dst_width > 0

+    "vzeroupper                                \n"  // zero the upper halves of the ymm registers

+  : "+r"(src_ptr),    // %0

+    "+r"(dst_ptr),    // %1

+    "+r"(dst_width)   // %2

+  :: "memory", "cc", "xmm0", "xmm1", "xmm5"  // no input operands: src_stride is not used

+  );

+}

+void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,

+                           uint8* dst_ptr, int dst_width) {

+  asm volatile (  // 4x4 box filter: each output byte = (sum of 16 source bytes + 8) >> 4; 16 outputs per pass.

+    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"  // ymm4 = all ones

+    "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"  // 0x0001 in each word

+    "vpsllw     $0x3,%%ymm4,%%ymm5             \n"  // ymm5 = 0x0008 in each word (rounding term)

+    "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"  // ymm4 = 0x01 in each byte (vpmaddubsw multiplier)

+    LABELALIGN

+  "1:                                          \n"  // main loop

+    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"  // row 0, bytes 0..31

+    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"  // row 0, bytes 32..63

+    MEMOPREG(vmovdqu,0x00,0,3,1,ymm2)          //  vmovdqu  (%0,%3,1),%%ymm2  (row 1)

+    MEMOPREG(vmovdqu,0x20,0,3,1,ymm3)          //  vmovdqu  0x20(%0,%3,1),%%ymm3

+    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"  // multiply by 1 and add adjacent bytes -> 16-bit pair sums

+    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"

+    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"

+    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"

+    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"  // accumulate row 1 into row 0

+    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"

+    MEMOPREG(vmovdqu,0x00,0,3,2,ymm2)          //  vmovdqu  (%0,%3,2),%%ymm2  (row 2)

+    MEMOPREG(vmovdqu,0x20,0,3,2,ymm3)          //  vmovdqu  0x20(%0,%3,2),%%ymm3

+    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"

+    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"

+    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"  // accumulate row 2

+    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"

+    MEMOPREG(vmovdqu,0x00,0,4,1,ymm2)          //  vmovdqu  (%0,%4,1),%%ymm2  (row 3, %4 = stride * 3)

+    MEMOPREG(vmovdqu,0x20,0,4,1,ymm3)          //  vmovdqu  0x20(%0,%4,1),%%ymm3

+    "lea        " MEMLEA(0x40,0) ",%0          \n"  // src_ptr += 64

+    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"

+    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"

+    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"  // accumulate row 3

+    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"

+    "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"  // add adjacent words: 4 columns x 4 rows per word

+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"  // qword order 0,2,1,3: restore order after in-lane hadd

+    "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"  // + 8 for rounding

+    "vpsrlw     $0x4,%%ymm0,%%ymm0             \n"  // divide by 16

+    "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"  // words -> bytes (8 results per lane)

+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"  // gather both lanes' results into the low 128 bits

+    "vmovdqu    %%xmm0," MEMACCESS(1) "        \n"  // store 16 output bytes

+    "lea        " MEMLEA(0x10,1) ",%1          \n"  // dst_ptr += 16

+    "sub        $0x10,%2                       \n"  // dst_width -= 16

+    "jg         1b                             \n"  // repeat while dst_width > 0

+    "vzeroupper                                \n"  // zero the upper halves of the ymm registers

+  : "+r"(src_ptr),    // %0

+    "+r"(dst_ptr),    // %1

+    "+r"(dst_width)   // %2

+  : "r"((intptr_t)(src_stride)),  // %3

+    "r"((intptr_t)(src_stride * 3))   // %4

+  : "memory", "cc", NACL_R14

+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

+  );

+}

+#endif  // HAS_SCALEROWDOWN4_AVX2

 void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,

                           uint8* dst_ptr, int dst_width) {

   asm volatile (

@@ -574,54 +762,79 @@

 // Reads 16xN bytes and produces 16 shorts at a time.

-void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,

-                       uint16* dst_ptr, int src_width, int src_height) {

-  int tmp_height = 0;

-  intptr_t tmp_src = 0;

+void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {

   asm volatile (

-    "mov       %0,%3                           \n"  // row pointer

-    "mov       %5,%2                           \n"  // height

-    "pxor      %%xmm0,%%xmm0                   \n"  // clear accumulators

-    "pxor      %%xmm1,%%xmm1                   \n"

-    "pxor      %%xmm4,%%xmm4                   \n"

+    "pxor      %%xmm5,%%xmm5                   \n"

     LABELALIGN

   "1:                                          \n"

-    "movdqu    " MEMACCESS(3) ",%%xmm2         \n"

-    "add       %6,%3                           \n"

-    "movdqa    %%xmm2,%%xmm3                   \n"

-    "punpcklbw %%xmm4,%%xmm2                   \n"

-    "punpckhbw %%xmm4,%%xmm3                   \n"

+    "movdqu    " MEMACCESS(0) ",%%xmm3         \n"

+    "lea       " MEMLEA(0x10,0) ",%0           \n"  // src_ptr += 16

+    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"

+    "movdqu    " MEMACCESS2(0x10,1) ",%%xmm1   \n"

+    "movdqa    %%xmm3,%%xmm2                   \n"

+    "punpcklbw %%xmm5,%%xmm2                   \n"

+    "punpckhbw %%xmm5,%%xmm3                   \n"

     "paddusw   %%xmm2,%%xmm0                   \n"

     "paddusw   %%xmm3,%%xmm1                   \n"

-    "sub       $0x1,%2                         \n"

-    "jg        1b                              \n"

     "movdqu    %%xmm0," MEMACCESS(1) "         \n"

     "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"

     "lea       " MEMLEA(0x20,1) ",%1           \n"

-    "lea       " MEMLEA(0x10,0) ",%0           \n"  // src_ptr += 16

-    "mov       %0,%3                           \n"  // row pointer

-    "mov       %5,%2                           \n"  // height

-    "pxor      %%xmm0,%%xmm0                   \n"  // clear accumulators

-    "pxor      %%xmm1,%%xmm1                   \n"

-    "sub       $0x10,%4                        \n"

+    "sub       $0x10,%2                        \n"

     "jg        1b                              \n"

   : "+r"(src_ptr),     // %0

     "+r"(dst_ptr),     // %1

-    "+r"(tmp_height),  // %2

-    "+r"(tmp_src),     // %3

-    "+r"(src_width),   // %4

-    "+rm"(src_height)  // %5

-  : "rm"((intptr_t)(src_stride))  // %6

-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"

+    "+r"(src_width)    // %2

+  :

+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"

);

+#ifdef HAS_SCALEADDROW_AVX2

+// Reads 32 bytes and accumulates to 32 shorts at a time.

+void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {

+  asm volatile (  // dst_ptr[i] = saturating_add(dst_ptr[i], src_ptr[i]); 32 pixels per pass.

+    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"  // ymm5 = 0, used to zero-extend bytes to words

+    LABELALIGN

+  "1:                                          \n"  // main loop

+    "vmovdqu    " MEMACCESS(0) ",%%ymm3        \n"  // load 32 source bytes

+    "lea        " MEMLEA(0x20,0) ",%0          \n"  // src_ptr += 32

+    "vpermq     $0xd8,%%ymm3,%%ymm3            \n"  // qword order 0,2,1,3 so the in-lane unpacks stay in source order

+    "vpunpcklbw %%ymm5,%%ymm3,%%ymm2           \n"  // ymm2 = source bytes 0..15 widened to 16 bits

+    "vpunpckhbw %%ymm5,%%ymm3,%%ymm3           \n"  // ymm3 = source bytes 16..31 widened to 16 bits

+    "vpaddusw   " MEMACCESS(1) ",%%ymm2,%%ymm0 \n"  // unsigned saturating add with the current dst values

+    "vpaddusw   " MEMACCESS2(0x20,1) ",%%ymm3,%%ymm1 \n"

+    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"  // write the updated sums back

+    "vmovdqu    %%ymm1," MEMACCESS2(0x20,1) "  \n"

+    "lea       " MEMLEA(0x40,1) ",%1           \n"  // dst_ptr += 32 uint16 (64 bytes)

+    "sub       $0x20,%2                        \n"  // src_width -= 32

+    "jg        1b                              \n"  // repeat while src_width > 0

+    "vzeroupper                                \n"  // zero the upper halves of the ymm registers

+  : "+r"(src_ptr),     // %0

+    "+r"(dst_ptr),     // %1

+    "+r"(src_width)    // %2

+  :

+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"

+  );

+}

+#endif  // HAS_SCALEADDROW_AVX2

+// Constant for making pixels signed to avoid pmaddubsw

+// saturation.

+static uvec8 kFsub80 =  // 0x80 in every byte; ScaleFilterCols_SSSE3 subtracts it from the pixels with psubb.

+  { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };

+// Constant for making pixels unsigned and adding .5 for rounding.

+static uvec16 kFadd40 =  // 0x4040 per word = 0x4000 (128 * 128, undoes the 0x80 pixel bias) + 0x40 (half of 128, rounds the later >> 7).

+  { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 };

 // Bilinear column filtering. SSSE3 version.

 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,

                            int dst_width, int x, int dx) {

-  intptr_t x0 = 0, x1 = 0, temp_pixel = 0;

+  intptr_t x0, x1, temp_pixel;

   asm volatile (

     "movd      %6,%%xmm2                       \n"

     "movd      %7,%%xmm3                       \n"

@@ -628,7 +841,10 @@

     "movl      $0x04040000,%k2                 \n"

     "movd      %k2,%%xmm5                      \n"

     "pcmpeqb   %%xmm6,%%xmm6                   \n"

-    "psrlw     $0x9,%%xmm6                     \n"

+    "psrlw     $0x9,%%xmm6                     \n"  // 0x007f007f

+    "pcmpeqb   %%xmm7,%%xmm7                   \n"

+    "psrlw     $15,%%xmm7                      \n"  // 0x00010001

     "pextrw    $0x1,%%xmm2,%k3                 \n"

     "subl      $0x2,%5                         \n"

     "jl        29f                             \n"

@@ -650,16 +866,19 @@

     "movd      %k2,%%xmm4                      \n"

     "pshufb    %%xmm5,%%xmm1                   \n"

     "punpcklwd %%xmm4,%%xmm0                   \n"

-    "pxor      %%xmm6,%%xmm1                   \n"

-    "pmaddubsw %%xmm1,%%xmm0                   \n"

+    "psubb     %8,%%xmm0                       \n"  // make pixels signed.

+    "pxor      %%xmm6,%%xmm1                   \n"  // 128 -f = (f ^ 127 ) + 1

+    "paddusb   %%xmm7,%%xmm1                   \n"

+    "pmaddubsw %%xmm0,%%xmm1                   \n"

     "pextrw    $0x1,%%xmm2,%k3                 \n"

     "pextrw    $0x3,%%xmm2,%k4                 \n"

-    "psrlw     $0x7,%%xmm0                     \n"

-    "packuswb  %%xmm0,%%xmm0                   \n"

-    "movd      %%xmm0,%k2                      \n"

+    "paddw     %9,%%xmm1                       \n"  // make pixels unsigned.

+    "psrlw     $0x7,%%xmm1                     \n"

+    "packuswb  %%xmm1,%%xmm1                   \n"

+    "movd      %%xmm1,%k2                      \n"

     "mov       %w2," MEMACCESS(0) "            \n"

     "lea       " MEMLEA(0x2,0) ",%0            \n"

-    "sub       $0x2,%5                         \n"

+    "subl      $0x2,%5                         \n"

     "jge       2b                              \n"

     LABELALIGN

@@ -670,23 +889,37 @@

     "movd      %k2,%%xmm0                      \n"

     "psrlw     $0x9,%%xmm2                     \n"

     "pshufb    %%xmm5,%%xmm2                   \n"

+    "psubb     %8,%%xmm0                       \n"  // make pixels signed.

     "pxor      %%xmm6,%%xmm2                   \n"

-    "pmaddubsw %%xmm2,%%xmm0                   \n"

-    "psrlw     $0x7,%%xmm0                     \n"

-    "packuswb  %%xmm0,%%xmm0                   \n"

-    "movd      %%xmm0,%k2                      \n"

+    "paddusb   %%xmm7,%%xmm2                   \n"

+    "pmaddubsw %%xmm0,%%xmm2                   \n"

+    "paddw     %9,%%xmm2                       \n"  // make pixels unsigned.

+    "psrlw     $0x7,%%xmm2                     \n"

+    "packuswb  %%xmm2,%%xmm2                   \n"

+    "movd      %%xmm2,%k2                      \n"

     "mov       %b2," MEMACCESS(0) "            \n"

   "99:                                         \n"

-  : "+r"(dst_ptr),     // %0

-    "+r"(src_ptr),     // %1

-    "+a"(temp_pixel),  // %2

-    "+r"(x0),          // %3

-    "+r"(x1),          // %4

-    "+rm"(dst_width)   // %5

-  : "rm"(x),           // %6

-    "rm"(dx)           // %7

+  : "+r"(dst_ptr),      // %0

+    "+r"(src_ptr),      // %1

+    "=&a"(temp_pixel),  // %2

+    "=&r"(x0),          // %3

+    "=&r"(x1),          // %4

+#if defined(__x86_64__)

+    "+rm"(dst_width)    // %5

+#else

+    "+m"(dst_width)    // %5

+#endif

+  : "rm"(x),            // %6

+    "rm"(dx),           // %7

+#if defined(__x86_64__)

+    "x"(kFsub80),       // %8

+    "x"(kFadd40)        // %9

+#else

+    "m"(kFsub80),       // %8

+    "m"(kFadd40)        // %9

+#endif

   : "memory", "cc", NACL_R14

-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"

+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"

);

@@ -795,7 +1028,7 @@

 void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,

                                int src_stepx, uint8* dst_argb, int dst_width) {

   intptr_t src_stepx_x4 = (intptr_t)(src_stepx);

-  intptr_t src_stepx_x12 = 0;

+  intptr_t src_stepx_x12;

   asm volatile (

     "lea       " MEMLEA3(0x00,1,4) ",%1        \n"

     "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"

@@ -813,11 +1046,11 @@

     "lea       " MEMLEA(0x10,2) ",%2           \n"

     "sub       $0x4,%3                         \n"

     "jg        1b                              \n"

-  : "+r"(src_argb),      // %0

-    "+r"(src_stepx_x4),  // %1

-    "+r"(dst_argb),      // %2

-    "+r"(dst_width),     // %3

-    "+r"(src_stepx_x12)  // %4

+  : "+r"(src_argb),       // %0

+    "+r"(src_stepx_x4),   // %1

+    "+r"(dst_argb),       // %2

+    "+r"(dst_width),      // %3

+    "=&r"(src_stepx_x12)  // %4

   :: "memory", "cc", NACL_R14

     "xmm0", "xmm1", "xmm2", "xmm3"

);

@@ -829,7 +1062,7 @@

                                   ptrdiff_t src_stride, int src_stepx,

                                   uint8* dst_argb, int dst_width) {

   intptr_t src_stepx_x4 = (intptr_t)(src_stepx);

-  intptr_t src_stepx_x12 = 0;

+  intptr_t src_stepx_x12;

   intptr_t row1 = (intptr_t)(src_stride);

   asm volatile (

     "lea       " MEMLEA3(0x00,1,4) ",%1        \n"

@@ -858,12 +1091,12 @@

     "lea       " MEMLEA(0x10,2) ",%2           \n"

     "sub       $0x4,%3                         \n"

     "jg        1b                              \n"

-  : "+r"(src_argb),       // %0

-    "+r"(src_stepx_x4),   // %1

-    "+r"(dst_argb),       // %2

-    "+rm"(dst_width),     // %3

-    "+r"(src_stepx_x12),  // %4

-    "+r"(row1)            // %5

+  : "+r"(src_argb),        // %0

+    "+r"(src_stepx_x4),    // %1

+    "+r"(dst_argb),        // %2

+    "+rm"(dst_width),      // %3

+    "=&r"(src_stepx_x12),  // %4

+    "+r"(row1)             // %5

   :: "memory", "cc", NACL_R14

     "xmm0", "xmm1", "xmm2", "xmm3"

);

@@ -871,7 +1104,7 @@

 void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,

                         int dst_width, int x, int dx) {

-  intptr_t x0 = 0, x1 = 0;

+  intptr_t x0, x1;

   asm volatile (

     "movd      %5,%%xmm2                       \n"

     "movd      %6,%%xmm3                       \n"

@@ -924,8 +1157,8 @@

     MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0

     "movd      %%xmm0," MEMACCESS(2) "         \n"

   "99:                                         \n"

-  : "+a"(x0),          // %0

-    "+d"(x1),          // %1

+  : "=&a"(x0),         // %0

+    "=&d"(x1),         // %1

     "+r"(dst_argb),    // %2

     "+r"(src_argb),    // %3

     "+r"(dst_width)    // %4

@@ -976,7 +1209,7 @@

 // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version

 void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,

                                int dst_width, int x, int dx) {

-  intptr_t x0 = 0, x1 = 0;

+  intptr_t x0, x1;

   asm volatile (

     "movdqa    %0,%%xmm4                       \n"

     "movdqa    %1,%%xmm5                       \n"

@@ -1039,8 +1272,8 @@

   : "+r"(dst_argb),    // %0

     "+r"(src_argb),    // %1

     "+rm"(dst_width),  // %2

-    "+r"(x0),          // %3

-    "+r"(x1)           // %4

+    "=&r"(x0),         // %3

+    "=&r"(x1)          // %4

   : "rm"(x),           // %5

     "rm"(dx)           // %6

   : "memory", "cc", NACL_R14

--- a/third_party/libyuv/source/scale_mips.cc

+++ b/third_party/libyuv/source/scale_mips.cc

@@ -21,8 +21,8 @@

     defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \

     (_MIPS_SIM == _MIPS_SIM_ABI32)

-void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,

-                              uint8* dst, int dst_width) {

+void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,

+                         uint8* dst, int dst_width) {

   __asm__ __volatile__(

     ".set push                                     \n"

     ".set noreorder                                \n"

@@ -31,7 +31,6 @@

     "beqz           $t9, 2f                        \n"

     " nop                                          \n"

-    ".p2align       2                              \n"

   "1:                                              \n"

     "lw             $t0, 0(%[src_ptr])             \n"  // |3|2|1|0|

     "lw             $t1, 4(%[src_ptr])             \n"  // |7|6|5|4|

@@ -78,8 +77,8 @@

);

-void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,

-                                 uint8* dst, int dst_width) {

+void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,

+                            uint8* dst, int dst_width) {

   const uint8* t = src_ptr + src_stride;

   __asm__ __volatile__ (

@@ -90,7 +89,6 @@

     "bltz           $t9, 2f                       \n"

     " nop                                         \n"

-    ".p2align       2                             \n"

   "1:                                             \n"

     "lw             $t0, 0(%[src_ptr])            \n"  // |3|2|1|0|

     "lw             $t1, 4(%[src_ptr])            \n"  // |7|6|5|4|

@@ -178,8 +176,8 @@

);

-void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,

-                              uint8* dst, int dst_width) {

+void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,

+                         uint8* dst, int dst_width) {

   __asm__ __volatile__ (

       ".set push                                    \n"

       ".set noreorder                               \n"

@@ -188,7 +186,6 @@

       "beqz           $t9, 2f                       \n"

       " nop                                         \n"

-      ".p2align       2                             \n"

      "1:                                            \n"

       "lw             $t1, 0(%[src_ptr])            \n"  // |3|2|1|0|

       "lw             $t2, 4(%[src_ptr])            \n"  // |7|6|5|4|

@@ -234,8 +231,8 @@

);

-void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,

-                                 uint8* dst, int dst_width) {

+void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,

+                            uint8* dst, int dst_width) {

   intptr_t stride = src_stride;

   const uint8* s1 = src_ptr + stride;

   const uint8* s2 = s1 + stride;

@@ -248,7 +245,6 @@

       "srl           $t9, %[dst_width], 1         \n"

       "andi          $t8, %[dst_width], 1         \n"

-      ".p2align      2                            \n"

      "1:                                          \n"

       "lw            $t0, 0(%[src_ptr])           \n"  // |3|2|1|0|

       "lw            $t1, 0(%[s1])                \n"  // |7|6|5|4|

@@ -314,12 +310,11 @@

);

-void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,

-                               uint8* dst, int dst_width) {

+void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,

+                          uint8* dst, int dst_width) {

   __asm__ __volatile__ (

       ".set push                                          \n"

       ".set noreorder                                     \n"

-      ".p2align        2                                  \n"

     "1:                                                   \n"

       "lw              $t1, 0(%[src_ptr])                 \n"  // |3|2|1|0|

       "lw              $t2, 4(%[src_ptr])                 \n"  // |7|6|5|4|

@@ -361,14 +356,13 @@

);

-void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,

-                                     uint8* d, int dst_width) {

+void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,

+                                uint8* d, int dst_width) {

   __asm__ __volatile__ (

       ".set push                                         \n"

       ".set noreorder                                    \n"

       "repl.ph           $t3, 3                          \n"  // 0x00030003

-     ".p2align           2                               \n"

     "1:                                                  \n"

       "lw                $t0, 0(%[src_ptr])              \n"  // |S3|S2|S1|S0|

       "lwx               $t1, %[src_stride](%[src_ptr])  \n"  // |T3|T2|T1|T0|

@@ -418,14 +412,13 @@

);

-void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,

-                                     uint8* d, int dst_width) {

+void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,

+                                uint8* d, int dst_width) {

   __asm__ __volatile__ (

       ".set push                                           \n"

       ".set noreorder                                      \n"

       "repl.ph           $t2, 3                            \n"  // 0x00030003

-      ".p2align          2                                 \n"

     "1:                                                    \n"

       "lw                $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|

       "lwx               $t1, %[src_stride](%[src_ptr])    \n"  // |T3|T2|T1|T0|

@@ -471,13 +464,12 @@

);

-void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,

-                               uint8* dst, int dst_width) {

+void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,

+                          uint8* dst, int dst_width) {

   __asm__ __volatile__ (

       ".set push                                     \n"

       ".set noreorder                                \n"

-      ".p2align   2                                  \n"

     "1:                                              \n"

       "lw         $t0, 0(%[src_ptr])                 \n"  // |3|2|1|0|

       "lw         $t1, 4(%[src_ptr])                 \n"  // |7|6|5|4|

@@ -518,8 +510,8 @@

);

-void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,

-                                     uint8* dst_ptr, int dst_width) {

+void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,

+                                uint8* dst_ptr, int dst_width) {

   intptr_t stride = src_stride;

   const uint8* t = src_ptr + stride;

   const int c = 0x2AAA;

@@ -528,7 +520,6 @@

       ".set push                                         \n"

       ".set noreorder                                    \n"

-      ".p2align        2                                 \n"

     "1:                                                  \n"

       "lw              $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|

       "lw              $t1, 4(%[src_ptr])                \n"  // |S7|S6|S5|S4|

@@ -572,9 +563,9 @@

);

-void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,

-                                     ptrdiff_t src_stride,

-                                     uint8* dst_ptr, int dst_width) {

+void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,

+                                ptrdiff_t src_stride,

+                                uint8* dst_ptr, int dst_width) {

   intptr_t stride = src_stride;

   const uint8* s1 = src_ptr + stride;

   stride += stride;

@@ -586,7 +577,6 @@

       ".set push                                         \n"

       ".set noreorder                                    \n"

-      ".p2align        2                                 \n"

     "1:                                                  \n"

       "lw              $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|

       "lw              $t1, 4(%[src_ptr])                \n"  // |S7|S6|S5|S4|

--- a/third_party/libyuv/source/scale_neon.cc

+++ b/third_party/libyuv/source/scale_neon.cc

@@ -26,7 +26,6 @@

 void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,

                         uint8* dst, int dst_width) {

   asm volatile (

-    ".p2align   2                              \n"

   "1:                                          \n"

     // load even pixels into q0, odd into q1

     MEMACCESS(0)

@@ -47,7 +46,6 @@

 void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,

                            uint8* dst, int dst_width) {

   asm volatile (

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld1.8     {q0, q1}, [%0]!                \n"  // load pixels and post inc

@@ -73,7 +71,6 @@

   asm volatile (

     // change the stride to row 2 pointer

     "add        %1, %0                         \n"

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld1.8     {q0, q1}, [%0]!                \n"  // load row 1 and post inc

@@ -101,7 +98,6 @@

 void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,

                         uint8* dst_ptr, int dst_width) {

   asm volatile (

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n" // src line 0

@@ -123,7 +119,6 @@

   const uint8* src_ptr2 = src_ptr + src_stride * 2;

   const uint8* src_ptr3 = src_ptr + src_stride * 3;

 asm volatile (

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld1.8     {q0}, [%0]!                    \n"   // load up 16x4

@@ -162,7 +157,6 @@

                          ptrdiff_t src_stride,

                          uint8* dst_ptr, int dst_width) {

   asm volatile (

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld4.8     {d0, d1, d2, d3}, [%0]!      \n" // src line 0

@@ -185,7 +179,6 @@

   asm volatile (

     "vmov.u8    d24, #3                        \n"

     "add        %3, %0                         \n"

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld4.8       {d0, d1, d2, d3}, [%0]!      \n" // src line 0

@@ -245,7 +238,6 @@

   asm volatile (

     "vmov.u8    d24, #3                        \n"

     "add        %3, %0                         \n"

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld4.8       {d0, d1, d2, d3}, [%0]!      \n" // src line 0

@@ -300,7 +292,6 @@

   asm volatile (

     MEMACCESS(3)

     "vld1.8     {q3}, [%3]                     \n"

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"

@@ -334,7 +325,6 @@

     MEMACCESS(7)

     "vld1.8     {q15}, [%7]                    \n"

     "add        %3, %0                         \n"

-    ".p2align   2                              \n"

   "1:                                          \n"

     // d0 = 00 40 01 41 02 42 03 43

@@ -450,7 +440,6 @@

     MEMACCESS(5)

     "vld1.8     {q14}, [%5]                    \n"

     "add        %3, %0                         \n"

-    ".p2align   2                              \n"

   "1:                                          \n"

     // d0 = 00 40 01 41 02 42 03 43

@@ -543,9 +532,8 @@

 void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,

                     uint16* dst_ptr, int src_width, int src_height) {

-  const uint8* src_tmp = NULL;

+  const uint8* src_tmp;

   asm volatile (

-    ".p2align   2                              \n"

   "1:                                          \n"

     "mov       %0, %1                          \n"

     "mov       r12, %5                         \n"

@@ -564,12 +552,12 @@

     "add        %1, %1, #16                    \n"

     "subs       %4, %4, #16                    \n"  // 16 processed per loop

     "bgt        1b                             \n"

-  : "+r"(src_tmp),          // %0

-    "+r"(src_ptr),          // %1

-    "+r"(dst_ptr),          // %2

-    "+r"(src_stride),       // %3

-    "+r"(src_width),        // %4

-    "+r"(src_height)        // %5

+  : "=&r"(src_tmp),    // %0

+    "+r"(src_ptr),     // %1

+    "+r"(dst_ptr),     // %2

+    "+r"(src_stride),  // %3

+    "+r"(src_width),   // %4

+    "+r"(src_height)   // %5

   : "memory", "cc", "r12", "q0", "q1", "q2", "q3"  // Clobber List

);

@@ -584,6 +572,10 @@

     MEMACCESS(6)                                               \

     "vld2.8     {d6["#n"], d7["#n"]}, [%6]     \n"

+// The NEON version mimics this formula:

+// #define BLENDER(a, b, f) (uint8)((int)(a) +

+//    ((int)(f) * ((int)(b) - (int)(a)) >> 16))

 void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,

                           int dst_width, int x, int dx) {

   int dx_offset[4] = {0, 1, 2, 3};

@@ -590,7 +582,6 @@

   int* tmp = dx_offset;

   const uint8* src_tmp = src_ptr;

   asm volatile (

-    ".p2align   2                              \n"

     "vdup.32    q0, %3                         \n"  // x

     "vdup.32    q1, %4                         \n"  // dx

     "vld1.32    {q2}, [%5]                     \n"  // 0 1 2 3

@@ -749,7 +740,6 @@

 void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,

                             uint8* dst, int dst_width) {

   asm volatile (

-    ".p2align   2                              \n"

   "1:                                          \n"

     // load even pixels into q0, odd into q1

     MEMACCESS(0)

@@ -773,7 +763,6 @@

 void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,

                                   uint8* dst_argb, int dst_width) {

   asm volatile (

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.

@@ -804,7 +793,6 @@

   asm volatile (

     // change the stride to row 2 pointer

     "add        %1, %1, %0                     \n"

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.

@@ -845,7 +833,6 @@

                                int src_stepx, uint8* dst_argb, int dst_width) {

   asm volatile (

     "mov        r12, %3, lsl #2                \n"

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld1.32    {d0[0]}, [%0], r12             \n"

@@ -875,7 +862,6 @@

   asm volatile (

     "mov        r12, %4, lsl #2                \n"

     "add        %1, %1, %0                     \n"

-    ".p2align   2                              \n"

   "1:                                          \n"

     MEMACCESS(0)

     "vld1.8     {d0}, [%0], r12                \n"  // Read 4 2x2 blocks -> 2x1

@@ -927,10 +913,9 @@

 void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,

                         int dst_width, int x, int dx) {

-  int tmp = 0;

+  int tmp;

   const uint8* src_tmp = src_argb;

   asm volatile (

-    ".p2align   2                              \n"

   "1:                                          \n"

     LOAD1_DATA32_LANE(d0, 0)

     LOAD1_DATA32_LANE(d0, 1)

@@ -945,13 +930,13 @@

     "vst1.32     {q0, q1}, [%0]!               \n"  // store pixels

     "subs       %2, %2, #8                     \n"  // 8 processed per loop

     "bgt        1b                             \n"

-  : "+r"(dst_argb),         // %0

-    "+r"(src_argb),         // %1

-    "+r"(dst_width),        // %2

-    "+r"(x),                // %3

-    "+r"(dx),               // %4

-    "+r"(tmp),              // %5

-    "+r"(src_tmp)           // %6

+  : "+r"(dst_argb),   // %0

+    "+r"(src_argb),   // %1

+    "+r"(dst_width),  // %2

+    "+r"(x),          // %3

+    "+r"(dx),         // %4

+    "=&r"(tmp),       // %5

+    "+r"(src_tmp)     // %6

   : "memory", "cc", "q0", "q1"

);

@@ -974,7 +959,6 @@

   int* tmp = dx_offset;

   const uint8* src_tmp = src_argb;

   asm volatile (

-    ".p2align   2                              \n"

     "vdup.32    q0, %3                         \n"  // x

     "vdup.32    q1, %4                         \n"  // dx

     "vld1.32    {q2}, [%5]                     \n"  // 0 1 2 3

--- a/third_party/libyuv/source/scale_neon64.cc

+++ b/third_party/libyuv/source/scale_neon64.cc

@@ -547,7 +547,7 @@

 void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,

                     uint16* dst_ptr, int src_width, int src_height) {

-  const uint8* src_tmp = NULL;

+  const uint8* src_tmp;

   asm volatile (

   "1:                                          \n"

     "mov       %0, %1                          \n"

@@ -567,12 +567,12 @@

     "add      %1, %1, #16                      \n"

     "subs     %w4, %w4, #16                    \n"  // 16 processed per loop

     "b.gt     1b                               \n"

-  : "+r"(src_tmp),          // %0

-    "+r"(src_ptr),          // %1

-    "+r"(dst_ptr),          // %2

-    "+r"(src_stride),       // %3

-    "+r"(src_width),        // %4

-    "+r"(src_height)        // %5

+  : "=&r"(src_tmp),    // %0

+    "+r"(src_ptr),     // %1

+    "+r"(dst_ptr),     // %2

+    "+r"(src_stride),  // %3

+    "+r"(src_width),   // %4

+    "+r"(src_height)   // %5

   : "memory", "cc", "w12", "v0", "v1", "v2", "v3"  // Clobber List

);

@@ -931,7 +931,7 @@

   int64 dst_width64 = (int64) dst_width;  // Work around ios 64 bit warning.

   int64 x64 = (int64) x;

   int64 dx64 = (int64) dx;

-  int64 tmp64 = 0;

+  int64 tmp64;

   asm volatile (

   "1:                                          \n"

     LOAD1_DATA32_LANE(v0, 0)

@@ -947,13 +947,13 @@

     "st1        {v0.4s, v1.4s}, [%0], #32      \n"  // store pixels

     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop

     "b.gt        1b                            \n"

-  : "+r"(dst_argb),         // %0

-    "+r"(src_argb),         // %1

-    "+r"(dst_width64),      // %2

-    "+r"(x64),              // %3

-    "+r"(dx64),             // %4

-    "+r"(tmp64),            // %5

-    "+r"(src_tmp)           // %6

+  : "+r"(dst_argb),     // %0

+    "+r"(src_argb),     // %1

+    "+r"(dst_width64),  // %2

+    "+r"(x64),          // %3

+    "+r"(dx64),         // %4

+    "=&r"(tmp64),       // %5

+    "+r"(src_tmp)       // %6

   : "memory", "cc", "v0", "v1"

);

--- a/third_party/libyuv/source/scale_win.cc

+++ b/third_party/libyuv/source/scale_win.cc

@@ -16,9 +16,8 @@

 extern "C" {

 #endif

-// This module is for Visual C x86.

-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \

-    defined(_MSC_VER) && !defined(__clang__)

+// This module is for 32 bit Visual C x86 and clangcl

+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

 // Offsets for source bytes 0 to 9

 static uvec8 kShuf0 =

@@ -96,8 +95,8 @@

 // Reads 32 pixels, throws half away and writes 16 pixels.

 __declspec(naked)

-void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,

-                        uint8* dst_ptr, int dst_width) {

+void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,

+                         uint8* dst_ptr, int dst_width) {

   __asm {

     mov        eax, [esp + 4]        // src_ptr

                                      // src_stride ignored

@@ -122,31 +121,28 @@

 // Blends 32x1 rectangle to 16x1.

 __declspec(naked)

-void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,

-                              uint8* dst_ptr, int dst_width) {

+void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,

+                               uint8* dst_ptr, int dst_width) {

   __asm {

     mov        eax, [esp + 4]        // src_ptr

                                      // src_stride

     mov        edx, [esp + 12]       // dst_ptr

     mov        ecx, [esp + 16]       // dst_width

-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff

-    psrlw      xmm5, 8

+    pcmpeqb    xmm4, xmm4            // constant 0x0101

+    psrlw      xmm4, 15

+    packuswb   xmm4, xmm4

+    pxor       xmm5, xmm5            // constant 0

   wloop:

     movdqu     xmm0, [eax]

     movdqu     xmm1, [eax + 16]

     lea        eax,  [eax + 32]

-    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)

-    psrlw      xmm0, 8

-    movdqa     xmm3, xmm1

-    psrlw      xmm1, 8

-    pand       xmm2, xmm5

-    pand       xmm3, xmm5

-    pavgw      xmm0, xmm2

-    pavgw      xmm1, xmm3

+    pmaddubsw  xmm0, xmm4      // horizontal add

+    pmaddubsw  xmm1, xmm4

+    pavgw      xmm0, xmm5      // (x + 1) / 2

+    pavgw      xmm1, xmm5

     packuswb   xmm0, xmm1

     movdqu     [edx], xmm0

     lea        edx, [edx + 16]

     sub        ecx, 16

@@ -158,8 +154,8 @@

 // Blends 32x2 rectangle to 16x1.

 __declspec(naked)

-void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,

-                           uint8* dst_ptr, int dst_width) {

+void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,

+                            uint8* dst_ptr, int dst_width) {

   __asm {

     push       esi

     mov        eax, [esp + 4 + 4]    // src_ptr

@@ -166,9 +162,12 @@

     mov        esi, [esp + 4 + 8]    // src_stride

     mov        edx, [esp + 4 + 12]   // dst_ptr

     mov        ecx, [esp + 4 + 16]   // dst_width

-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff

-    psrlw      xmm5, 8

+    pcmpeqb    xmm4, xmm4            // constant 0x0101

+    psrlw      xmm4, 15

+    packuswb   xmm4, xmm4

+    pxor       xmm5, xmm5            // constant 0

   wloop:

     movdqu     xmm0, [eax]

     movdqu     xmm1, [eax + 16]

@@ -175,19 +174,17 @@

     movdqu     xmm2, [eax + esi]

     movdqu     xmm3, [eax + esi + 16]

     lea        eax,  [eax + 32]

-    pavgb      xmm0, xmm2            // average rows

-    pavgb      xmm1, xmm3

-    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)

-    psrlw      xmm0, 8

-    movdqa     xmm3, xmm1

-    psrlw      xmm1, 8

-    pand       xmm2, xmm5

-    pand       xmm3, xmm5

-    pavgw      xmm0, xmm2

-    pavgw      xmm1, xmm3

+    pmaddubsw  xmm0, xmm4      // horizontal add

+    pmaddubsw  xmm1, xmm4

+    pmaddubsw  xmm2, xmm4

+    pmaddubsw  xmm3, xmm4

+    paddw      xmm0, xmm2      // vertical add

+    paddw      xmm1, xmm3

+    psrlw      xmm0, 1

+    psrlw      xmm1, 1

+    pavgw      xmm0, xmm5      // (x + 1) / 2

+    pavgw      xmm1, xmm5

     packuswb   xmm0, xmm1

     movdqu     [edx], xmm0

     lea        edx, [edx + 16]

     sub        ecx, 16

@@ -246,14 +243,12 @@

     vmovdqu     ymm0, [eax]

     vmovdqu     ymm1, [eax + 32]

     lea         eax,  [eax + 64]

-    vpmaddubsw  ymm0, ymm0, ymm4      // average horizontally

+    vpmaddubsw  ymm0, ymm0, ymm4      // horizontal add

     vpmaddubsw  ymm1, ymm1, ymm4

     vpavgw      ymm0, ymm0, ymm5      // (x + 1) / 2

     vpavgw      ymm1, ymm1, ymm5

     vpackuswb   ymm0, ymm0, ymm1

     vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb

     vmovdqu     [edx], ymm0

     lea         edx, [edx + 32]

     sub         ecx, 32

@@ -264,6 +259,8 @@

+// For rounding, average = (sum + 2) / 4

+// becomes average((sum >> 1), 0)

 // Blends 64x2 rectangle to 32x1.

 __declspec(naked)

 void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,

@@ -281,19 +278,23 @@

     vpxor       ymm5, ymm5, ymm5      // constant 0

   wloop:

-    vmovdqu     ymm0, [eax]           // average rows

+    vmovdqu     ymm0, [eax]

     vmovdqu     ymm1, [eax + 32]

-    vpavgb      ymm0, ymm0, [eax + esi]

-    vpavgb      ymm1, ymm1, [eax + esi + 32]

+    vmovdqu     ymm2, [eax + esi]

+    vmovdqu     ymm3, [eax + esi + 32]

     lea         eax,  [eax + 64]

-    vpmaddubsw  ymm0, ymm0, ymm4      // average horizontally

+    vpmaddubsw  ymm0, ymm0, ymm4      // horizontal add

     vpmaddubsw  ymm1, ymm1, ymm4

+    vpmaddubsw  ymm2, ymm2, ymm4

+    vpmaddubsw  ymm3, ymm3, ymm4

+    vpaddw      ymm0, ymm0, ymm2      // vertical add

+    vpaddw      ymm1, ymm1, ymm3

+    vpsrlw      ymm0, ymm0, 1         // (x + 2) / 4 = (x / 2 + 1) / 2

+    vpsrlw      ymm1, ymm1, 1

     vpavgw      ymm0, ymm0, ymm5      // (x + 1) / 2

     vpavgw      ymm1, ymm1, ymm5

     vpackuswb   ymm0, ymm0, ymm1

     vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb

     vmovdqu     [edx], ymm0

     lea         edx, [edx + 32]

     sub         ecx, 32

@@ -308,7 +309,7 @@

 // Point samples 32 pixels to 8 pixels.

 __declspec(naked)

-void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,

+void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,

                         uint8* dst_ptr, int dst_width) {

   __asm {

     mov        eax, [esp + 4]        // src_ptr

@@ -339,7 +340,7 @@

 // Blends 32x4 rectangle to 8x1.

 __declspec(naked)

-void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,

+void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,

                            uint8* dst_ptr, int dst_width) {

   __asm {

     push       esi

@@ -349,8 +350,11 @@

     mov        edx, [esp + 8 + 12]   // dst_ptr

     mov        ecx, [esp + 8 + 16]   // dst_width

     lea        edi, [esi + esi * 2]  // src_stride * 3

-    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff

-    psrlw      xmm7, 8

+    pcmpeqb    xmm4, xmm4            // constant 0x0101

+    psrlw      xmm4, 15

+    movdqa     xmm5, xmm4

+    packuswb   xmm4, xmm4

+    psllw      xmm5, 3               // constant 0x0008

   wloop:

     movdqu     xmm0, [eax]           // average rows

@@ -357,34 +361,29 @@

     movdqu     xmm1, [eax + 16]

     movdqu     xmm2, [eax + esi]

     movdqu     xmm3, [eax + esi + 16]

-    pavgb      xmm0, xmm2

-    pavgb      xmm1, xmm3

+    pmaddubsw  xmm0, xmm4      // horizontal add

+    pmaddubsw  xmm1, xmm4

+    pmaddubsw  xmm2, xmm4

+    pmaddubsw  xmm3, xmm4

+    paddw      xmm0, xmm2      // vertical add rows 0, 1

+    paddw      xmm1, xmm3

     movdqu     xmm2, [eax + esi * 2]

     movdqu     xmm3, [eax + esi * 2 + 16]

-    movdqu     xmm4, [eax + edi]

-    movdqu     xmm5, [eax + edi + 16]

+    pmaddubsw  xmm2, xmm4

+    pmaddubsw  xmm3, xmm4

+    paddw      xmm0, xmm2      // add row 2

+    paddw      xmm1, xmm3

+    movdqu     xmm2, [eax + edi]

+    movdqu     xmm3, [eax + edi + 16]

     lea        eax, [eax + 32]

-    pavgb      xmm2, xmm4

-    pavgb      xmm3, xmm5

-    pavgb      xmm0, xmm2

-    pavgb      xmm1, xmm3

-    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)

-    psrlw      xmm0, 8

-    movdqa     xmm3, xmm1

-    psrlw      xmm1, 8

-    pand       xmm2, xmm7

-    pand       xmm3, xmm7

-    pavgw      xmm0, xmm2

-    pavgw      xmm1, xmm3

-    packuswb   xmm0, xmm1

-    movdqa     xmm2, xmm0            // average columns (16 to 8 pixels)

-    psrlw      xmm0, 8

-    pand       xmm2, xmm7

-    pavgw      xmm0, xmm2

+    pmaddubsw  xmm2, xmm4

+    pmaddubsw  xmm3, xmm4

+    paddw      xmm0, xmm2      // add row 3

+    paddw      xmm1, xmm3

+    phaddw     xmm0, xmm1

+    paddw      xmm0, xmm5      // + 8 for round

+    psrlw      xmm0, 4         // /16 for average of 4 * 4

     packuswb   xmm0, xmm0

     movq       qword ptr [edx], xmm0

     lea        edx, [edx + 8]

     sub        ecx, 8

@@ -443,37 +442,41 @@

     mov         edx, [esp + 8 + 12]   // dst_ptr

     mov         ecx, [esp + 8 + 16]   // dst_width

     lea         edi, [esi + esi * 2]  // src_stride * 3

-    vpcmpeqb    ymm7, ymm7, ymm7      // generate mask 0x00ff00ff

-    vpsrlw      ymm7, ymm7, 8

+    vpcmpeqb    ymm4, ymm4, ymm4            // constant 0x0101

+    vpsrlw      ymm4, ymm4, 15

+    vpsllw      ymm5, ymm4, 3               // constant 0x0008

+    vpackuswb   ymm4, ymm4, ymm4

   wloop:

     vmovdqu     ymm0, [eax]           // average rows

     vmovdqu     ymm1, [eax + 32]

-    vpavgb      ymm0, ymm0, [eax + esi]

-    vpavgb      ymm1, ymm1, [eax + esi + 32]

+    vmovdqu     ymm2, [eax + esi]

+    vmovdqu     ymm3, [eax + esi + 32]

+    vpmaddubsw  ymm0, ymm0, ymm4      // horizontal add

+    vpmaddubsw  ymm1, ymm1, ymm4

+    vpmaddubsw  ymm2, ymm2, ymm4

+    vpmaddubsw  ymm3, ymm3, ymm4

+    vpaddw      ymm0, ymm0, ymm2      // vertical add rows 0, 1

+    vpaddw      ymm1, ymm1, ymm3

     vmovdqu     ymm2, [eax + esi * 2]

     vmovdqu     ymm3, [eax + esi * 2 + 32]

-    vpavgb      ymm2, ymm2, [eax + edi]

-    vpavgb      ymm3, ymm3, [eax + edi + 32]

-    lea         eax, [eax + 64]

-    vpavgb      ymm0, ymm0, ymm2

-    vpavgb      ymm1, ymm1, ymm3

-    vpand       ymm2, ymm0, ymm7      // average columns (64 to 32 pixels)

-    vpand       ymm3, ymm1, ymm7

-    vpsrlw      ymm0, ymm0, 8

-    vpsrlw      ymm1, ymm1, 8

-    vpavgw      ymm0, ymm0, ymm2

-    vpavgw      ymm1, ymm1, ymm3

-    vpackuswb   ymm0, ymm0, ymm1

-    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb

-    vpand       ymm2, ymm0, ymm7      // average columns (32 to 16 pixels)

-    vpsrlw      ymm0, ymm0, 8

-    vpavgw      ymm0, ymm0, ymm2

+    vpmaddubsw  ymm2, ymm2, ymm4

+    vpmaddubsw  ymm3, ymm3, ymm4

+    vpaddw      ymm0, ymm0, ymm2      // add row 2

+    vpaddw      ymm1, ymm1, ymm3

+    vmovdqu     ymm2, [eax + edi]

+    vmovdqu     ymm3, [eax + edi + 32]

+    lea         eax,  [eax + 64]

+    vpmaddubsw  ymm2, ymm2, ymm4

+    vpmaddubsw  ymm3, ymm3, ymm4

+    vpaddw      ymm0, ymm0, ymm2      // add row 3

+    vpaddw      ymm1, ymm1, ymm3

+    vphaddw     ymm0, ymm0, ymm1      // mutates

+    vpermq      ymm0, ymm0, 0xd8      // unmutate vphaddw

+    vpaddw      ymm0, ymm0, ymm5      // + 8 for round

+    vpsrlw      ymm0, ymm0, 4         // /16 for average of 4 * 4

     vpackuswb   ymm0, ymm0, ymm0

     vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb

     vmovdqu     [edx], xmm0

     lea         edx, [edx + 16]

     sub         ecx, 16

@@ -499,9 +502,9 @@

                                      // src_stride ignored

     mov        edx, [esp + 12]       // dst_ptr

     mov        ecx, [esp + 16]       // dst_width

-    movdqa     xmm3, kShuf0

-    movdqa     xmm4, kShuf1

-    movdqa     xmm5, kShuf2

+    movdqa     xmm3, xmmword ptr kShuf0

+    movdqa     xmm4, xmmword ptr kShuf1

+    movdqa     xmm5, xmmword ptr kShuf2

   wloop:

     movdqu     xmm0, [eax]

@@ -548,12 +551,12 @@

     mov        esi, [esp + 4 + 8]    // src_stride

     mov        edx, [esp + 4 + 12]   // dst_ptr

     mov        ecx, [esp + 4 + 16]   // dst_width

-    movdqa     xmm2, kShuf01

-    movdqa     xmm3, kShuf11

-    movdqa     xmm4, kShuf21

-    movdqa     xmm5, kMadd01

-    movdqa     xmm6, kMadd11

-    movdqa     xmm7, kRound34

+    movdqa     xmm2, xmmword ptr kShuf01

+    movdqa     xmm3, xmmword ptr kShuf11

+    movdqa     xmm4, xmmword ptr kShuf21

+    movdqa     xmm5, xmmword ptr kMadd01

+    movdqa     xmm6, xmmword ptr kMadd11

+    movdqa     xmm7, xmmword ptr kRound34

   wloop:

     movdqu     xmm0, [eax]           // pixels 0..7

@@ -579,7 +582,7 @@

     lea        eax, [eax + 32]

     pavgb      xmm0, xmm1

     pshufb     xmm0, xmm4

-    movdqa     xmm1, kMadd21

+    movdqa     xmm1, xmmword ptr kMadd21

     pmaddubsw  xmm0, xmm1

     paddsw     xmm0, xmm7

     psrlw      xmm0, 2

@@ -605,12 +608,12 @@

     mov        esi, [esp + 4 + 8]    // src_stride

     mov        edx, [esp + 4 + 12]   // dst_ptr

     mov        ecx, [esp + 4 + 16]   // dst_width

-    movdqa     xmm2, kShuf01

-    movdqa     xmm3, kShuf11

-    movdqa     xmm4, kShuf21

-    movdqa     xmm5, kMadd01

-    movdqa     xmm6, kMadd11

-    movdqa     xmm7, kRound34

+    movdqa     xmm2, xmmword ptr kShuf01

+    movdqa     xmm3, xmmword ptr kShuf11

+    movdqa     xmm4, xmmword ptr kShuf21

+    movdqa     xmm5, xmmword ptr kMadd01

+    movdqa     xmm6, xmmword ptr kMadd11

+    movdqa     xmm7, xmmword ptr kRound34

   wloop:

     movdqu     xmm0, [eax]           // pixels 0..7

@@ -639,7 +642,7 @@

     pavgb      xmm1, xmm0

     pavgb      xmm0, xmm1

     pshufb     xmm0, xmm4

-    movdqa     xmm1, kMadd21

+    movdqa     xmm1, xmmword ptr kMadd21

     pmaddubsw  xmm0, xmm1

     paddsw     xmm0, xmm7

     psrlw      xmm0, 2

@@ -665,8 +668,8 @@

                                      // src_stride ignored

     mov        edx, [esp + 12]       // dst_ptr

     mov        ecx, [esp + 16]       // dst_width

-    movdqa     xmm4, kShuf38a

-    movdqa     xmm5, kShuf38b

+    movdqa     xmm4, xmmword ptr kShuf38a

+    movdqa     xmm5, xmmword ptr kShuf38b

   xloop:

     movdqu     xmm0, [eax]           // 16 pixels -> 0,1,2,3,4,5

@@ -698,9 +701,9 @@

     mov        esi, [esp + 4 + 8]    // src_stride

     mov        edx, [esp + 4 + 12]   // dst_ptr

     mov        ecx, [esp + 4 + 16]   // dst_width

-    movdqa     xmm2, kShufAc

-    movdqa     xmm3, kShufAc3

-    movdqa     xmm4, kScaleAc33

+    movdqa     xmm2, xmmword ptr kShufAc

+    movdqa     xmm3, xmmword ptr kShufAc3

+    movdqa     xmm4, xmmword ptr kScaleAc33

     pxor       xmm5, xmm5

   xloop:

@@ -763,10 +766,10 @@

     mov        esi, [esp + 4 + 8]    // src_stride

     mov        edx, [esp + 4 + 12]   // dst_ptr

     mov        ecx, [esp + 4 + 16]   // dst_width

-    movdqa     xmm2, kShufAb0

-    movdqa     xmm3, kShufAb1

-    movdqa     xmm4, kShufAb2

-    movdqa     xmm5, kScaleAb2

+    movdqa     xmm2, xmmword ptr kShufAb0

+    movdqa     xmm3, xmmword ptr kShufAb1

+    movdqa     xmm4, xmmword ptr kShufAb2

+    movdqa     xmm5, xmmword ptr kScaleAb2

   xloop:

     movdqu     xmm0, [eax]           // average 2 rows into xmm0

@@ -857,6 +860,16 @@

 #endif  // HAS_SCALEADDROW_AVX2

+// Constant for making pixels signed to avoid pmaddubsw

+// saturation.

+static uvec8 kFsub80 =

+  { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };

+// Constant for making pixels unsigned and adding .5 for rounding.

+static uvec16 kFadd40 =

+  { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 };

 // Bilinear column filtering. SSSE3 version.

 __declspec(naked)

 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,

@@ -874,6 +887,8 @@

     movd       xmm5, eax

     pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.

     psrlw      xmm6, 9

+    pcmpeqb    xmm7, xmm7           // generate 0x0001

+    psrlw      xmm7, 15

     pextrw     eax, xmm2, 1         // get x0 integer. preroll

     sub        ecx, 2

     jl         xloop29

@@ -896,13 +911,16 @@

     movd       xmm4, ebx

     pshufb     xmm1, xmm5           // 0011

     punpcklwd  xmm0, xmm4

+    psubb      xmm0, xmmword ptr kFsub80  // make pixels signed.

     pxor       xmm1, xmm6           // 0..7f and 7f..0

-    pmaddubsw  xmm0, xmm1           // 16 bit, 2 pixels.

+    paddusb    xmm1, xmm7           // +1 so 0..7f and 80..1

+    pmaddubsw  xmm1, xmm0           // 16 bit, 2 pixels.

     pextrw     eax, xmm2, 1         // get x0 integer. next iteration.

     pextrw     edx, xmm2, 3         // get x1 integer. next iteration.

-    psrlw      xmm0, 7              // 8.7 fixed point to low 8 bits.

-    packuswb   xmm0, xmm0           // 8 bits, 2 pixels.

-    movd       ebx, xmm0

+    paddw      xmm1, xmmword ptr kFadd40  // make pixels unsigned and round.

+    psrlw      xmm1, 7              // 8.7 fixed point to low 8 bits.

+    packuswb   xmm1, xmm1           // 8 bits, 2 pixels.

+    movd       ebx, xmm1

     mov        [edi], bx

     lea        edi, [edi + 2]

     sub        ecx, 2               // 2 pixels

@@ -909,7 +927,6 @@

     jge        xloop2

  xloop29:

     add        ecx, 2 - 1

     jl         xloop99

@@ -918,11 +935,14 @@

     movd       xmm0, ebx

     psrlw      xmm2, 9              // 7 bit fractions.

     pshufb     xmm2, xmm5           // 0011

+    psubb      xmm0, xmmword ptr kFsub80  // make pixels signed.

     pxor       xmm2, xmm6           // 0..7f and 7f..0

-    pmaddubsw  xmm0, xmm2           // 16 bit

-    psrlw      xmm0, 7              // 8.7 fixed point to low 8 bits.

-    packuswb   xmm0, xmm0           // 8 bits

-    movd       ebx, xmm0

+    paddusb    xmm2, xmm7           // +1 so 0..7f and 80..1

+    pmaddubsw  xmm2, xmm0           // 16 bit

+    paddw      xmm2, xmmword ptr kFadd40  // make pixels unsigned and round.

+    psrlw      xmm2, 7              // 8.7 fixed point to low 8 bits.

+    packuswb   xmm2, xmm2           // 8 bits

+    movd       ebx, xmm2

     mov        [edi], bl

  xloop99:

@@ -1233,8 +1253,8 @@

     mov        ecx, [esp + 8 + 12]   // dst_width

     movd       xmm2, [esp + 8 + 16]  // x

     movd       xmm3, [esp + 8 + 20]  // dx

-    movdqa     xmm4, kShuffleColARGB

-    movdqa     xmm5, kShuffleFractions

+    movdqa     xmm4, xmmword ptr kShuffleColARGB

+    movdqa     xmm5, xmmword ptr kShuffleFractions

     pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.

     psrlw      xmm6, 9

     pextrw     eax, xmm2, 1         // get x0 integer. preroll

--- a/third_party/libyuv/source/video_common.cc

+++ b/third_party/libyuv/source/video_common.cc

@@ -25,6 +25,7 @@

 static const struct FourCCAliasEntry kFourCCAliases[] = {

   {FOURCC_IYUV, FOURCC_I420},

+  {FOURCC_YU12, FOURCC_I420},

   {FOURCC_YU16, FOURCC_I422},

   {FOURCC_YU24, FOURCC_I444},

   {FOURCC_YUYV, FOURCC_YUY2},

--- a/third_party/libyuv/source/x86inc.asm

+++ /dev/null

@@ -1,1136 +1,0 @@

-;*****************************************************************************

-;* x86inc.asm: x264asm abstraction layer

-;*****************************************************************************

-;* Copyright (C) 2005-2012 x264 project

-;*

-;* Authors: Loren Merritt <[email protected]>

-;*          Anton Mitrofanov <[email protected]>

-;*          Jason Garrett-Glaser <[email protected]>

-;*          Henrik Gramner <[email protected]>

-;*

-;* Permission to use, copy, modify, and/or distribute this software for any

-;* purpose with or without fee is hereby granted, provided that the above

-;* copyright notice and this permission notice appear in all copies.

-;*

-;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES

-;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF

-;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR

-;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES

-;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN

-;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF

-;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

-;*****************************************************************************

-; This is a header file for the x264ASM assembly language, which uses

-; NASM/YASM syntax combined with a large number of macros to provide easy

-; abstraction between different calling conventions (x86_32, win64, linux64).

-; It also has various other useful features to simplify writing the kind of

-; DSP functions that are most often used in x264.

-; Unlike the rest of x264, this file is available under an ISC license, as it

-; has significant usefulness outside of x264 and we want it to be available

-; to the largest audience possible.  Of course, if you modify it for your own

-; purposes to add a new feature, we strongly encourage contributing a patch

-; as this feature might be useful for others as well.  Send patches or ideas

-; to [email protected] .

-; Local changes for libyuv:

-; remove %define program_name and references in labels

-; rename cpus to uppercase

-%define WIN64  0

-%define UNIX64 0

-%if ARCH_X86_64

-    %ifidn __OUTPUT_FORMAT__,win32

-        %define WIN64  1

-    %elifidn __OUTPUT_FORMAT__,win64

-        %define WIN64  1

-    %else

-        %define UNIX64 1

-    %endif

-%endif

-%ifdef PREFIX

-    %define mangle(x) _ %+ x

-%else

-    %define mangle(x) x

-%endif

-; Name of the .rodata section.

-; Kludge: Something on OS X fails to align .rodata even given an align attribute,

-; so use a different read-only section.

-%macro SECTION_RODATA 0-1 16

-    %ifidn __OUTPUT_FORMAT__,macho64

-        SECTION .text align=%1

-    %elifidn __OUTPUT_FORMAT__,macho

-        SECTION .text align=%1

-        fakegot:

-    %elifidn __OUTPUT_FORMAT__,aout

-        section .text

-    %else

-        SECTION .rodata align=%1

-    %endif

-%endmacro

-; aout does not support align=

-%macro SECTION_TEXT 0-1 16

-    %ifidn __OUTPUT_FORMAT__,aout

-        SECTION .text

-    %else

-        SECTION .text align=%1

-    %endif

-%endmacro

-%if WIN64

-    %define PIC

-%elif ARCH_X86_64 == 0

-; x86_32 doesn't require PIC.

-; Some distros prefer shared objects to be PIC, but nothing breaks if

-; the code contains a few textrels, so we'll skip that complexity.

-    %undef PIC

-%endif

-%ifdef PIC

-    default rel

-%endif

-; Always use long nops (reduces 0x90 spam in disassembly on x86_32)

-CPU amdnop

-; Macros to eliminate most code duplication between x86_32 and x86_64:

-; Currently this works only for leaf functions which load all their arguments

-; into registers at the start, and make no other use of the stack. Luckily that

-; covers most of x264's asm.

-; PROLOGUE:

-; %1 = number of arguments. loads them from stack if needed.

-; %2 = number of registers used. pushes callee-saved regs if needed.

-; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.

-; %4 = list of names to define to registers

-; PROLOGUE can also be invoked by adding the same options to cglobal

-; e.g.

-; cglobal foo, 2,3,0, dst, src, tmp

-; declares a function (foo), taking two args (dst and src) and one local variable (tmp)

-; TODO Some functions can use some args directly from the stack. If they're the

-; last args then you can just not declare them, but if they're in the middle

-; we need more flexible macro.

-; RET:

-; Pops anything that was pushed by PROLOGUE, and returns.

-; REP_RET:

-; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons

-; which are slow when a normal ret follows a branch.

-; registers:

-; rN and rNq are the native-size register holding function argument N

-; rNd, rNw, rNb are dword, word, and byte size

-; rNh is the high 8 bits of the word size

-; rNm is the original location of arg N (a register or on the stack), dword

-; rNmp is native size

-%macro DECLARE_REG 2-3

-    %define r%1q %2

-    %define r%1d %2d

-    %define r%1w %2w

-    %define r%1b %2b

-    %define r%1h %2h

-    %if %0 == 2

-        %define r%1m  %2d

-        %define r%1mp %2

-    %elif ARCH_X86_64 ; memory

-        %define r%1m [rsp + stack_offset + %3]

-        %define r%1mp qword r %+ %1m

-    %else

-        %define r%1m [esp + stack_offset + %3]

-        %define r%1mp dword r %+ %1m

-    %endif

-    %define r%1  %2

-%endmacro

-%macro DECLARE_REG_SIZE 3

-    %define r%1q r%1

-    %define e%1q r%1

-    %define r%1d e%1

-    %define e%1d e%1

-    %define r%1w %1

-    %define e%1w %1

-    %define r%1h %3

-    %define e%1h %3

-    %define r%1b %2

-    %define e%1b %2

-%if ARCH_X86_64 == 0

-    %define r%1  e%1

-%endif

-%endmacro

-DECLARE_REG_SIZE ax, al, ah

-DECLARE_REG_SIZE bx, bl, bh

-DECLARE_REG_SIZE cx, cl, ch

-DECLARE_REG_SIZE dx, dl, dh

-DECLARE_REG_SIZE si, sil, null

-DECLARE_REG_SIZE di, dil, null

-DECLARE_REG_SIZE bp, bpl, null

-; t# defines for when per-arch register allocation is more complex than just function arguments

-%macro DECLARE_REG_TMP 1-*

-    %assign %%i 0

-    %rep %0

-        CAT_XDEFINE t, %%i, r%1

-        %assign %%i %%i+1

-        %rotate 1

-    %endrep

-%endmacro

-%macro DECLARE_REG_TMP_SIZE 0-*

-    %rep %0

-        %define t%1q t%1 %+ q

-        %define t%1d t%1 %+ d

-        %define t%1w t%1 %+ w

-        %define t%1h t%1 %+ h

-        %define t%1b t%1 %+ b

-        %rotate 1

-    %endrep

-%endmacro

-DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14

-%if ARCH_X86_64

-    %define gprsize 8

-%else

-    %define gprsize 4

-%endif

-%macro PUSH 1

-    push %1

-    %assign stack_offset stack_offset+gprsize

-%endmacro

-%macro POP 1

-    pop %1

-    %assign stack_offset stack_offset-gprsize

-%endmacro

-%macro PUSH_IF_USED 1-*

-    %rep %0

-        %if %1 < regs_used

-            PUSH r%1

-        %endif

-        %rotate 1

-    %endrep

-%endmacro

-%macro POP_IF_USED 1-*

-    %rep %0

-        %if %1 < regs_used

-            pop r%1

-        %endif

-        %rotate 1

-    %endrep

-%endmacro

-%macro LOAD_IF_USED 1-*

-    %rep %0

-        %if %1 < num_args

-            mov r%1, r %+ %1 %+ mp

-        %endif

-        %rotate 1

-    %endrep

-%endmacro

-%macro SUB 2

-    sub %1, %2

-    %ifidn %1, rsp

-        %assign stack_offset stack_offset+(%2)

-    %endif

-%endmacro

-%macro ADD 2

-    add %1, %2

-    %ifidn %1, rsp

-        %assign stack_offset stack_offset-(%2)

-    %endif

-%endmacro

-%macro movifnidn 2

-    %ifnidn %1, %2

-        mov %1, %2

-    %endif

-%endmacro

-%macro movsxdifnidn 2

-    %ifnidn %1, %2

-        movsxd %1, %2

-    %endif

-%endmacro

-%macro ASSERT 1

-    %if (%1) == 0

-        %error assert failed

-    %endif

-%endmacro

-%macro DEFINE_ARGS 0-*

-    %ifdef n_arg_names

-        %assign %%i 0

-        %rep n_arg_names

-            CAT_UNDEF arg_name %+ %%i, q

-            CAT_UNDEF arg_name %+ %%i, d

-            CAT_UNDEF arg_name %+ %%i, w

-            CAT_UNDEF arg_name %+ %%i, h

-            CAT_UNDEF arg_name %+ %%i, b

-            CAT_UNDEF arg_name %+ %%i, m

-            CAT_UNDEF arg_name %+ %%i, mp

-            CAT_UNDEF arg_name, %%i

-            %assign %%i %%i+1

-        %endrep

-    %endif

-    %xdefine %%stack_offset stack_offset

-    %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine

-    %assign %%i 0

-    %rep %0

-        %xdefine %1q r %+ %%i %+ q

-        %xdefine %1d r %+ %%i %+ d

-        %xdefine %1w r %+ %%i %+ w

-        %xdefine %1h r %+ %%i %+ h

-        %xdefine %1b r %+ %%i %+ b

-        %xdefine %1m r %+ %%i %+ m

-        %xdefine %1mp r %+ %%i %+ mp

-        CAT_XDEFINE arg_name, %%i, %1

-        %assign %%i %%i+1

-        %rotate 1

-    %endrep

-    %xdefine stack_offset %%stack_offset

-    %assign n_arg_names %0

-%endmacro

-%if WIN64 ; Windows x64 ;=================================================

-DECLARE_REG 0,  rcx

-DECLARE_REG 1,  rdx

-DECLARE_REG 2,  R8

-DECLARE_REG 3,  R9

-DECLARE_REG 4,  R10, 40

-DECLARE_REG 5,  R11, 48

-DECLARE_REG 6,  rax, 56

-DECLARE_REG 7,  rdi, 64

-DECLARE_REG 8,  rsi, 72

-DECLARE_REG 9,  rbx, 80

-DECLARE_REG 10, rbp, 88

-DECLARE_REG 11, R12, 96

-DECLARE_REG 12, R13, 104

-DECLARE_REG 13, R14, 112

-DECLARE_REG 14, R15, 120

-%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...

-    %assign num_args %1

-    %assign regs_used %2

-    ASSERT regs_used >= num_args

-    ASSERT regs_used <= 15

-    PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14

-    %if mmsize == 8

-        %assign xmm_regs_used 0

-    %else

-        WIN64_SPILL_XMM %3

-    %endif

-    LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14

-    DEFINE_ARGS %4

-%endmacro

-%macro WIN64_SPILL_XMM 1

-    %assign xmm_regs_used %1

-    ASSERT xmm_regs_used <= 16

-    %if xmm_regs_used > 6

-        SUB rsp, (xmm_regs_used-6)*16+16

-        %assign %%i xmm_regs_used

-        %rep (xmm_regs_used-6)

-            %assign %%i %%i-1

-            movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i

-        %endrep

-    %endif

-%endmacro

-%macro WIN64_RESTORE_XMM_INTERNAL 1

-    %if xmm_regs_used > 6

-        %assign %%i xmm_regs_used

-        %rep (xmm_regs_used-6)

-            %assign %%i %%i-1

-            movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)]

-        %endrep

-        add %1, (xmm_regs_used-6)*16+16

-    %endif

-%endmacro

-%macro WIN64_RESTORE_XMM 1

-    WIN64_RESTORE_XMM_INTERNAL %1

-    %assign stack_offset stack_offset-(xmm_regs_used-6)*16+16

-    %assign xmm_regs_used 0

-%endmacro

-%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32

-%macro RET 0

-    WIN64_RESTORE_XMM_INTERNAL rsp

-    POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7

-%if mmsize == 32

-    vzeroupper

-%endif

-    ret

-%endmacro

-%elif ARCH_X86_64 ; *nix x64 ;=============================================

-DECLARE_REG 0,  rdi

-DECLARE_REG 1,  rsi

-DECLARE_REG 2,  rdx

-DECLARE_REG 3,  rcx

-DECLARE_REG 4,  R8

-DECLARE_REG 5,  R9

-DECLARE_REG 6,  rax, 8

-DECLARE_REG 7,  R10, 16

-DECLARE_REG 8,  R11, 24

-DECLARE_REG 9,  rbx, 32

-DECLARE_REG 10, rbp, 40

-DECLARE_REG 11, R12, 48

-DECLARE_REG 12, R13, 56

-DECLARE_REG 13, R14, 64

-DECLARE_REG 14, R15, 72

-%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...

-    %assign num_args %1

-    %assign regs_used %2

-    ASSERT regs_used >= num_args

-    ASSERT regs_used <= 15

-    PUSH_IF_USED 9, 10, 11, 12, 13, 14

-    LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14

-    DEFINE_ARGS %4

-%endmacro

-%define has_epilogue regs_used > 9 || mmsize == 32

-%macro RET 0

-    POP_IF_USED 14, 13, 12, 11, 10, 9

-%if mmsize == 32

-    vzeroupper

-%endif

-    ret

-%endmacro

-%else ; X86_32 ;==============================================================

-DECLARE_REG 0, eax, 4

-DECLARE_REG 1, ecx, 8

-DECLARE_REG 2, edx, 12

-DECLARE_REG 3, ebx, 16

-DECLARE_REG 4, esi, 20

-DECLARE_REG 5, edi, 24

-DECLARE_REG 6, ebp, 28

-%define rsp esp

-%macro DECLARE_ARG 1-*

-    %rep %0

-        %define r%1m [esp + stack_offset + 4*%1 + 4]

-        %define r%1mp dword r%1m

-        %rotate 1

-    %endrep

-%endmacro

-DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14

-%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...

-    %assign num_args %1

-    %assign regs_used %2

-    %if regs_used > 7

-        %assign regs_used 7

-    %endif

-    ASSERT regs_used >= num_args

-    PUSH_IF_USED 3, 4, 5, 6

-    LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6

-    DEFINE_ARGS %4

-%endmacro

-%define has_epilogue regs_used > 3 || mmsize == 32

-%macro RET 0

-    POP_IF_USED 6, 5, 4, 3

-%if mmsize == 32

-    vzeroupper

-%endif

-    ret

-%endmacro

-%endif ;======================================================================

-%if WIN64 == 0

-%macro WIN64_SPILL_XMM 1

-%endmacro

-%macro WIN64_RESTORE_XMM 1

-%endmacro

-%endif

-%macro REP_RET 0

-    %if has_epilogue

-        RET

-    %else

-        rep ret

-    %endif

-%endmacro

-%macro TAIL_CALL 2 ; callee, is_nonadjacent

-    %if has_epilogue

-        call %1

-        RET

-    %elif %2

-        jmp %1

-    %endif

-%endmacro

-;=============================================================================

-; arch-independent part

-;=============================================================================

-%assign function_align 16

-; Begin a function.

-; Applies any symbol mangling needed for C linkage, and sets up a define such that

-; subsequent uses of the function name automatically refer to the mangled version.

-; Appends cpuflags to the function name if cpuflags has been specified.

-%macro cglobal 1-2+ ; name, [PROLOGUE args]

-%if %0 == 1

-    cglobal_internal %1 %+ SUFFIX

-%else

-    cglobal_internal %1 %+ SUFFIX, %2

-%endif

-%endmacro

-%macro cglobal_internal 1-2+

-    %ifndef cglobaled_%1

-        %xdefine %1 mangle(%1)

-        %xdefine %1.skip_prologue %1 %+ .skip_prologue

-        CAT_XDEFINE cglobaled_, %1, 1

-    %endif

-    %xdefine current_function %1

-    %ifidn __OUTPUT_FORMAT__,elf

-        global %1:function hidden

-    %else

-        global %1

-    %endif

-    align function_align

-    %1:

-    RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer

-    %assign stack_offset 0

-    %if %0 > 1

-        PROLOGUE %2

-    %endif

-%endmacro

-%macro cextern 1

-    %xdefine %1 mangle(%1)

-    CAT_XDEFINE cglobaled_, %1, 1

-    extern %1

-%endmacro

-; like cextern, but without the prefix

-%macro cextern_naked 1

-    %xdefine %1 mangle(%1)

-    CAT_XDEFINE cglobaled_, %1, 1

-    extern %1

-%endmacro

-%macro const 2+

-    %xdefine %1 mangle(%1)

-    global %1

-    %1: %2

-%endmacro

-; This is needed for ELF, otherwise the GNU linker assumes the stack is

-; executable by default.

-%ifidn __OUTPUT_FORMAT__,elf

-SECTION .note.GNU-stack noalloc noexec nowrite progbits

-%endif

-%ifidn __OUTPUT_FORMAT__,elf32

-section .note.GNU-stack noalloc noexec nowrite progbits

-%endif

-%ifidn __OUTPUT_FORMAT__,elf64

-section .note.GNU-stack noalloc noexec nowrite progbits

-%endif

-; cpuflags

-%assign cpuflags_MMX      (1<<0)

-%assign cpuflags_MMX2     (1<<1) | cpuflags_MMX

-%assign cpuflags_3dnow    (1<<2) | cpuflags_MMX

-%assign cpuflags_3dnow2   (1<<3) | cpuflags_3dnow

-%assign cpuflags_SSE      (1<<4) | cpuflags_MMX2

-%assign cpuflags_SSE2     (1<<5) | cpuflags_SSE

-%assign cpuflags_SSE2slow (1<<6) | cpuflags_SSE2

-%assign cpuflags_SSE3     (1<<7) | cpuflags_SSE2

-%assign cpuflags_SSSE3    (1<<8) | cpuflags_SSE3

-%assign cpuflags_SSE4     (1<<9) | cpuflags_SSSE3

-%assign cpuflags_SSE42    (1<<10)| cpuflags_SSE4

-%assign cpuflags_AVX      (1<<11)| cpuflags_SSE42

-%assign cpuflags_xop      (1<<12)| cpuflags_AVX

-%assign cpuflags_fma4     (1<<13)| cpuflags_AVX

-%assign cpuflags_AVX2     (1<<14)| cpuflags_AVX

-%assign cpuflags_fma3     (1<<15)| cpuflags_AVX

-%assign cpuflags_cache32  (1<<16)

-%assign cpuflags_cache64  (1<<17)

-%assign cpuflags_slowctz  (1<<18)

-%assign cpuflags_lzcnt    (1<<19)

-%assign cpuflags_misalign (1<<20)

-%assign cpuflags_aligned  (1<<21) ; not a cpu feature, but a function variant

-%assign cpuflags_atom     (1<<22)

-%assign cpuflags_bmi1     (1<<23)

-%assign cpuflags_bmi2     (1<<24)|cpuflags_bmi1

-%assign cpuflags_tbm      (1<<25)|cpuflags_bmi1

-%define    cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))

-%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))

-; Takes up to 2 cpuflags from the above list.

-; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu.

-; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.

-%macro INIT_CPUFLAGS 0-2

-    %if %0 >= 1

-        %xdefine cpuname %1

-        %assign cpuflags cpuflags_%1

-        %if %0 >= 2

-            %xdefine cpuname %1_%2

-            %assign cpuflags cpuflags | cpuflags_%2

-        %endif

-        %xdefine SUFFIX _ %+ cpuname

-        %if cpuflag(AVX)

-            %assign AVX_enabled 1

-        %endif

-        %if mmsize == 16 && notcpuflag(SSE2)

-            %define mova movaps

-            %define movu movups

-            %define movnta movntps

-        %endif

-        %if cpuflag(aligned)

-            %define movu mova

-        %elifidn %1, SSE3

-            %define movu lddqu

-        %endif

-    %else

-        %xdefine SUFFIX

-        %undef cpuname

-        %undef cpuflags

-    %endif

-%endmacro

-; merge MMX and SSE*

-%macro CAT_XDEFINE 3

-    %xdefine %1%2 %3

-%endmacro

-%macro CAT_UNDEF 2

-    %undef %1%2

-%endmacro

-%macro INIT_MMX 0-1+

-    %assign AVX_enabled 0

-    %define RESET_MM_PERMUTATION INIT_MMX %1

-    %define mmsize 8

-    %define num_mmregs 8

-    %define mova movq

-    %define movu movq

-    %define movh movd

-    %define movnta movntq

-    %assign %%i 0

-    %rep 8

-    CAT_XDEFINE m, %%i, mm %+ %%i

-    CAT_XDEFINE nmm, %%i, %%i

-    %assign %%i %%i+1

-    %endrep

-    %rep 8

-    CAT_UNDEF m, %%i

-    CAT_UNDEF nmm, %%i

-    %assign %%i %%i+1

-    %endrep

-    INIT_CPUFLAGS %1

-%endmacro

-%macro INIT_XMM 0-1+

-    %assign AVX_enabled 0

-    %define RESET_MM_PERMUTATION INIT_XMM %1

-    %define mmsize 16

-    %define num_mmregs 8

-    %if ARCH_X86_64

-    %define num_mmregs 16

-    %endif

-    %define mova movdqa

-    %define movu movdqu

-    %define movh movq

-    %define movnta movntdq

-    %assign %%i 0

-    %rep num_mmregs

-    CAT_XDEFINE m, %%i, xmm %+ %%i

-    CAT_XDEFINE nxmm, %%i, %%i

-    %assign %%i %%i+1

-    %endrep

-    INIT_CPUFLAGS %1

-%endmacro

-%macro INIT_YMM 0-1+

-    %assign AVX_enabled 1

-    %define RESET_MM_PERMUTATION INIT_YMM %1

-    %define mmsize 32

-    %define num_mmregs 8

-    %if ARCH_X86_64

-    %define num_mmregs 16

-    %endif

-    %define mova vmovaps

-    %define movu vmovups

-    %undef movh

-    %define movnta vmovntps

-    %assign %%i 0

-    %rep num_mmregs

-    CAT_XDEFINE m, %%i, ymm %+ %%i

-    CAT_XDEFINE nymm, %%i, %%i

-    %assign %%i %%i+1

-    %endrep

-    INIT_CPUFLAGS %1

-%endmacro

-INIT_XMM

-; I often want to use macros that permute their arguments. e.g. there's no

-; efficient way to implement butterfly or transpose or dct without swapping some

-; arguments.

-;

-; I would like to not have to manually keep track of the permutations:

-; If I insert a permutation in the middle of a function, it should automatically

-; change everything that follows. For more complex macros I may also have multiple

-; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.

-;

-; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that

-; permutes its arguments. It's equivalent to exchanging the contents of the

-; registers, except that this way you exchange the register names instead, so it

-; doesn't cost any cycles.

-%macro PERMUTE 2-* ; takes a list of pairs to swap

-%rep %0/2

-    %xdefine tmp%2 m%2

-    %xdefine ntmp%2 nm%2

-    %rotate 2

-%endrep

-%rep %0/2

-    %xdefine m%1 tmp%2

-    %xdefine nm%1 ntmp%2

-    %undef tmp%2

-    %undef ntmp%2

-    %rotate 2

-%endrep

-%endmacro

-%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)

-%rep %0-1

-%ifdef m%1

-    %xdefine tmp m%1

-    %xdefine m%1 m%2

-    %xdefine m%2 tmp

-    CAT_XDEFINE n, m%1, %1

-    CAT_XDEFINE n, m%2, %2

-%else

-    ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here.

-    ; Be careful using this mode in nested macros though, as in some cases there may be

-    ; other copies of m# that have already been dereferenced and don't get updated correctly.

-    %xdefine %%n1 n %+ %1

-    %xdefine %%n2 n %+ %2

-    %xdefine tmp m %+ %%n1

-    CAT_XDEFINE m, %%n1, m %+ %%n2

-    CAT_XDEFINE m, %%n2, tmp

-    CAT_XDEFINE n, m %+ %%n1, %%n1

-    CAT_XDEFINE n, m %+ %%n2, %%n2

-%endif

-    %undef tmp

-    %rotate 1

-%endrep

-%endmacro

-; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later

-; calls to that function will automatically load the permutation, so values can

-; be returned in mmregs.

-%macro SAVE_MM_PERMUTATION 0-1

-    %if %0

-        %xdefine %%f %1_m

-    %else

-        %xdefine %%f current_function %+ _m

-    %endif

-    %assign %%i 0

-    %rep num_mmregs

-        CAT_XDEFINE %%f, %%i, m %+ %%i

-    %assign %%i %%i+1

-    %endrep

-%endmacro

-%macro LOAD_MM_PERMUTATION 1 ; name to load from

-    %ifdef %1_m0

-        %assign %%i 0

-        %rep num_mmregs

-            CAT_XDEFINE m, %%i, %1_m %+ %%i

-            CAT_XDEFINE n, m %+ %%i, %%i

-        %assign %%i %%i+1

-        %endrep

-    %endif

-%endmacro

-; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't

-%macro call 1

-    call_internal %1, %1 %+ SUFFIX

-%endmacro

-%macro call_internal 2

-    %xdefine %%i %1

-    %ifndef cglobaled_%1

-        %ifdef cglobaled_%2

-            %xdefine %%i %2

-        %endif

-    %endif

-    call %%i

-    LOAD_MM_PERMUTATION %%i

-%endmacro

-; Substitutions that reduce instruction size but are functionally equivalent

-%macro add 2

-    %ifnum %2

-        %if %2==128

-            sub %1, -128

-        %else

-            add %1, %2

-        %endif

-    %else

-        add %1, %2

-    %endif

-%endmacro

-%macro sub 2

-    %ifnum %2

-        %if %2==128

-            add %1, -128

-        %else

-            sub %1, %2

-        %endif

-    %else

-        sub %1, %2

-    %endif

-%endmacro

-;=============================================================================

-; AVX abstraction layer

-;=============================================================================

-%assign i 0

-%rep 16

-    %if i < 8

-        CAT_XDEFINE sizeofmm, i, 8

-    %endif

-    CAT_XDEFINE sizeofxmm, i, 16

-    CAT_XDEFINE sizeofymm, i, 32

-%assign i i+1

-%endrep

-%undef i

-%macro CHECK_AVX_INSTR_EMU 3-*

-    %xdefine %%opcode %1

-    %xdefine %%dst %2

-    %rep %0-2

-        %ifidn %%dst, %3

-            %error non-AVX emulation of ``%%opcode'' is not supported

-        %endif

-        %rotate 1

-    %endrep

-%endmacro

-;%1 == instruction

-;%2 == 1 if float, 0 if int

-;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)

-;%4 == number of operands given

-;%5+: operands

-%macro RUN_AVX_INSTR 6-7+

-    %ifid %6

-        %define %%sizeofreg sizeof%6

-    %elifid %5

-        %define %%sizeofreg sizeof%5

-    %else

-        %define %%sizeofreg mmsize

-    %endif

-    %if %%sizeofreg==32

-        %if %4>=3

-            v%1 %5, %6, %7

-        %else

-            v%1 %5, %6

-        %endif

-    %else

-        %if %%sizeofreg==8

-            %define %%regmov movq

-        %elif %2

-            %define %%regmov movaps

-        %else

-            %define %%regmov movdqa

-        %endif

-        %if %4>=3+%3

-            %ifnidn %5, %6

-                %if AVX_enabled && %%sizeofreg==16

-                    v%1 %5, %6, %7

-                %else

-                    CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7

-                    %%regmov %5, %6

-                    %1 %5, %7

-                %endif

-            %else

-                %1 %5, %7

-            %endif

-        %elif %4>=3

-            %1 %5, %6, %7

-        %else

-            %1 %5, %6

-        %endif

-    %endif

-%endmacro

-; 3arg AVX ops with a memory arg can only have it in src2,

-; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov).

-; So, if the op is symmetric and the wrong one is memory, swap them.

-%macro RUN_AVX_INSTR1 8

-    %assign %%swap 0

-    %if AVX_enabled

-        %ifnid %6

-            %assign %%swap 1

-        %endif

-    %elifnidn %5, %6

-        %ifnid %7

-            %assign %%swap 1

-        %endif

-    %endif

-    %if %%swap && %3 == 0 && %8 == 1

-        RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6

-    %else

-        RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7

-    %endif

-%endmacro

-;%1 == instruction

-;%2 == 1 if float, 0 if int

-;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)

-;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not

-%macro AVX_INSTR 4

-    %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4

-        %ifidn %3, fnord

-            RUN_AVX_INSTR %6, %7, %8, 2, %1, %2

-        %elifidn %4, fnord

-            RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9

-        %elifidn %5, fnord

-            RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4

-        %else

-            RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5

-        %endif

-    %endmacro

-%endmacro

-AVX_INSTR addpd, 1, 0, 1

-AVX_INSTR addps, 1, 0, 1

-AVX_INSTR addsd, 1, 0, 1

-AVX_INSTR addss, 1, 0, 1

-AVX_INSTR addsubpd, 1, 0, 0

-AVX_INSTR addsubps, 1, 0, 0

-AVX_INSTR andpd, 1, 0, 1

-AVX_INSTR andps, 1, 0, 1

-AVX_INSTR andnpd, 1, 0, 0

-AVX_INSTR andnps, 1, 0, 0

-AVX_INSTR blendpd, 1, 0, 0

-AVX_INSTR blendps, 1, 0, 0

-AVX_INSTR blendvpd, 1, 0, 0

-AVX_INSTR blendvps, 1, 0, 0

-AVX_INSTR cmppd, 1, 0, 0

-AVX_INSTR cmpps, 1, 0, 0

-AVX_INSTR cmpsd, 1, 0, 0

-AVX_INSTR cmpss, 1, 0, 0

-AVX_INSTR cvtdq2ps, 1, 0, 0

-AVX_INSTR cvtps2dq, 1, 0, 0

-AVX_INSTR divpd, 1, 0, 0

-AVX_INSTR divps, 1, 0, 0

-AVX_INSTR divsd, 1, 0, 0

-AVX_INSTR divss, 1, 0, 0

-AVX_INSTR dppd, 1, 1, 0

-AVX_INSTR dpps, 1, 1, 0

-AVX_INSTR haddpd, 1, 0, 0

-AVX_INSTR haddps, 1, 0, 0

-AVX_INSTR hsubpd, 1, 0, 0

-AVX_INSTR hsubps, 1, 0, 0

-AVX_INSTR maxpd, 1, 0, 1

-AVX_INSTR maxps, 1, 0, 1

-AVX_INSTR maxsd, 1, 0, 1

-AVX_INSTR maxss, 1, 0, 1

-AVX_INSTR minpd, 1, 0, 1

-AVX_INSTR minps, 1, 0, 1

-AVX_INSTR minsd, 1, 0, 1

-AVX_INSTR minss, 1, 0, 1

-AVX_INSTR movhlps, 1, 0, 0

-AVX_INSTR movlhps, 1, 0, 0

-AVX_INSTR movsd, 1, 0, 0

-AVX_INSTR movss, 1, 0, 0

-AVX_INSTR mpsadbw, 0, 1, 0

-AVX_INSTR mulpd, 1, 0, 1

-AVX_INSTR mulps, 1, 0, 1

-AVX_INSTR mulsd, 1, 0, 1

-AVX_INSTR mulss, 1, 0, 1

-AVX_INSTR orpd, 1, 0, 1

-AVX_INSTR orps, 1, 0, 1

-AVX_INSTR pabsb, 0, 0, 0

-AVX_INSTR pabsw, 0, 0, 0

-AVX_INSTR pabsd, 0, 0, 0

-AVX_INSTR packsswb, 0, 0, 0

-AVX_INSTR packssdw, 0, 0, 0

-AVX_INSTR packuswb, 0, 0, 0

-AVX_INSTR packusdw, 0, 0, 0

-AVX_INSTR paddb, 0, 0, 1

-AVX_INSTR paddw, 0, 0, 1

-AVX_INSTR paddd, 0, 0, 1

-AVX_INSTR paddq, 0, 0, 1

-AVX_INSTR paddsb, 0, 0, 1

-AVX_INSTR paddsw, 0, 0, 1

-AVX_INSTR paddusb, 0, 0, 1

-AVX_INSTR paddusw, 0, 0, 1

-AVX_INSTR palignr, 0, 1, 0

-AVX_INSTR pand, 0, 0, 1

-AVX_INSTR pandn, 0, 0, 0

-AVX_INSTR pavgb, 0, 0, 1

-AVX_INSTR pavgw, 0, 0, 1

-AVX_INSTR pblendvb, 0, 0, 0

-AVX_INSTR pblendw, 0, 1, 0

-AVX_INSTR pcmpestri, 0, 0, 0

-AVX_INSTR pcmpestrm, 0, 0, 0

-AVX_INSTR pcmpistri, 0, 0, 0

-AVX_INSTR pcmpistrm, 0, 0, 0

-AVX_INSTR pcmpeqb, 0, 0, 1

-AVX_INSTR pcmpeqw, 0, 0, 1

-AVX_INSTR pcmpeqd, 0, 0, 1

-AVX_INSTR pcmpeqq, 0, 0, 1

-AVX_INSTR pcmpgtb, 0, 0, 0

-AVX_INSTR pcmpgtw, 0, 0, 0

-AVX_INSTR pcmpgtd, 0, 0, 0

-AVX_INSTR pcmpgtq, 0, 0, 0

-AVX_INSTR phaddw, 0, 0, 0

-AVX_INSTR phaddd, 0, 0, 0

-AVX_INSTR phaddsw, 0, 0, 0

-AVX_INSTR phsubw, 0, 0, 0

-AVX_INSTR phsubd, 0, 0, 0

-AVX_INSTR phsubsw, 0, 0, 0

-AVX_INSTR pmaddwd, 0, 0, 1

-AVX_INSTR pmaddubsw, 0, 0, 0

-AVX_INSTR pmaxsb, 0, 0, 1

-AVX_INSTR pmaxsw, 0, 0, 1

-AVX_INSTR pmaxsd, 0, 0, 1

-AVX_INSTR pmaxub, 0, 0, 1

-AVX_INSTR pmaxuw, 0, 0, 1

-AVX_INSTR pmaxud, 0, 0, 1

-AVX_INSTR pminsb, 0, 0, 1

-AVX_INSTR pminsw, 0, 0, 1

-AVX_INSTR pminsd, 0, 0, 1

-AVX_INSTR pminub, 0, 0, 1

-AVX_INSTR pminuw, 0, 0, 1

-AVX_INSTR pminud, 0, 0, 1

-AVX_INSTR pmovmskb, 0, 0, 0

-AVX_INSTR pmulhuw, 0, 0, 1

-AVX_INSTR pmulhrsw, 0, 0, 1

-AVX_INSTR pmulhw, 0, 0, 1

-AVX_INSTR pmullw, 0, 0, 1

-AVX_INSTR pmulld, 0, 0, 1

-AVX_INSTR pmuludq, 0, 0, 1

-AVX_INSTR pmuldq, 0, 0, 1

-AVX_INSTR por, 0, 0, 1

-AVX_INSTR psadbw, 0, 0, 1

-AVX_INSTR pshufb, 0, 0, 0

-AVX_INSTR pshufd, 0, 1, 0

-AVX_INSTR pshufhw, 0, 1, 0

-AVX_INSTR pshuflw, 0, 1, 0

-AVX_INSTR psignb, 0, 0, 0

-AVX_INSTR psignw, 0, 0, 0

-AVX_INSTR psignd, 0, 0, 0

-AVX_INSTR psllw, 0, 0, 0

-AVX_INSTR pslld, 0, 0, 0

-AVX_INSTR psllq, 0, 0, 0

-AVX_INSTR pslldq, 0, 0, 0

-AVX_INSTR psraw, 0, 0, 0

-AVX_INSTR psrad, 0, 0, 0

-AVX_INSTR psrlw, 0, 0, 0

-AVX_INSTR psrld, 0, 0, 0

-AVX_INSTR psrlq, 0, 0, 0

-AVX_INSTR psrldq, 0, 0, 0

-AVX_INSTR psubb, 0, 0, 0

-AVX_INSTR psubw, 0, 0, 0

-AVX_INSTR psubd, 0, 0, 0

-AVX_INSTR psubq, 0, 0, 0

-AVX_INSTR psubsb, 0, 0, 0

-AVX_INSTR psubsw, 0, 0, 0

-AVX_INSTR psubusb, 0, 0, 0

-AVX_INSTR psubusw, 0, 0, 0

-AVX_INSTR ptest, 0, 0, 0

-AVX_INSTR punpckhbw, 0, 0, 0

-AVX_INSTR punpckhwd, 0, 0, 0

-AVX_INSTR punpckhdq, 0, 0, 0

-AVX_INSTR punpckhqdq, 0, 0, 0

-AVX_INSTR punpcklbw, 0, 0, 0

-AVX_INSTR punpcklwd, 0, 0, 0

-AVX_INSTR punpckldq, 0, 0, 0

-AVX_INSTR punpcklqdq, 0, 0, 0

-AVX_INSTR pxor, 0, 0, 1

-AVX_INSTR shufps, 1, 1, 0

-AVX_INSTR subpd, 1, 0, 0

-AVX_INSTR subps, 1, 0, 0

-AVX_INSTR subsd, 1, 0, 0

-AVX_INSTR subss, 1, 0, 0

-AVX_INSTR unpckhpd, 1, 0, 0

-AVX_INSTR unpckhps, 1, 0, 0

-AVX_INSTR unpcklpd, 1, 0, 0

-AVX_INSTR unpcklps, 1, 0, 0

-AVX_INSTR xorpd, 1, 0, 1

-AVX_INSTR xorps, 1, 0, 1

-; 3DNow instructions, for sharing code between AVX, SSE and 3DN

-AVX_INSTR pfadd, 1, 0, 1

-AVX_INSTR pfsub, 1, 0, 0

-AVX_INSTR pfmul, 1, 0, 1

-; base-4 constants for shuffles

-%assign i 0

-%rep 256

-    %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)

-    %if j < 10

-        CAT_XDEFINE q000, j, i

-    %elif j < 100

-        CAT_XDEFINE q00, j, i

-    %elif j < 1000

-        CAT_XDEFINE q0, j, i

-    %else

-        CAT_XDEFINE q, j, i

-    %endif

-%assign i i+1

-%endrep

-%undef i

-%undef j

-%macro FMA_INSTR 3

-    %macro %1 4-7 %1, %2, %3

-        %if cpuflag(xop)

-            v%5 %1, %2, %3, %4

-        %else

-            %6 %1, %2, %3

-            %7 %1, %4

-        %endif

-    %endmacro

-%endmacro

-FMA_INSTR  pmacsdd,  pmulld, paddd

-FMA_INSTR  pmacsww,  pmullw, paddw

-FMA_INSTR pmadcswd, pmaddwd, paddd

-; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf.

-; This lets us use tzcnt without bumping the yasm version requirement yet.

-%define tzcnt rep bsf