ref: 3a7d467da9f432f010470b7ae39e5bd7c9247323
parent: ccddd5d0f9a69c7369f9a68c155577e4e0b3afd1
author: James Zern <[email protected]>
date: Sun Aug 10 12:15:18 EDT 2014

libyuv: update to r1041

Change-Id: I38dad398844ee424a7a92a745ab703645018d02b

--- a/examples.mk
+++ b/examples.mk
@@ -9,8 +9,12 @@
 ##
 
 LIBYUV_SRCS +=  third_party/libyuv/include/libyuv/basic_types.h  \
+                third_party/libyuv/include/libyuv/convert.h \
+                third_party/libyuv/include/libyuv/convert_argb.h \
+                third_party/libyuv/include/libyuv/convert_from.h \
                 third_party/libyuv/include/libyuv/cpu_id.h  \
                 third_party/libyuv/include/libyuv/planar_functions.h  \
+                third_party/libyuv/include/libyuv/rotate.h  \
                 third_party/libyuv/include/libyuv/row.h  \
                 third_party/libyuv/include/libyuv/scale.h  \
                 third_party/libyuv/include/libyuv/scale_row.h  \
@@ -20,14 +24,15 @@
                 third_party/libyuv/source/row_common.cc \
                 third_party/libyuv/source/row_mips.cc \
                 third_party/libyuv/source/row_neon.cc \
+                third_party/libyuv/source/row_neon64.cc \
                 third_party/libyuv/source/row_posix.cc \
                 third_party/libyuv/source/row_win.cc \
-                third_party/libyuv/source/scale.cc  \
+                third_party/libyuv/source/scale.cc \
                 third_party/libyuv/source/scale_common.cc \
                 third_party/libyuv/source/scale_mips.cc \
                 third_party/libyuv/source/scale_neon.cc \
                 third_party/libyuv/source/scale_posix.cc \
-                third_party/libyuv/source/scale_win.cc
+                third_party/libyuv/source/scale_win.cc \
 
 LIBWEBM_MUXER_SRCS += third_party/libwebm/mkvmuxer.cpp \
                       third_party/libwebm/mkvmuxerutil.cpp \
@@ -210,8 +215,8 @@
 # from an installed tree or a version controlled tree. Determine
 # the proper paths.
 ifeq ($(HAVE_ALT_TREE_LAYOUT),yes)
-    LIB_PATH := $(SRC_PATH_BARE)/../lib
-    INC_PATH := $(SRC_PATH_BARE)/../include
+    LIB_PATH-yes := $(SRC_PATH_BARE)/../lib
+    INC_PATH-yes := $(SRC_PATH_BARE)/../include
 else
     LIB_PATH-yes                     += $(if $(BUILD_PFX),$(BUILD_PFX),.)
     INC_PATH-$(CONFIG_VP8_DECODER)   += $(SRC_PATH_BARE)/vp8
@@ -218,9 +223,10 @@
     INC_PATH-$(CONFIG_VP8_ENCODER)   += $(SRC_PATH_BARE)/vp8
     INC_PATH-$(CONFIG_VP9_DECODER)   += $(SRC_PATH_BARE)/vp9
     INC_PATH-$(CONFIG_VP9_ENCODER)   += $(SRC_PATH_BARE)/vp9
-    LIB_PATH := $(call enabled,LIB_PATH)
-    INC_PATH := $(call enabled,INC_PATH)
 endif
+INC_PATH-$(CONFIG_LIBYUV) += $(SRC_PATH_BARE)/third_party/libyuv/include
+LIB_PATH := $(call enabled,LIB_PATH)
+INC_PATH := $(call enabled,INC_PATH)
 INTERNAL_CFLAGS = $(addprefix -I,$(INC_PATH))
 INTERNAL_LDFLAGS += $(addprefix -L,$(LIB_PATH))
 
--- a/third_party/libyuv/README.libvpx
+++ b/third_party/libyuv/README.libvpx
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1005
+Version: 1041
 License: BSD
 License File: LICENSE
 
@@ -13,5 +13,4 @@
 in order to encode multiple resolution bit streams.
 
 Local Modifications:
-Modified the original scaler code minimally with include file changes to fit
-in our current build system.
+None.
--- /dev/null
+++ b/third_party/libyuv/include/libyuv/compare.h
@@ -1,0 +1,73 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_COMPARE_H_  // NOLINT
+#define INCLUDE_LIBYUV_COMPARE_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Compute a hash for specified memory. Seed of 5381 recommended.
+LIBYUV_API
+uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed);
+
+// Sum Square Error - used to compute Mean Square Error or PSNR.
+LIBYUV_API
+uint64 ComputeSumSquareError(const uint8* src_a,
+                             const uint8* src_b, int count);
+
+LIBYUV_API
+uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
+                                  const uint8* src_b, int stride_b,
+                                  int width, int height);
+
+static const int kMaxPsnr = 128;
+
+LIBYUV_API
+double SumSquareErrorToPsnr(uint64 sse, uint64 count);
+
+LIBYUV_API
+double CalcFramePsnr(const uint8* src_a, int stride_a,
+                     const uint8* src_b, int stride_b,
+                     int width, int height);
+
+LIBYUV_API
+double I420Psnr(const uint8* src_y_a, int stride_y_a,
+                const uint8* src_u_a, int stride_u_a,
+                const uint8* src_v_a, int stride_v_a,
+                const uint8* src_y_b, int stride_y_b,
+                const uint8* src_u_b, int stride_u_b,
+                const uint8* src_v_b, int stride_v_b,
+                int width, int height);
+
+LIBYUV_API
+double CalcFrameSsim(const uint8* src_a, int stride_a,
+                     const uint8* src_b, int stride_b,
+                     int width, int height);
+
+LIBYUV_API
+double I420Ssim(const uint8* src_y_a, int stride_y_a,
+                const uint8* src_u_a, int stride_u_a,
+                const uint8* src_v_a, int stride_v_a,
+                const uint8* src_y_b, int stride_y_b,
+                const uint8* src_u_b, int stride_u_b,
+                const uint8* src_v_b, int stride_v_b,
+                int width, int height);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_COMPARE_H_  NOLINT
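
The comparison helpers above compose: compute a sum of squared errors, then map it to PSNR, or let I420Psnr() weight all three planes at once. A minimal sketch of a hypothetical helper, assuming tightly packed planes (stride width for Y, (width + 1) / 2 for U and V):

    #include "libyuv/compare.h"

    // Hypothetical helper: PSNR between two same-sized I420 frames.
    // Returns the combined I420 PSNR; writes the Y-plane PSNR to *y_psnr.
    double I420FrameQuality(const uint8* y_a, const uint8* u_a, const uint8* v_a,
                            const uint8* y_b, const uint8* u_b, const uint8* v_b,
                            int width, int height, double* y_psnr) {
      const int half_w = (width + 1) / 2;
      // SSE of the luma plane mapped to PSNR; identical planes yield kMaxPsnr.
      const uint64 sse = libyuv::ComputeSumSquareErrorPlane(
          y_a, width, y_b, width, width, height);
      *y_psnr = libyuv::SumSquareErrorToPsnr(
          sse, static_cast<uint64>(width) * height);
      return libyuv::I420Psnr(y_a, width, u_a, half_w, v_a, half_w,
                              y_b, width, u_b, half_w, v_b, half_w,
                              width, height);
    }
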
--- /dev/null
+++ b/third_party/libyuv/include/libyuv/convert.h
@@ -1,0 +1,254 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CONVERT_H_  // NOLINT
+#define INCLUDE_LIBYUV_CONVERT_H_
+
+#include "libyuv/basic_types.h"
+// TODO(fbarchard): Remove the following headers includes.
+#include "libyuv/convert_from.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Convert I444 to I420.
+LIBYUV_API
+int I444ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert I422 to I420.
+LIBYUV_API
+int I422ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert I411 to I420.
+LIBYUV_API
+int I411ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Copy I420 to I420.
+#define I420ToI420 I420Copy
+LIBYUV_API
+int I420Copy(const uint8* src_y, int src_stride_y,
+             const uint8* src_u, int src_stride_u,
+             const uint8* src_v, int src_stride_v,
+             uint8* dst_y, int dst_stride_y,
+             uint8* dst_u, int dst_stride_u,
+             uint8* dst_v, int dst_stride_v,
+             int width, int height);
+
+// Convert I400 (grey) to I420.
+LIBYUV_API
+int I400ToI420(const uint8* src_y, int src_stride_y,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert NV12 to I420.
+LIBYUV_API
+int NV12ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_uv, int src_stride_uv,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert NV21 to I420.
+LIBYUV_API
+int NV21ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_vu, int src_stride_vu,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert YUY2 to I420.
+LIBYUV_API
+int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert UYVY to I420.
+LIBYUV_API
+int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert M420 to I420.
+LIBYUV_API
+int M420ToI420(const uint8* src_m420, int src_stride_m420,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert Q420 to I420.
+LIBYUV_API
+int Q420ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_yuy2, int src_stride_yuy2,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// ARGB little endian (bgra in memory) to I420.
+LIBYUV_API
+int ARGBToI420(const uint8* src_frame, int src_stride_frame,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// BGRA little endian (argb in memory) to I420.
+LIBYUV_API
+int BGRAToI420(const uint8* src_frame, int src_stride_frame,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// ABGR little endian (rgba in memory) to I420.
+LIBYUV_API
+int ABGRToI420(const uint8* src_frame, int src_stride_frame,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// RGBA little endian (abgr in memory) to I420.
+LIBYUV_API
+int RGBAToI420(const uint8* src_frame, int src_stride_frame,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// RGB little endian (bgr in memory) to I420.
+LIBYUV_API
+int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
+                uint8* dst_y, int dst_stride_y,
+                uint8* dst_u, int dst_stride_u,
+                uint8* dst_v, int dst_stride_v,
+                int width, int height);
+
+// RGB big endian (rgb in memory) to I420.
+LIBYUV_API
+int RAWToI420(const uint8* src_frame, int src_stride_frame,
+              uint8* dst_y, int dst_stride_y,
+              uint8* dst_u, int dst_stride_u,
+              uint8* dst_v, int dst_stride_v,
+              int width, int height);
+
+// RGB16 (RGBP fourcc) little endian to I420.
+LIBYUV_API
+int RGB565ToI420(const uint8* src_frame, int src_stride_frame,
+                 uint8* dst_y, int dst_stride_y,
+                 uint8* dst_u, int dst_stride_u,
+                 uint8* dst_v, int dst_stride_v,
+                 int width, int height);
+
+// RGB15 (RGBO fourcc) little endian to I420.
+LIBYUV_API
+int ARGB1555ToI420(const uint8* src_frame, int src_stride_frame,
+                   uint8* dst_y, int dst_stride_y,
+                   uint8* dst_u, int dst_stride_u,
+                   uint8* dst_v, int dst_stride_v,
+                   int width, int height);
+
+// RGB12 (R444 fourcc) little endian to I420.
+LIBYUV_API
+int ARGB4444ToI420(const uint8* src_frame, int src_stride_frame,
+                   uint8* dst_y, int dst_stride_y,
+                   uint8* dst_u, int dst_stride_u,
+                   uint8* dst_v, int dst_stride_v,
+                   int width, int height);
+
+#ifdef HAVE_JPEG
+// src_width/height provided by capture.
+// dst_width/height for clipping determine final size.
+LIBYUV_API
+int MJPGToI420(const uint8* sample, size_t sample_size,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int src_width, int src_height,
+               int dst_width, int dst_height);
+
+// Query size of MJPG in pixels.
+LIBYUV_API
+int MJPGSize(const uint8* sample, size_t sample_size,
+             int* width, int* height);
+#endif
+
+// Note Bayer formats (BGGR) To I420 are in format_conversion.h
+
+// Convert camera sample to I420 with cropping, rotation and vertical flip.
+// "src_size" is needed to parse MJPG.
+// "dst_stride_y" number of bytes in a row of the dst_y plane.
+//   Normally this would be the same as dst_width, with recommended alignment
+//   to 16 bytes for better efficiency.
+//   If rotation of 90 or 270 is used, stride is affected. The caller should
+//   allocate the I420 buffer according to rotation.
+// "dst_stride_u" number of bytes in a row of the dst_u plane.
+//   Normally this would be the same as (dst_width + 1) / 2, with
+//   recommended alignment to 16 bytes for better efficiency.
+//   If rotation of 90 or 270 is used, stride is affected.
+// "crop_x" and "crop_y" are starting position for cropping.
+//   To center, crop_x = (src_width - dst_width) / 2
+//              crop_y = (src_height - dst_height) / 2
+// "src_width" / "src_height" is size of src_frame in pixels.
+//   "src_height" can be negative indicating a vertically flipped image source.
+// "crop_width" / "crop_height" is the size to crop the src to.
+//    Must be less than or equal to src_width/src_height
+//    Cropping parameters are pre-rotation.
+// "rotation" can be 0, 90, 180 or 270.
+// "format" is a fourcc. ie 'I420', 'YUY2'
+// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure.
+LIBYUV_API
+int ConvertToI420(const uint8* src_frame, size_t src_size,
+                  uint8* dst_y, int dst_stride_y,
+                  uint8* dst_u, int dst_stride_u,
+                  uint8* dst_v, int dst_stride_v,
+                  int crop_x, int crop_y,
+                  int src_width, int src_height,
+                  int crop_width, int crop_height,
+                  enum RotationMode rotation,
+                  uint32 format);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_CONVERT_H_  NOLINT
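
The long comment block above documents ConvertToI420()'s cropping math; the sketch below applies it to a YUY2 capture with a centered crop. The helper and the hand-assembled fourcc are illustrative assumptions (libyuv also ships FOURCC macros in a header this patch does not touch):

    #include "libyuv/convert.h"

    // 'YUY2' spelled as a little-endian fourcc, byte 0 first.
    static const uint32 kFourccYUY2 =
        'Y' | ('U' << 8) | ('Y' << 16) | ('2' << 24);

    // Hypothetical capture path: crop a centered crop_width x crop_height
    // region out of a src_width x src_height YUY2 frame and write I420.
    // Destination planes are caller-allocated with tight strides (no
    // rotation, so dst_stride_y is simply crop_width).
    int CaptureToI420(const uint8* src, size_t src_size,
                      int src_width, int src_height,
                      uint8* dst_y, uint8* dst_u, uint8* dst_v,
                      int crop_width, int crop_height) {
      const int crop_x = (src_width - crop_width) / 2;    // centered, per the
      const int crop_y = (src_height - crop_height) / 2;  // comment above
      return libyuv::ConvertToI420(src, src_size,
                                   dst_y, crop_width,
                                   dst_u, (crop_width + 1) / 2,
                                   dst_v, (crop_width + 1) / 2,
                                   crop_x, crop_y,
                                   src_width, src_height,
                                   crop_width, crop_height,
                                   libyuv::kRotate0, kFourccYUY2);
    }
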
--- /dev/null
+++ b/third_party/libyuv/include/libyuv/convert_argb.h
@@ -1,0 +1,225 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_  // NOLINT
+#define INCLUDE_LIBYUV_CONVERT_ARGB_H_
+
+#include "libyuv/basic_types.h"
+// TODO(fbarchard): Remove the following headers includes
+#include "libyuv/convert_from.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+
+// TODO(fbarchard): This set of functions should exactly match convert.h
+// Add missing Q420.
+// TODO(fbarchard): Add tests. Create random content of right size and convert
+// with C vs Opt and or to I420 and compare.
+// TODO(fbarchard): Some of these functions lack parameter setting.
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Alias.
+#define ARGBToARGB ARGBCopy
+
+// Copy ARGB to ARGB.
+LIBYUV_API
+int ARGBCopy(const uint8* src_argb, int src_stride_argb,
+             uint8* dst_argb, int dst_stride_argb,
+             int width, int height);
+
+// Convert I420 to ARGB.
+LIBYUV_API
+int I420ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert I422 to ARGB.
+LIBYUV_API
+int I422ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert I444 to ARGB.
+LIBYUV_API
+int I444ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert I411 to ARGB.
+LIBYUV_API
+int I411ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert I400 (grey) to ARGB.
+LIBYUV_API
+int I400ToARGB(const uint8* src_y, int src_stride_y,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Alias.
+#define YToARGB I400ToARGB_Reference
+
+// Convert I400 to ARGB. Reverse of ARGBToI400.
+LIBYUV_API
+int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
+                         uint8* dst_argb, int dst_stride_argb,
+                         int width, int height);
+
+// Convert NV12 to ARGB.
+LIBYUV_API
+int NV12ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_uv, int src_stride_uv,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert NV21 to ARGB.
+LIBYUV_API
+int NV21ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_vu, int src_stride_vu,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert M420 to ARGB.
+LIBYUV_API
+int M420ToARGB(const uint8* src_m420, int src_stride_m420,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// TODO(fbarchard): Convert Q420 to ARGB.
+// LIBYUV_API
+// int Q420ToARGB(const uint8* src_y, int src_stride_y,
+//                const uint8* src_yuy2, int src_stride_yuy2,
+//                uint8* dst_argb, int dst_stride_argb,
+//                int width, int height);
+
+// Convert YUY2 to ARGB.
+LIBYUV_API
+int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert UYVY to ARGB.
+LIBYUV_API
+int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// BGRA little endian (argb in memory) to ARGB.
+LIBYUV_API
+int BGRAToARGB(const uint8* src_frame, int src_stride_frame,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// ABGR little endian (rgba in memory) to ARGB.
+LIBYUV_API
+int ABGRToARGB(const uint8* src_frame, int src_stride_frame,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// RGBA little endian (abgr in memory) to ARGB.
+LIBYUV_API
+int RGBAToARGB(const uint8* src_frame, int src_stride_frame,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Deprecated function name.
+#define BG24ToARGB RGB24ToARGB
+
+// RGB little endian (bgr in memory) to ARGB.
+LIBYUV_API
+int RGB24ToARGB(const uint8* src_frame, int src_stride_frame,
+                uint8* dst_argb, int dst_stride_argb,
+                int width, int height);
+
+// RGB big endian (rgb in memory) to ARGB.
+LIBYUV_API
+int RAWToARGB(const uint8* src_frame, int src_stride_frame,
+              uint8* dst_argb, int dst_stride_argb,
+              int width, int height);
+
+// RGB16 (RGBP fourcc) little endian to ARGB.
+LIBYUV_API
+int RGB565ToARGB(const uint8* src_frame, int src_stride_frame,
+                 uint8* dst_argb, int dst_stride_argb,
+                 int width, int height);
+
+// RGB15 (RGBO fourcc) little endian to ARGB.
+LIBYUV_API
+int ARGB1555ToARGB(const uint8* src_frame, int src_stride_frame,
+                   uint8* dst_argb, int dst_stride_argb,
+                   int width, int height);
+
+// RGB12 (R444 fourcc) little endian to ARGB.
+LIBYUV_API
+int ARGB4444ToARGB(const uint8* src_frame, int src_stride_frame,
+                   uint8* dst_argb, int dst_stride_argb,
+                   int width, int height);
+
+#ifdef HAVE_JPEG
+// src_width/height provided by capture
+// dst_width/height for clipping determine final size.
+LIBYUV_API
+int MJPGToARGB(const uint8* sample, size_t sample_size,
+               uint8* dst_argb, int dst_stride_argb,
+               int src_width, int src_height,
+               int dst_width, int dst_height);
+#endif
+
+// Note Bayer formats (BGGR) to ARGB are in format_conversion.h.
+
+// Convert camera sample to ARGB with cropping, rotation and vertical flip.
+// "src_size" is needed to parse MJPG.
+// "dst_stride_argb" number of bytes in a row of the dst_argb plane.
+//   Normally this would be the same as dst_width, with recommended alignment
+//   to 16 bytes for better efficiency.
+//   If rotation of 90 or 270 is used, stride is affected. The caller should
+//   allocate the I420 buffer according to rotation.
+// "dst_stride_u" number of bytes in a row of the dst_u plane.
+//   Normally this would be the same as (dst_width + 1) / 2, with
+//   recommended alignment to 16 bytes for better efficiency.
+//   If rotation of 90 or 270 is used, stride is affected.
+// "crop_x" and "crop_y" are starting position for cropping.
+//   To center, crop_x = (src_width - dst_width) / 2
+//              crop_y = (src_height - dst_height) / 2
+// "src_width" / "src_height" is size of src_frame in pixels.
+//   "src_height" can be negative indicating a vertically flipped image source.
+// "crop_width" / "crop_height" is the size to crop the src to.
+//    Must be less than or equal to src_width/src_height
+//    Cropping parameters are pre-rotation.
+// "rotation" can be 0, 90, 180 or 270.
+// "format" is a fourcc. ie 'I420', 'YUY2'
+// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure.
+LIBYUV_API
+int ConvertToARGB(const uint8* src_frame, size_t src_size,
+                  uint8* dst_argb, int dst_stride_argb,
+                  int crop_x, int crop_y,
+                  int src_width, int src_height,
+                  int crop_width, int crop_height,
+                  enum RotationMode rotation,
+                  uint32 format);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_CONVERT_ARGB_H_  NOLINT
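
For display paths the ARGB conversions above are single calls. A minimal sketch, assuming an even width so the interleaved NV12 UV plane shares the Y stride, and a caller-allocated ARGB buffer of width * 4 bytes per row:

    #include "libyuv/convert_argb.h"

    // Hypothetical helper: NV12 (Y plane followed by interleaved UV) to
    // 32-bit ARGB. width is assumed even so the UV stride equals width.
    int Nv12FrameToArgb(const uint8* src_y, const uint8* src_uv,
                        uint8* dst_argb, int width, int height) {
      return libyuv::NV12ToARGB(src_y, width,
                                src_uv, width,
                                dst_argb, width * 4,
                                width, height);
    }
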
--- /dev/null
+++ b/third_party/libyuv/include/libyuv/convert_from.h
@@ -1,0 +1,173 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CONVERT_FROM_H_  // NOLINT
+#define INCLUDE_LIBYUV_CONVERT_FROM_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/rotate.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// See Also convert.h for conversions from formats to I420.
+
+// I420Copy in convert to I420ToI420.
+
+LIBYUV_API
+int I420ToI422(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+LIBYUV_API
+int I420ToI444(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+LIBYUV_API
+int I420ToI411(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21.
+LIBYUV_API
+int I400Copy(const uint8* src_y, int src_stride_y,
+             uint8* dst_y, int dst_stride_y,
+             int width, int height);
+
+// TODO(fbarchard): I420ToM420
+// TODO(fbarchard): I420ToQ420
+
+LIBYUV_API
+int I420ToNV12(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height);
+
+LIBYUV_API
+int I420ToNV21(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_vu, int dst_stride_vu,
+               int width, int height);
+
+LIBYUV_API
+int I420ToYUY2(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_frame, int dst_stride_frame,
+               int width, int height);
+
+LIBYUV_API
+int I420ToUYVY(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_frame, int dst_stride_frame,
+               int width, int height);
+
+LIBYUV_API
+int I420ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+LIBYUV_API
+int I420ToBGRA(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+LIBYUV_API
+int I420ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+LIBYUV_API
+int I420ToRGBA(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_rgba, int dst_stride_rgba,
+               int width, int height);
+
+LIBYUV_API
+int I420ToRGB24(const uint8* src_y, int src_stride_y,
+                const uint8* src_u, int src_stride_u,
+                const uint8* src_v, int src_stride_v,
+                uint8* dst_frame, int dst_stride_frame,
+                int width, int height);
+
+LIBYUV_API
+int I420ToRAW(const uint8* src_y, int src_stride_y,
+              const uint8* src_u, int src_stride_u,
+              const uint8* src_v, int src_stride_v,
+              uint8* dst_frame, int dst_stride_frame,
+              int width, int height);
+
+LIBYUV_API
+int I420ToRGB565(const uint8* src_y, int src_stride_y,
+                 const uint8* src_u, int src_stride_u,
+                 const uint8* src_v, int src_stride_v,
+                 uint8* dst_frame, int dst_stride_frame,
+                 int width, int height);
+
+LIBYUV_API
+int I420ToARGB1555(const uint8* src_y, int src_stride_y,
+                   const uint8* src_u, int src_stride_u,
+                   const uint8* src_v, int src_stride_v,
+                   uint8* dst_frame, int dst_stride_frame,
+                   int width, int height);
+
+LIBYUV_API
+int I420ToARGB4444(const uint8* src_y, int src_stride_y,
+                   const uint8* src_u, int src_stride_u,
+                   const uint8* src_v, int src_stride_v,
+                   uint8* dst_frame, int dst_stride_frame,
+                   int width, int height);
+
+// Note Bayer formats (BGGR) To I420 are in format_conversion.h.
+
+// Convert I420 to specified format.
+// "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the
+//    buffer has contiguous rows. Can be negative. A multiple of 16 is optimal.
+LIBYUV_API
+int ConvertFromI420(const uint8* y, int y_stride,
+                    const uint8* u, int u_stride,
+                    const uint8* v, int v_stride,
+                    uint8* dst_sample, int dst_sample_stride,
+                    int width, int height,
+                    uint32 format);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_CONVERT_FROM_H_  NOLINT
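
Going the other direction, ConvertFromI420() dispatches on a fourcc; per the comment above, a dst_sample_stride of 0 means contiguous rows. A sketch with the same hand-assembled fourcc convention as before:

    #include "libyuv/convert_from.h"

    static const uint32 kFourccYUY2 =
        'Y' | ('U' << 8) | ('Y' << 16) | ('2' << 24);

    // Hypothetical helper: pack an I420 frame into a caller-allocated
    // YUY2 buffer of width * 2 bytes per row. Source strides are tight.
    int I420FrameToYuy2(const uint8* y, const uint8* u, const uint8* v,
                        uint8* dst, int width, int height) {
      const int half_w = (width + 1) / 2;
      return libyuv::ConvertFromI420(y, width, u, half_w, v, half_w,
                                     dst, 0 /* contiguous rows */,
                                     width, height, kFourccYUY2);
    }
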
--- /dev/null
+++ b/third_party/libyuv/include/libyuv/convert_from_argb.h
@@ -1,0 +1,166 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_  // NOLINT
+#define INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Copy ARGB to ARGB.
+#define ARGBToARGB ARGBCopy
+LIBYUV_API
+int ARGBCopy(const uint8* src_argb, int src_stride_argb,
+             uint8* dst_argb, int dst_stride_argb,
+             int width, int height);
+
+// Convert ARGB To BGRA.
+LIBYUV_API
+int ARGBToBGRA(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_bgra, int dst_stride_bgra,
+               int width, int height);
+
+// Convert ARGB To ABGR.
+LIBYUV_API
+int ARGBToABGR(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_abgr, int dst_stride_abgr,
+               int width, int height);
+
+// Convert ARGB To RGBA.
+LIBYUV_API
+int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_rgba, int dst_stride_rgba,
+               int width, int height);
+
+// Convert ARGB To RGB24.
+LIBYUV_API
+int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
+                uint8* dst_rgb24, int dst_stride_rgb24,
+                int width, int height);
+
+// Convert ARGB To RAW.
+LIBYUV_API
+int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
+              uint8* dst_rgb, int dst_stride_rgb,
+              int width, int height);
+
+// Convert ARGB To RGB565.
+LIBYUV_API
+int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
+                 uint8* dst_rgb565, int dst_stride_rgb565,
+                 int width, int height);
+
+// Convert ARGB To ARGB1555.
+LIBYUV_API
+int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
+                   uint8* dst_argb1555, int dst_stride_argb1555,
+                   int width, int height);
+
+// Convert ARGB To ARGB4444.
+LIBYUV_API
+int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
+                   uint8* dst_argb4444, int dst_stride_argb4444,
+                   int width, int height);
+
+// Convert ARGB To I444.
+LIBYUV_API
+int ARGBToI444(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert ARGB To I422.
+LIBYUV_API
+int ARGBToI422(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert ARGB To I420. (also in convert.h)
+LIBYUV_API
+int ARGBToI420(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert ARGB to J420. (JPeg full range I420).
+LIBYUV_API
+int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_yj, int dst_stride_yj,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert ARGB To I411.
+LIBYUV_API
+int ARGBToI411(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert ARGB to J400. (JPeg full range).
+LIBYUV_API
+int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_yj, int dst_stride_yj,
+               int width, int height);
+
+// Convert ARGB to I400.
+LIBYUV_API
+int ARGBToI400(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height);
+
+// Convert ARGB To NV12.
+LIBYUV_API
+int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height);
+
+// Convert ARGB To NV21.
+LIBYUV_API
+int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_vu, int dst_stride_vu,
+               int width, int height);
+
+// Convert ARGB To NV21.
+LIBYUV_API
+int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_vu, int dst_stride_vu,
+               int width, int height);
+
+// Convert ARGB To YUY2.
+LIBYUV_API
+int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_yuy2, int dst_stride_yuy2,
+               int width, int height);
+
+// Convert ARGB To UYVY.
+LIBYUV_API
+int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_uyvy, int dst_stride_uyvy,
+               int width, int height);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_  NOLINT
--- a/third_party/libyuv/include/libyuv/cpu_id.h
+++ b/third_party/libyuv/include/libyuv/cpu_id.h
@@ -11,7 +11,7 @@
 #ifndef INCLUDE_LIBYUV_CPU_ID_H_  // NOLINT
 #define INCLUDE_LIBYUV_CPU_ID_H_
 
-#include "basic_types.h"
+#include "libyuv/basic_types.h"
 
 #ifdef __cplusplus
 namespace libyuv {
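
cpu_id.h (touched here only to qualify its include path) is the runtime gate for the HAS_* fast paths declared in row.h. A sketch, assuming the TestCpuFlag()/kCpuHas* interface that header declares:

    #include "libyuv/cpu_id.h"

    // Hypothetical probe: true when libyuv's SIMD row functions are
    // eligible on this machine. TestCpuFlag caches, so calls are cheap.
    bool HasFastPaths() {
      const bool ssse3 = libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) != 0;  // x86
      const bool neon = libyuv::TestCpuFlag(libyuv::kCpuHasNEON) != 0;    // ARM
      // libyuv::MaskCpuFlags(0) would force the portable C paths, e.g. in tests.
      return ssse3 || neon;
    }
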
--- /dev/null
+++ b/third_party/libyuv/include/libyuv/format_conversion.h
@@ -1,0 +1,168 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_FORMATCONVERSION_H_  // NOLINT
+#define INCLUDE_LIBYUV_FORMATCONVERSION_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Convert Bayer RGB formats to I420.
+LIBYUV_API
+int BayerBGGRToI420(const uint8* src_bayer, int src_stride_bayer,
+                    uint8* dst_y, int dst_stride_y,
+                    uint8* dst_u, int dst_stride_u,
+                    uint8* dst_v, int dst_stride_v,
+                    int width, int height);
+
+LIBYUV_API
+int BayerGBRGToI420(const uint8* src_bayer, int src_stride_bayer,
+                    uint8* dst_y, int dst_stride_y,
+                    uint8* dst_u, int dst_stride_u,
+                    uint8* dst_v, int dst_stride_v,
+                    int width, int height);
+
+LIBYUV_API
+int BayerGRBGToI420(const uint8* src_bayer, int src_stride_bayer,
+                    uint8* dst_y, int dst_stride_y,
+                    uint8* dst_u, int dst_stride_u,
+                    uint8* dst_v, int dst_stride_v,
+                    int width, int height);
+
+LIBYUV_API
+int BayerRGGBToI420(const uint8* src_bayer, int src_stride_bayer,
+                    uint8* dst_y, int dst_stride_y,
+                    uint8* dst_u, int dst_stride_u,
+                    uint8* dst_v, int dst_stride_v,
+                    int width, int height);
+
+// Temporary API mapper.
+#define BayerRGBToI420(b, bs, f, y, ys, u, us, v, vs, w, h) \
+    BayerToI420(b, bs, y, ys, u, us, v, vs, w, h, f)
+
+LIBYUV_API
+int BayerToI420(const uint8* src_bayer, int src_stride_bayer,
+                uint8* dst_y, int dst_stride_y,
+                uint8* dst_u, int dst_stride_u,
+                uint8* dst_v, int dst_stride_v,
+                int width, int height,
+                uint32 src_fourcc_bayer);
+
+// Convert I420 to Bayer RGB formats.
+LIBYUV_API
+int I420ToBayerBGGR(const uint8* src_y, int src_stride_y,
+                    const uint8* src_u, int src_stride_u,
+                    const uint8* src_v, int src_stride_v,
+                    uint8* dst_frame, int dst_stride_frame,
+                    int width, int height);
+
+LIBYUV_API
+int I420ToBayerGBRG(const uint8* src_y, int src_stride_y,
+                    const uint8* src_u, int src_stride_u,
+                    const uint8* src_v, int src_stride_v,
+                    uint8* dst_frame, int dst_stride_frame,
+                    int width, int height);
+
+LIBYUV_API
+int I420ToBayerGRBG(const uint8* src_y, int src_stride_y,
+                    const uint8* src_u, int src_stride_u,
+                    const uint8* src_v, int src_stride_v,
+                    uint8* dst_frame, int dst_stride_frame,
+                    int width, int height);
+
+LIBYUV_API
+int I420ToBayerRGGB(const uint8* src_y, int src_stride_y,
+                    const uint8* src_u, int src_stride_u,
+                    const uint8* src_v, int src_stride_v,
+                    uint8* dst_frame, int dst_stride_frame,
+                    int width, int height);
+
+// Temporary API mapper.
+#define I420ToBayerRGB(y, ys, u, us, v, vs, b, bs, f, w, h) \
+    I420ToBayer(y, ys, u, us, v, vs, b, bs, w, h, f)
+
+LIBYUV_API
+int I420ToBayer(const uint8* src_y, int src_stride_y,
+                const uint8* src_u, int src_stride_u,
+                const uint8* src_v, int src_stride_v,
+                uint8* dst_frame, int dst_stride_frame,
+                int width, int height,
+                uint32 dst_fourcc_bayer);
+
+// Convert Bayer RGB formats to ARGB.
+LIBYUV_API
+int BayerBGGRToARGB(const uint8* src_bayer, int src_stride_bayer,
+                    uint8* dst_argb, int dst_stride_argb,
+                    int width, int height);
+
+LIBYUV_API
+int BayerGBRGToARGB(const uint8* src_bayer, int src_stride_bayer,
+                    uint8* dst_argb, int dst_stride_argb,
+                    int width, int height);
+
+LIBYUV_API
+int BayerGRBGToARGB(const uint8* src_bayer, int src_stride_bayer,
+                    uint8* dst_argb, int dst_stride_argb,
+                    int width, int height);
+
+LIBYUV_API
+int BayerRGGBToARGB(const uint8* src_bayer, int src_stride_bayer,
+                    uint8* dst_argb, int dst_stride_argb,
+                    int width, int height);
+
+// Temporary API mapper.
+#define BayerRGBToARGB(b, bs, f, a, as, w, h) BayerToARGB(b, bs, a, as, w, h, f)
+
+LIBYUV_API
+int BayerToARGB(const uint8* src_bayer, int src_stride_bayer,
+                uint8* dst_argb, int dst_stride_argb,
+                int width, int height,
+                uint32 src_fourcc_bayer);
+
+// Converts ARGB to Bayer RGB formats.
+LIBYUV_API
+int ARGBToBayerBGGR(const uint8* src_argb, int src_stride_argb,
+                    uint8* dst_bayer, int dst_stride_bayer,
+                    int width, int height);
+
+LIBYUV_API
+int ARGBToBayerGBRG(const uint8* src_argb, int src_stride_argb,
+                    uint8* dst_bayer, int dst_stride_bayer,
+                    int width, int height);
+
+LIBYUV_API
+int ARGBToBayerGRBG(const uint8* src_argb, int src_stride_argb,
+                    uint8* dst_bayer, int dst_stride_bayer,
+                    int width, int height);
+
+LIBYUV_API
+int ARGBToBayerRGGB(const uint8* src_argb, int src_stride_argb,
+                    uint8* dst_bayer, int dst_stride_bayer,
+                    int width, int height);
+
+// Temporary API mapper.
+#define ARGBToBayerRGB(a, as, b, bs, f, w, h) ARGBToBayer(b, bs, a, as, w, h, f)
+
+LIBYUV_API
+int ARGBToBayer(const uint8* src_argb, int src_stride_argb,
+                uint8* dst_bayer, int dst_stride_bayer,
+                int width, int height,
+                uint32 dst_fourcc_bayer);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_FORMATCONVERSION_H_  NOLINT
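
A sketch of the Bayer path above: a raw 8-bit BGGR sensor frame is one byte per pixel, so its stride is just the width. The helper name is illustrative:

    #include "libyuv/format_conversion.h"

    // Demosaic a BGGR Bayer frame (one byte per pixel) into 32-bit ARGB.
    int BayerFrameToArgb(const uint8* src_bayer, uint8* dst_argb,
                         int width, int height) {
      return libyuv::BayerBGGRToARGB(src_bayer, width,
                                     dst_argb, width * 4,
                                     width, height);
    }
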
--- /dev/null
+++ b/third_party/libyuv/include/libyuv/mjpeg_decoder.h
@@ -1,0 +1,193 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_MJPEG_DECODER_H_  // NOLINT
+#define INCLUDE_LIBYUV_MJPEG_DECODER_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+// NOTE: For a simplified public API use convert.h MJPGToI420().
+
+struct jpeg_common_struct;
+struct jpeg_decompress_struct;
+struct jpeg_source_mgr;
+
+namespace libyuv {
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+static const uint32 kUnknownDataSize = 0xFFFFFFFF;
+
+enum JpegSubsamplingType {
+  kJpegYuv420,
+  kJpegYuv422,
+  kJpegYuv411,
+  kJpegYuv444,
+  kJpegYuv400,
+  kJpegUnknown
+};
+
+struct Buffer {
+  const uint8* data;
+  int len;
+};
+
+struct BufferVector {
+  Buffer* buffers;
+  int len;
+  int pos;
+};
+
+struct SetJmpErrorMgr;
+
+// MJPEG ("Motion JPEG") is a pseudo-standard video codec where the frames are
+// simply independent JPEG images with a fixed huffman table (which is omitted).
+// It is rarely used in video transmission, but is common as a camera capture
+// format, especially in Logitech devices. This class implements a decoder for
+// MJPEG frames.
+//
+// See http://tools.ietf.org/html/rfc2435
+class LIBYUV_API MJpegDecoder {
+ public:
+  typedef void (*CallbackFunction)(void* opaque,
+                                   const uint8* const* data,
+                                   const int* strides,
+                                   int rows);
+
+  static const int kColorSpaceUnknown;
+  static const int kColorSpaceGrayscale;
+  static const int kColorSpaceRgb;
+  static const int kColorSpaceYCbCr;
+  static const int kColorSpaceCMYK;
+  static const int kColorSpaceYCCK;
+
+  MJpegDecoder();
+  ~MJpegDecoder();
+
+  // Loads a new frame, reads its headers, and determines the uncompressed
+  // image format.
+  // Returns LIBYUV_TRUE if image looks valid and format is supported.
+  // If return value is LIBYUV_TRUE, then the values for all the following
+  // getters are populated.
+  // src_len is the size of the compressed mjpeg frame in bytes.
+  LIBYUV_BOOL LoadFrame(const uint8* src, size_t src_len);
+
+  // Returns width of the last loaded frame in pixels.
+  int GetWidth();
+
+  // Returns height of the last loaded frame in pixels.
+  int GetHeight();
+
+  // Returns format of the last loaded frame. The return value is one of the
+  // kColorSpace* constants.
+  int GetColorSpace();
+
+  // Number of color components in the color space.
+  int GetNumComponents();
+
+  // Sample factors of the n-th component.
+  int GetHorizSampFactor(int component);
+
+  int GetVertSampFactor(int component);
+
+  int GetHorizSubSampFactor(int component);
+
+  int GetVertSubSampFactor(int component);
+
+  // Public for testability.
+  int GetImageScanlinesPerImcuRow();
+
+  // Public for testability.
+  int GetComponentScanlinesPerImcuRow(int component);
+
+  // Width of a component in bytes.
+  int GetComponentWidth(int component);
+
+  // Height of a component.
+  int GetComponentHeight(int component);
+
+  // Width of a component in bytes with padding for DCTSIZE. Public for testing.
+  int GetComponentStride(int component);
+
+  // Size of a component in bytes.
+  int GetComponentSize(int component);
+
+  // Call this after LoadFrame() if you decide you don't want to decode it
+  // after all.
+  LIBYUV_BOOL UnloadFrame();
+
+  // Decodes the entire image into a one-buffer-per-color-component format.
+  // dst_width must match exactly. dst_height must be <= to image height; if
+  // less, the image is cropped. "planes" must have size equal to at least
+  // GetNumComponents() and they must point to non-overlapping buffers of size
+  // at least GetComponentSize(i). The pointers in planes are incremented
+  // to point to after the end of the written data.
+  // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded.
+  LIBYUV_BOOL DecodeToBuffers(uint8** planes, int dst_width, int dst_height);
+
+  // Decodes the entire image and passes the data via repeated calls to a
+  // callback function. Each call will get the data for a whole number of
+  // image scanlines.
+  // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded.
+  LIBYUV_BOOL DecodeToCallback(CallbackFunction fn, void* opaque,
+                        int dst_width, int dst_height);
+
+  // The helper function which recognizes the jpeg sub-sampling type.
+  static JpegSubsamplingType JpegSubsamplingTypeHelper(
+     int* subsample_x, int* subsample_y, int number_of_components);
+
+ private:
+
+  void AllocOutputBuffers(int num_outbufs);
+  void DestroyOutputBuffers();
+
+  LIBYUV_BOOL StartDecode();
+  LIBYUV_BOOL FinishDecode();
+
+  void SetScanlinePointers(uint8** data);
+  LIBYUV_BOOL DecodeImcuRow();
+
+  int GetComponentScanlinePadding(int component);
+
+  // A buffer holding the input data for a frame.
+  Buffer buf_;
+  BufferVector buf_vec_;
+
+  jpeg_decompress_struct* decompress_struct_;
+  jpeg_source_mgr* source_mgr_;
+  SetJmpErrorMgr* error_mgr_;
+
+  // LIBYUV_TRUE iff at least one component has scanline padding. (i.e.,
+  // GetComponentScanlinePadding() != 0.)
+  LIBYUV_BOOL has_scanline_padding_;
+
+  // Temporaries used to point to scanline outputs.
+  int num_outbufs_;  // Outermost size of all arrays below.
+  uint8*** scanlines_;
+  int* scanlines_sizes_;
+  // Temporary buffer used for decoding when we can't decode directly to the
+  // output buffers. Large enough for just one iMCU row.
+  uint8** databuf_;
+  int* databuf_strides_;
+};
+
+}  // namespace libyuv
+
+#endif  //  __cplusplus
+#endif  // INCLUDE_LIBYUV_MJPEG_DECODER_H_  NOLINT
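
The class comments above imply a call sequence of LoadFrame(), geometry queries, then one decode call (convert.h's MJPGToI420() wraps this for the common case). A sketch of a hypothetical wrapper:

    #include "libyuv/mjpeg_decoder.h"

    // Decode one MJPEG frame into caller-provided per-component buffers.
    // planes[i] must hold at least decoder.GetComponentSize(i) bytes; the
    // decoder advances each pointer past the data it writes.
    LIBYUV_BOOL DecodeMjpegFrame(const uint8* sample, size_t sample_size,
                                 uint8** planes) {
      libyuv::MJpegDecoder decoder;
      if (!decoder.LoadFrame(sample, sample_size)) {
        return LIBYUV_FALSE;  // not a valid or supported JPEG
      }
      // dst_width must match exactly; a smaller dst_height would crop.
      return decoder.DecodeToBuffers(planes, decoder.GetWidth(),
                                     decoder.GetHeight());
    }
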
--- a/third_party/libyuv/include/libyuv/planar_functions.h
+++ b/third_party/libyuv/include/libyuv/planar_functions.h
@@ -11,11 +11,11 @@
 #ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_  // NOLINT
 #define INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
 
-#include "basic_types.h"
+#include "libyuv/basic_types.h"
 
 // TODO(fbarchard): Remove the following headers includes.
-// #include "convert.h"
-// #include "convert_argb.h"
+#include "libyuv/convert.h"
+#include "libyuv/convert_argb.h"
 
 #ifdef __cplusplus
 namespace libyuv {
--- /dev/null
+++ b/third_party/libyuv/include/libyuv/rotate.h
@@ -1,0 +1,117 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_ROTATE_H_  // NOLINT
+#define INCLUDE_LIBYUV_ROTATE_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Supported rotation.
+typedef enum RotationMode {
+  kRotate0 = 0,  // No rotation.
+  kRotate90 = 90,  // Rotate 90 degrees clockwise.
+  kRotate180 = 180,  // Rotate 180 degrees.
+  kRotate270 = 270,  // Rotate 270 degrees clockwise.
+
+  // Deprecated.
+  kRotateNone = 0,
+  kRotateClockwise = 90,
+  kRotateCounterClockwise = 270,
+} RotationModeEnum;
+
+// Rotate I420 frame.
+LIBYUV_API
+int I420Rotate(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int src_width, int src_height, enum RotationMode mode);
+
+// Rotate NV12 input and store in I420.
+LIBYUV_API
+int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
+                     const uint8* src_uv, int src_stride_uv,
+                     uint8* dst_y, int dst_stride_y,
+                     uint8* dst_u, int dst_stride_u,
+                     uint8* dst_v, int dst_stride_v,
+                     int src_width, int src_height, enum RotationMode mode);
+
+// Rotate a plane by 0, 90, 180, or 270.
+LIBYUV_API
+int RotatePlane(const uint8* src, int src_stride,
+                uint8* dst, int dst_stride,
+                int src_width, int src_height, enum RotationMode mode);
+
+// Rotate planes by 90, 180, 270. Deprecated.
+LIBYUV_API
+void RotatePlane90(const uint8* src, int src_stride,
+                   uint8* dst, int dst_stride,
+                   int width, int height);
+
+LIBYUV_API
+void RotatePlane180(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride,
+                    int width, int height);
+
+LIBYUV_API
+void RotatePlane270(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride,
+                    int width, int height);
+
+LIBYUV_API
+void RotateUV90(const uint8* src, int src_stride,
+                uint8* dst_a, int dst_stride_a,
+                uint8* dst_b, int dst_stride_b,
+                int width, int height);
+
+// Rotations for when U and V are interleaved.
+// These functions take one input pointer and
+// split the data into two buffers while
+// rotating them. Deprecated.
+LIBYUV_API
+void RotateUV180(const uint8* src, int src_stride,
+                 uint8* dst_a, int dst_stride_a,
+                 uint8* dst_b, int dst_stride_b,
+                 int width, int height);
+
+LIBYUV_API
+void RotateUV270(const uint8* src, int src_stride,
+                 uint8* dst_a, int dst_stride_a,
+                 uint8* dst_b, int dst_stride_b,
+                 int width, int height);
+
+// The 90 and 270 functions are based on transposes.
+// Doing a transpose with reversing the read/write
+// order will result in a rotation by +- 90 degrees.
+// Deprecated.
+LIBYUV_API
+void TransposePlane(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride,
+                    int width, int height);
+
+LIBYUV_API
+void TransposeUV(const uint8* src, int src_stride,
+                 uint8* dst_a, int dst_stride_a,
+                 uint8* dst_b, int dst_stride_b,
+                 int width, int height);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_ROTATE_H_  NOLINT
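
As the ConvertToI420() comments already warn, 90- and 270-degree rotations swap the destination geometry, so strides must come from the rotated dimensions. A sketch of a hypothetical helper, with tight packing assumed:

    #include "libyuv/rotate.h"

    // Rotate an I420 frame 90 degrees clockwise. The destination is
    // height x width, so its tight Y stride is the source height.
    int RotateI420By90(const uint8* src_y, const uint8* src_u,
                       const uint8* src_v, uint8* dst_y, uint8* dst_u,
                       uint8* dst_v, int width, int height) {
      const int src_half = (width + 1) / 2;
      const int dst_half = (height + 1) / 2;  // rotated half-width
      return libyuv::I420Rotate(src_y, width, src_u, src_half,
                                src_v, src_half,
                                dst_y, height, dst_u, dst_half,
                                dst_v, dst_half,
                                width, height, libyuv::kRotate90);
    }
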
--- /dev/null
+++ b/third_party/libyuv/include/libyuv/rotate_argb.h
@@ -1,0 +1,33 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_ROTATE_ARGB_H_  // NOLINT
+#define INCLUDE_LIBYUV_ROTATE_ARGB_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/rotate.h"  // For RotationMode.
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Rotate ARGB frame
+LIBYUV_API
+int ARGBRotate(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_argb, int dst_stride_argb,
+               int src_width, int src_height, enum RotationMode mode);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_ROTATE_ARGB_H_  NOLINT
--- a/third_party/libyuv/include/libyuv/row.h
+++ b/third_party/libyuv/include/libyuv/row.h
@@ -13,8 +13,12 @@
 
 #include <stdlib.h>  // For malloc.
 
-#include "basic_types.h"
+#include "libyuv/basic_types.h"
 
+#if defined(__native_client__)
+#include "ppapi/c/pp_macros.h"  // For PPAPI_RELEASE
+#endif
+
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
@@ -38,7 +42,8 @@
   var = 0
 
 #if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
-    defined(TARGET_IPHONE_SIMULATOR)
+    defined(TARGET_IPHONE_SIMULATOR) || \
+    (defined(_MSC_VER) && defined(__clang__))
 #define LIBYUV_DISABLE_X86
 #endif
 // True if compiling for SSSE3 as a requirement.
@@ -47,7 +52,12 @@
 #endif
 
 // Enable for NaCL pepper 33 for bundle and AVX2 support.
-//  #define NEW_BINUTILS
+#if defined(__native_client__) && PPAPI_RELEASE >= 33
+#define NEW_BINUTILS
+#endif
+#if defined(__native_client__) && defined(__arm__) && PPAPI_RELEASE < 37
+#define LIBYUV_DISABLE_NEON
+#endif
 
 // The following are available on all x86 platforms:
 #if !defined(LIBYUV_DISABLE_X86) && \
@@ -152,6 +162,11 @@
 #define HAS_YUY2TOYROW_SSE2
 #endif
 
+// The following are available on x64 Visual C:
+#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64)
+#define HAS_I422TOARGBROW_SSSE3
+#endif
+
 // GCC >= 4.7.0 required for AVX2.
 #if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
 #if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
@@ -235,6 +250,10 @@
 #define HAS_MIRRORROW_SSE2
 #endif
 
+// The following are available on arm64 platforms:
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+#endif
+
 // The following are available on Neon platforms:
 #if !defined(LIBYUV_DISABLE_NEON) && \
     (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
@@ -330,7 +349,8 @@
 #endif
 
 // The following are available on Mips platforms:
-#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__)
+#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \
+    (_MIPS_SIM == _MIPS_SIM_ABI32)
 #define HAS_COPYROW_MIPS
 #if defined(__mips_dsp) && (__mips_dsp_rev >= 2)
 #define HAS_I422TOABGRROW_MIPS_DSPR2
@@ -426,7 +446,7 @@
     "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
     #opcode " (%%r15,%%r14),%" #arg "\n" \
     BUNDLEUNLOCK
-#else
+#else  // defined(__native_client__) && defined(__x86_64__)
 #define BUNDLEALIGN "\n"
 #define MEMACCESS(base) "(%" #base ")"
 #define MEMACCESS2(offset, base) #offset "(%" #base ")"
@@ -443,6 +463,15 @@
     #opcode " %%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n"
 #define MEMOPARG(opcode, offset, base, index, scale, arg) \
     #opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n"
+#endif  // defined(__native_client__) && defined(__x86_64__)
+
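+// Under Native Client, arm memory accesses are masked ('bic') into the
+// sandbox address range; outside NaCl, MEMACCESS expands to a no-op.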
+#if defined(__arm__)
+#undef MEMACCESS
+#if defined(__native_client__)
+#define MEMACCESS(base) ".p2align   3\nbic %" #base ", #0xc0000000\n"
+#else
+#define MEMACCESS(base) "\n"
+#endif
 #endif
 
 void I444ToARGBRow_NEON(const uint8* src_y,
--- a/third_party/libyuv/include/libyuv/scale.h
+++ b/third_party/libyuv/include/libyuv/scale.h
@@ -11,7 +11,7 @@
 #ifndef INCLUDE_LIBYUV_SCALE_H_  // NOLINT
 #define INCLUDE_LIBYUV_SCALE_H_
 
-#include "basic_types.h"
+#include "libyuv/basic_types.h"
 
 #ifdef __cplusplus
 namespace libyuv {
--- /dev/null
+++ b/third_party/libyuv/include/libyuv/scale_argb.h
@@ -1,0 +1,57 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_ARGB_H_  // NOLINT
+#define INCLUDE_LIBYUV_SCALE_ARGB_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/scale.h"  // For FilterMode
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+LIBYUV_API
+int ARGBScale(const uint8* src_argb, int src_stride_argb,
+              int src_width, int src_height,
+              uint8* dst_argb, int dst_stride_argb,
+              int dst_width, int dst_height,
+              enum FilterMode filtering);
+
+// Clipped scale takes destination rectangle coordinates for clip values.
+LIBYUV_API
+int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
+                  int src_width, int src_height,
+                  uint8* dst_argb, int dst_stride_argb,
+                  int dst_width, int dst_height,
+                  int clip_x, int clip_y, int clip_width, int clip_height,
+                  enum FilterMode filtering);
+
+// TODO(fbarchard): Implement this.
+// Scale with YUV conversion to ARGB and clipping.
+LIBYUV_API
+int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,
+                       const uint8* src_u, int src_stride_u,
+                       const uint8* src_v, int src_stride_v,
+                       uint32 src_fourcc,
+                       int src_width, int src_height,
+                       uint8* dst_argb, int dst_stride_argb,
+                       uint32 dst_fourcc,
+                       int dst_width, int dst_height,
+                       int clip_x, int clip_y, int clip_width, int clip_height,
+                       enum FilterMode filtering);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_SCALE_ARGB_H_  NOLINT
--- a/third_party/libyuv/include/libyuv/scale_row.h
+++ b/third_party/libyuv/include/libyuv/scale_row.h
@@ -11,7 +11,7 @@
 #ifndef INCLUDE_LIBYUV_SCALE_ROW_H_  // NOLINT
 #define INCLUDE_LIBYUV_SCALE_ROW_H_
 
-#include "basic_types.h"
+#include "libyuv/basic_types.h"
 
 #ifdef __cplusplus
 namespace libyuv {
--- /dev/null
+++ b/third_party/libyuv/include/libyuv/version.h
@@ -1,0 +1,16 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
+#define INCLUDE_LIBYUV_VERSION_H_
+
+#define LIBYUV_VERSION 1041
+
+#endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
--- /dev/null
+++ b/third_party/libyuv/include/libyuv/video_common.h
@@ -1,0 +1,182 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Common definitions for video, including fourcc and VideoFormat.
+
+#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_  // NOLINT
+#define INCLUDE_LIBYUV_VIDEO_COMMON_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+// Definition of FourCC codes
+//////////////////////////////////////////////////////////////////////////////
+
+// Convert four characters to a FourCC code.
+// Needs to be a macro, otherwise the OS X compiler complains when the kFormat*
+// constants are used in a switch.
+#ifdef __cplusplus
+#define FOURCC(a, b, c, d) ( \
+    (static_cast<uint32>(a)) | (static_cast<uint32>(b) << 8) | \
+    (static_cast<uint32>(c) << 16) | (static_cast<uint32>(d) << 24))
+#else
+#define FOURCC(a, b, c, d) ( \
+    ((uint32)(a)) | ((uint32)(b) << 8) | /* NOLINT */ \
+    ((uint32)(c) << 16) | ((uint32)(d) << 24))  /* NOLINT */
+#endif
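+// Example: FOURCC('I', '4', '2', '0') evaluates to 0x30323449; the first
+// character lands in the least significant byte (little-endian order).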
+
+// Some pages discussing FourCC codes:
+//   http://www.fourcc.org/yuv.php
+//   http://v4l2spec.bytesex.org/spec/book1.htm
+//   http://developer.apple.com/quicktime/icefloe/dispatch020.html
+//   http://msdn.microsoft.com/library/windows/desktop/dd206750.aspx#nv12
+//   http://people.xiph.org/~xiphmont/containers/nut/nut4cc.txt
+
+// FourCC codes grouped according to implementation efficiency.
+// Primary formats should convert in 1 efficient step.
+// Secondary formats are converted in 2 steps.
+// Auxiliary formats call primary converters.
+enum FourCC {
+  // 9 Primary YUV formats: 5 planar, 2 biplanar, 2 packed.
+  FOURCC_I420 = FOURCC('I', '4', '2', '0'),
+  FOURCC_I422 = FOURCC('I', '4', '2', '2'),
+  FOURCC_I444 = FOURCC('I', '4', '4', '4'),
+  FOURCC_I411 = FOURCC('I', '4', '1', '1'),
+  FOURCC_I400 = FOURCC('I', '4', '0', '0'),
+  FOURCC_NV21 = FOURCC('N', 'V', '2', '1'),
+  FOURCC_NV12 = FOURCC('N', 'V', '1', '2'),
+  FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
+  FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
+
+  // 2 Secondary YUV formats: row biplanar.
+  FOURCC_M420 = FOURCC('M', '4', '2', '0'),
+  FOURCC_Q420 = FOURCC('Q', '4', '2', '0'),
+
+  // 9 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp.
+  FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
+  FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'),
+  FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'),
+  FOURCC_24BG = FOURCC('2', '4', 'B', 'G'),
+  FOURCC_RAW  = FOURCC('r', 'a', 'w', ' '),
+  FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'),
+  FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'),  // rgb565 LE.
+  FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'),  // argb1555 LE.
+  FOURCC_R444 = FOURCC('R', '4', '4', '4'),  // argb4444 LE.
+
+  // 4 Secondary RGB formats: 4 Bayer Patterns.
+  FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'),
+  FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'),
+  FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'),
+  FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'),
+
+  // 1 Primary Compressed YUV format.
+  FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'),
+
+  // 5 Auxiliary YUV variations: 3 with U and V planes swapped, 1 alias.
+  FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'),
+  FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'),
+  FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),
+  FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'),  // Linux version of I420.
+  FOURCC_J420 = FOURCC('J', '4', '2', '0'),
+  FOURCC_J400 = FOURCC('J', '4', '0', '0'),
+
+  // 14 Auxiliary aliases.  CanonicalFourCC() maps these to canonical fourcc.
+  FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'),  // Alias for I420.
+  FOURCC_YU16 = FOURCC('Y', 'U', '1', '6'),  // Alias for I422.
+  FOURCC_YU24 = FOURCC('Y', 'U', '2', '4'),  // Alias for I444.
+  FOURCC_YUYV = FOURCC('Y', 'U', 'Y', 'V'),  // Alias for YUY2.
+  FOURCC_YUVS = FOURCC('y', 'u', 'v', 's'),  // Alias for YUY2 on Mac.
+  FOURCC_HDYC = FOURCC('H', 'D', 'Y', 'C'),  // Alias for UYVY.
+  FOURCC_2VUY = FOURCC('2', 'v', 'u', 'y'),  // Alias for UYVY on Mac.
+  FOURCC_JPEG = FOURCC('J', 'P', 'E', 'G'),  // Alias for MJPG.
+  FOURCC_DMB1 = FOURCC('d', 'm', 'b', '1'),  // Alias for MJPG on Mac.
+  FOURCC_BA81 = FOURCC('B', 'A', '8', '1'),  // Alias for BGGR.
+  FOURCC_RGB3 = FOURCC('R', 'G', 'B', '3'),  // Alias for RAW.
+  FOURCC_BGR3 = FOURCC('B', 'G', 'R', '3'),  // Alias for 24BG.
+  FOURCC_CM32 = FOURCC(0, 0, 0, 32),  // Alias for BGRA kCMPixelFormat_32ARGB
+  FOURCC_CM24 = FOURCC(0, 0, 0, 24),  // Alias for RAW kCMPixelFormat_24RGB
+  FOURCC_L555 = FOURCC('L', '5', '5', '5'),  // Alias for RGBO.
+  FOURCC_L565 = FOURCC('L', '5', '6', '5'),  // Alias for RGBP.
+  FOURCC_5551 = FOURCC('5', '5', '5', '1'),  // Alias for RGBO.
+
+  // 1 Auxiliary compressed YUV format set aside for capturer.
+  FOURCC_H264 = FOURCC('H', '2', '6', '4'),
+
+  // Match any fourcc.
+  FOURCC_ANY = -1,
+};
+
+enum FourCCBpp {
+  // Canonical fourcc codes used in our code.
+  FOURCC_BPP_I420 = 12,
+  FOURCC_BPP_I422 = 16,
+  FOURCC_BPP_I444 = 24,
+  FOURCC_BPP_I411 = 12,
+  FOURCC_BPP_I400 = 8,
+  FOURCC_BPP_NV21 = 12,
+  FOURCC_BPP_NV12 = 12,
+  FOURCC_BPP_YUY2 = 16,
+  FOURCC_BPP_UYVY = 16,
+  FOURCC_BPP_M420 = 12,
+  FOURCC_BPP_Q420 = 12,
+  FOURCC_BPP_ARGB = 32,
+  FOURCC_BPP_BGRA = 32,
+  FOURCC_BPP_ABGR = 32,
+  FOURCC_BPP_RGBA = 32,
+  FOURCC_BPP_24BG = 24,
+  FOURCC_BPP_RAW  = 24,
+  FOURCC_BPP_RGBP = 16,
+  FOURCC_BPP_RGBO = 16,
+  FOURCC_BPP_R444 = 16,
+  FOURCC_BPP_RGGB = 8,
+  FOURCC_BPP_BGGR = 8,
+  FOURCC_BPP_GRBG = 8,
+  FOURCC_BPP_GBRG = 8,
+  FOURCC_BPP_YV12 = 12,
+  FOURCC_BPP_YV16 = 16,
+  FOURCC_BPP_YV24 = 24,
+  FOURCC_BPP_YU12 = 12,
+  FOURCC_BPP_J420 = 12,
+  FOURCC_BPP_J400 = 8,
+  FOURCC_BPP_MJPG = 0,  // 0 means unknown.
+  FOURCC_BPP_H264 = 0,
+  FOURCC_BPP_IYUV = 12,
+  FOURCC_BPP_YU16 = 16,
+  FOURCC_BPP_YU24 = 24,
+  FOURCC_BPP_YUYV = 16,
+  FOURCC_BPP_YUVS = 16,
+  FOURCC_BPP_HDYC = 16,
+  FOURCC_BPP_2VUY = 16,
+  FOURCC_BPP_JPEG = 1,
+  FOURCC_BPP_DMB1 = 1,
+  FOURCC_BPP_BA81 = 8,
+  FOURCC_BPP_RGB3 = 24,
+  FOURCC_BPP_BGR3 = 24,
+  FOURCC_BPP_CM32 = 32,
+  FOURCC_BPP_CM24 = 24,
+
+  // Match any fourcc.
+  FOURCC_BPP_ANY  = 0,  // 0 means unknown.
+};
+
+// Converts fourcc aliases into canonical ones.
+LIBYUV_API uint32 CanonicalFourCC(uint32 fourcc);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_VIDEO_COMMON_H_  NOLINT
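
As a usage sketch (illustrative only; the helper below is an assumption, not
part of this patch), CanonicalFourCC folds the alias table above into
canonical codes before format dispatch:

    // Hypothetical helper: fold capture-format aliases before comparing.
    #include "libyuv/video_common.h"

    bool IsI420(uint32 fourcc) {
      // Per the alias table, FOURCC_IYUV canonicalizes to FOURCC_I420.
      return libyuv::CanonicalFourCC(fourcc) == libyuv::FOURCC_I420;
    }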
--- /dev/null
+++ b/third_party/libyuv/source/compare.cc
@@ -1,0 +1,325 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/compare.h"
+
+#include <float.h>
+#include <math.h>
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// A hash seed of 5381 is recommended.
+// Internal C version of HashDjb2 with int sized count for efficiency.
+uint32 HashDjb2_C(const uint8* src, int count, uint32 seed);
+
+// This module is for Visual C x86
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(_M_IX86) || \
+    (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))))
+#define HAS_HASHDJB2_SSE41
+uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed);
+
+#if _MSC_VER >= 1700
+#define HAS_HASHDJB2_AVX2
+uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed);
+#endif
+
+#endif  // HAS_HASHDJB2_SSE41
+
+// A hash seed of 5381 is recommended.
+LIBYUV_API
+uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
+  const int kBlockSize = 1 << 15;  // 32768;
+  int remainder;
+  uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C;
+#if defined(HAS_HASHDJB2_SSE41)
+  if (TestCpuFlag(kCpuHasSSE41)) {
+    HashDjb2_SSE = HashDjb2_SSE41;
+  }
+#endif
+#if defined(HAS_HASHDJB2_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    HashDjb2_SSE = HashDjb2_AVX2;
+  }
+#endif
+
+  while (count >= (uint64)(kBlockSize)) {
+    seed = HashDjb2_SSE(src, kBlockSize, seed);
+    src += kBlockSize;
+    count -= kBlockSize;
+  }
+  remainder = (int)(count) & ~15;
+  if (remainder) {
+    seed = HashDjb2_SSE(src, remainder, seed);
+    src += remainder;
+    count -= remainder;
+  }
+  remainder = (int)(count) & 15;
+  if (remainder) {
+    seed = HashDjb2_C(src, remainder, seed);
+  }
+  return seed;
+}
+
+uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
+#if !defined(LIBYUV_DISABLE_NEON) && \
+    (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+#define HAS_SUMSQUAREERROR_NEON
+uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
+#endif
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+#define HAS_SUMSQUAREERROR_SSE2
+uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
+#endif
+// Visual C 2012 required for AVX2.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && _MSC_VER >= 1700
+#define HAS_SUMSQUAREERROR_AVX2
+uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count);
+#endif
+
+// TODO(fbarchard): Refactor into row function.
+LIBYUV_API
+uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
+                             int count) {
+  // SumSquareError returns values 0 to 65535 for each squared difference.
+  // Up to 65536 of those can be summed and remain within a uint32.
+  // After each block of 65536 pixels, accumulate into a uint64.
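+  // 65536 * 255^2 = 4,262,068,224, which still fits in a uint32
+  // (max 4,294,967,295).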
+  const int kBlockSize = 65536;
+  int remainder = count & (kBlockSize - 1) & ~31;
+  uint64 sse = 0;
+  int i;
+  uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
+      SumSquareError_C;
+#if defined(HAS_SUMSQUAREERROR_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SumSquareError = SumSquareError_NEON;
+  }
+#endif
+#if defined(HAS_SUMSQUAREERROR_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) &&
+      IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) {
+    // Note: only used for multiples of 16, so count is not checked.
+    SumSquareError = SumSquareError_SSE2;
+  }
+#endif
+#if defined(HAS_SUMSQUAREERROR_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    // Note: only used for multiples of 32, so count is not checked.
+    SumSquareError = SumSquareError_AVX2;
+  }
+#endif
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: sse)
+#endif
+  for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
+    sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
+  }
+  src_a += count & ~(kBlockSize - 1);
+  src_b += count & ~(kBlockSize - 1);
+  if (remainder) {
+    sse += SumSquareError(src_a, src_b, remainder);
+    src_a += remainder;
+    src_b += remainder;
+  }
+  remainder = count & 31;
+  if (remainder) {
+    sse += SumSquareError_C(src_a, src_b, remainder);
+  }
+  return sse;
+}
+
+LIBYUV_API
+uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
+                                  const uint8* src_b, int stride_b,
+                                  int width, int height) {
+  uint64 sse = 0;
+  int h;
+  // Coalesce rows.
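+  // When the stride equals the width, the plane is contiguous and can be
+  // processed as a single row of width * height pixels.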
+  if (stride_a == width &&
+      stride_b == width) {
+    width *= height;
+    height = 1;
+    stride_a = stride_b = 0;
+  }
+  for (h = 0; h < height; ++h) {
+    sse += ComputeSumSquareError(src_a, src_b, width);
+    src_a += stride_a;
+    src_b += stride_b;
+  }
+  return sse;
+}
+
+LIBYUV_API
+double SumSquareErrorToPsnr(uint64 sse, uint64 count) {
+  double psnr;
+  if (sse > 0) {
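+    // Note: mse here is the reciprocal of the usual mean squared error
+    // (count / sse), so 255^2 * mse equals 255^2 / MSE.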
+    double mse = (double)(count) / (double)(sse);
+    psnr = 10.0 * log10(255.0 * 255.0 * mse);
+  } else {
+    psnr = kMaxPsnr;      // Limit to prevent divide by 0
+  }
+
+  if (psnr > kMaxPsnr)
+    psnr = kMaxPsnr;
+
+  return psnr;
+}
+
+LIBYUV_API
+double CalcFramePsnr(const uint8* src_a, int stride_a,
+                     const uint8* src_b, int stride_b,
+                     int width, int height) {
+  const uint64 samples = width * height;
+  const uint64 sse = ComputeSumSquareErrorPlane(src_a, stride_a,
+                                                src_b, stride_b,
+                                                width, height);
+  return SumSquareErrorToPsnr(sse, samples);
+}
+
+LIBYUV_API
+double I420Psnr(const uint8* src_y_a, int stride_y_a,
+                const uint8* src_u_a, int stride_u_a,
+                const uint8* src_v_a, int stride_v_a,
+                const uint8* src_y_b, int stride_y_b,
+                const uint8* src_u_b, int stride_u_b,
+                const uint8* src_v_b, int stride_v_b,
+                int width, int height) {
+  const uint64 sse_y = ComputeSumSquareErrorPlane(src_y_a, stride_y_a,
+                                                  src_y_b, stride_y_b,
+                                                  width, height);
+  const int width_uv = (width + 1) >> 1;
+  const int height_uv = (height + 1) >> 1;
+  const uint64 sse_u = ComputeSumSquareErrorPlane(src_u_a, stride_u_a,
+                                                  src_u_b, stride_u_b,
+                                                  width_uv, height_uv);
+  const uint64 sse_v = ComputeSumSquareErrorPlane(src_v_a, stride_v_a,
+                                                  src_v_b, stride_v_b,
+                                                  width_uv, height_uv);
+  const uint64 samples = width * height + 2 * (width_uv * height_uv);
+  const uint64 sse = sse_y + sse_u + sse_v;
+  return SumSquareErrorToPsnr(sse, samples);
+}
+
+static const int64 cc1 =  26634;  // 64^2 * (.01 * 255)^2
+static const int64 cc2 = 239708;  // 64^2 * (.03 * 255)^2
+
+static double Ssim8x8_C(const uint8* src_a, int stride_a,
+                        const uint8* src_b, int stride_b) {
+  int64 sum_a = 0;
+  int64 sum_b = 0;
+  int64 sum_sq_a = 0;
+  int64 sum_sq_b = 0;
+  int64 sum_axb = 0;
+
+  int i;
+  for (i = 0; i < 8; ++i) {
+    int j;
+    for (j = 0; j < 8; ++j) {
+      sum_a += src_a[j];
+      sum_b += src_b[j];
+      sum_sq_a += src_a[j] * src_a[j];
+      sum_sq_b += src_b[j] * src_b[j];
+      sum_axb += src_a[j] * src_b[j];
+    }
+
+    src_a += stride_a;
+    src_b += stride_b;
+  }
+
+  {
+    const int64 count = 64;
+    // Scale the constants by the number of pixels.
+    const int64 c1 = (cc1 * count * count) >> 12;
+    const int64 c2 = (cc2 * count * count) >> 12;
+
+    const int64 sum_a_x_sum_b = sum_a * sum_b;
+
+    const int64 ssim_n = (2 * sum_a_x_sum_b + c1) *
+                         (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2);
+
+    const int64 sum_a_sq = sum_a*sum_a;
+    const int64 sum_b_sq = sum_b*sum_b;
+
+    const int64 ssim_d = (sum_a_sq + sum_b_sq + c1) *
+                         (count * sum_sq_a - sum_a_sq +
+                          count * sum_sq_b - sum_b_sq + c2);
+
+    if (ssim_d == 0.0) {
+      return DBL_MAX;
+    }
+    return ssim_n * 1.0 / ssim_d;
+  }
+}
+
+// We use an 8x8 moving window, with each window starting on a 4x4 pixel
+// grid. Such an arrangement allows the windows to overlap block boundaries
+// and so penalize blocking artifacts.
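+// For example, windows start at (0, 0), (0, 4), (4, 0), ... so neighboring
+// windows overlap by 4 pixels in each direction.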
+LIBYUV_API
+double CalcFrameSsim(const uint8* src_a, int stride_a,
+                     const uint8* src_b, int stride_b,
+                     int width, int height) {
+  int samples = 0;
+  double ssim_total = 0;
+  double (*Ssim8x8)(const uint8* src_a, int stride_a,
+                    const uint8* src_b, int stride_b) = Ssim8x8_C;
+
+  // Sample points start at each 4x4 grid location.
+  int i;
+  for (i = 0; i < height - 8; i += 4) {
+    int j;
+    for (j = 0; j < width - 8; j += 4) {
+      ssim_total += Ssim8x8(src_a + j, stride_a, src_b + j, stride_b);
+      samples++;
+    }
+
+    src_a += stride_a * 4;
+    src_b += stride_b * 4;
+  }
+
+  ssim_total /= samples;
+  return ssim_total;
+}
+
+LIBYUV_API
+double I420Ssim(const uint8* src_y_a, int stride_y_a,
+                const uint8* src_u_a, int stride_u_a,
+                const uint8* src_v_a, int stride_v_a,
+                const uint8* src_y_b, int stride_y_b,
+                const uint8* src_u_b, int stride_u_b,
+                const uint8* src_v_b, int stride_v_b,
+                int width, int height) {
+  const double ssim_y = CalcFrameSsim(src_y_a, stride_y_a,
+                                      src_y_b, stride_y_b, width, height);
+  const int width_uv = (width + 1) >> 1;
+  const int height_uv = (height + 1) >> 1;
+  const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a,
+                                      src_u_b, stride_u_b,
+                                      width_uv, height_uv);
+  const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a,
+                                      src_v_b, stride_v_b,
+                                      width_uv, height_uv);
+  return ssim_y * 0.8 + 0.1 * (ssim_u + ssim_v);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
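
As a usage sketch (illustrative only; the wrapper and the packed buffer
layout below are assumptions, not part of this patch), the entry points
above compose like this for two tightly packed I420 frames:

    // Hypothetical wrapper: full-frame PSNR for two packed I420 buffers.
    #include "libyuv/compare.h"

    double I420FramePsnr(const uint8* a, const uint8* b,
                         int width, int height) {
      const int half_w = (width + 1) / 2;
      const int half_h = (height + 1) / 2;
      const uint8* a_u = a + width * height;     // U follows the Y plane.
      const uint8* a_v = a_u + half_w * half_h;  // V follows the U plane.
      const uint8* b_u = b + width * height;
      const uint8* b_v = b_u + half_w * half_h;
      return libyuv::I420Psnr(a, width, a_u, half_w, a_v, half_w,
                              b, width, b_u, half_w, b_v, half_w,
                              width, height);
    }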
--- /dev/null
+++ b/third_party/libyuv/source/compare_common.cc
@@ -1,0 +1,42 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) {
+  uint32 sse = 0u;
+  int i;
+  for (i = 0; i < count; ++i) {
+    int diff = src_a[i] - src_b[i];
+    sse += (uint32)(diff * diff);
+  }
+  return sse;
+}
+
+// A hash seed of 5381 is recommended.
+// Internal C version of HashDjb2 with int sized count for efficiency.
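+// Each step computes hash = hash * 33 + src[i]; with the recommended seed of
+// 5381 this is the classic djb2 string hash.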
+uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
+  uint32 hash = seed;
+  int i;
+  for (i = 0; i < count; ++i) {
+    hash += (hash << 5) + src[i];
+  }
+  return hash;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
--- /dev/null
+++ b/third_party/libyuv/source/compare_neon.cc
@@ -1,0 +1,64 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
+
+uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
+  volatile uint32 sse;
+  asm volatile (
+    "vmov.u8    q8, #0                         \n"
+    "vmov.u8    q10, #0                        \n"
+    "vmov.u8    q9, #0                         \n"
+    "vmov.u8    q11, #0                        \n"
+
+    ".p2align  2                               \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"
+    MEMACCESS(1)
+    "vld1.8     {q1}, [%1]!                    \n"
+    "subs       %2, %2, #16                    \n"
+    "vsubl.u8   q2, d0, d2                     \n"
+    "vsubl.u8   q3, d1, d3                     \n"
+    "vmlal.s16  q8, d4, d4                     \n"
+    "vmlal.s16  q9, d6, d6                     \n"
+    "vmlal.s16  q10, d5, d5                    \n"
+    "vmlal.s16  q11, d7, d7                    \n"
+    "bgt        1b                             \n"
+
+    "vadd.u32   q8, q8, q9                     \n"
+    "vadd.u32   q10, q10, q11                  \n"
+    "vadd.u32   q11, q8, q10                   \n"
+    "vpaddl.u32 q1, q11                        \n"
+    "vadd.u64   d0, d2, d3                     \n"
+    "vmov.32    %3, d0[0]                      \n"
+    : "+r"(src_a),
+      "+r"(src_b),
+      "+r"(count),
+      "=r"(sse)
+    :
+    : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
+  return sse;
+}
+
+#endif  // __ARM_NEON__
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
--- /dev/null
+++ b/third_party/libyuv/source/compare_posix.cc
@@ -1,0 +1,158 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+
+uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
+  uint32 sse;
+  asm volatile (  // NOLINT
+    "pxor      %%xmm0,%%xmm0                   \n"
+    "pxor      %%xmm5,%%xmm5                   \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm1         \n"
+    "lea       " MEMLEA(0x10, 0) ",%0          \n"
+    "movdqa    " MEMACCESS(1) ",%%xmm2         \n"
+    "lea       " MEMLEA(0x10, 1) ",%1          \n"
+    "sub       $0x10,%2                        \n"
+    "movdqa    %%xmm1,%%xmm3                   \n"
+    "psubusb   %%xmm2,%%xmm1                   \n"
+    "psubusb   %%xmm3,%%xmm2                   \n"
+    "por       %%xmm2,%%xmm1                   \n"
+    "movdqa    %%xmm1,%%xmm2                   \n"
+    "punpcklbw %%xmm5,%%xmm1                   \n"
+    "punpckhbw %%xmm5,%%xmm2                   \n"
+    "pmaddwd   %%xmm1,%%xmm1                   \n"
+    "pmaddwd   %%xmm2,%%xmm2                   \n"
+    "paddd     %%xmm1,%%xmm0                   \n"
+    "paddd     %%xmm2,%%xmm0                   \n"
+    "jg        1b                              \n"
+
+    "pshufd    $0xee,%%xmm0,%%xmm1             \n"
+    "paddd     %%xmm1,%%xmm0                   \n"
+    "pshufd    $0x1,%%xmm0,%%xmm1              \n"
+    "paddd     %%xmm1,%%xmm0                   \n"
+    "movd      %%xmm0,%3                       \n"
+
+  : "+r"(src_a),      // %0
+    "+r"(src_b),      // %1
+    "+r"(count),      // %2
+    "=g"(sse)         // %3
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+  );  // NOLINT
+  return sse;
+}
+
+#endif  // defined(__x86_64__) || defined(__i386__)
+
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
+#define HAS_HASHDJB2_SSE41
+static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16
+static uvec32 kHashMul0 = {
+  0x0c3525e1,  // 33 ^ 15
+  0xa3476dc1,  // 33 ^ 14
+  0x3b4039a1,  // 33 ^ 13
+  0x4f5f0981,  // 33 ^ 12
+};
+static uvec32 kHashMul1 = {
+  0x30f35d61,  // 33 ^ 11
+  0x855cb541,  // 33 ^ 10
+  0x040a9121,  // 33 ^ 9
+  0x747c7101,  // 33 ^ 8
+};
+static uvec32 kHashMul2 = {
+  0xec41d4e1,  // 33 ^ 7
+  0x4cfa3cc1,  // 33 ^ 6
+  0x025528a1,  // 33 ^ 5
+  0x00121881,  // 33 ^ 4
+};
+static uvec32 kHashMul3 = {
+  0x00008c61,  // 33 ^ 3
+  0x00000441,  // 33 ^ 2
+  0x00000021,  // 33 ^ 1
+  0x00000001,  // 33 ^ 0
+};
+
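+// The SSE4.1 loop hashes 16 bytes per iteration: the running hash is
+// multiplied by 33^16 while each input byte is scaled by its matching
+// power of 33 from the tables above.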
+uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
+  uint32 hash;
+  asm volatile (  // NOLINT
+    "movd      %2,%%xmm0                       \n"
+    "pxor      %%xmm7,%%xmm7                   \n"
+    "movdqa    %4,%%xmm6                       \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
+    "lea       " MEMLEA(0x10, 0) ",%0          \n"
+    "pmulld    %%xmm6,%%xmm0                   \n"
+    "movdqa    %5,%%xmm5                       \n"
+    "movdqa    %%xmm1,%%xmm2                   \n"
+    "punpcklbw %%xmm7,%%xmm2                   \n"
+    "movdqa    %%xmm2,%%xmm3                   \n"
+    "punpcklwd %%xmm7,%%xmm3                   \n"
+    "pmulld    %%xmm5,%%xmm3                   \n"
+    "movdqa    %6,%%xmm5                       \n"
+    "movdqa    %%xmm2,%%xmm4                   \n"
+    "punpckhwd %%xmm7,%%xmm4                   \n"
+    "pmulld    %%xmm5,%%xmm4                   \n"
+    "movdqa    %7,%%xmm5                       \n"
+    "punpckhbw %%xmm7,%%xmm1                   \n"
+    "movdqa    %%xmm1,%%xmm2                   \n"
+    "punpcklwd %%xmm7,%%xmm2                   \n"
+    "pmulld    %%xmm5,%%xmm2                   \n"
+    "movdqa    %8,%%xmm5                       \n"
+    "punpckhwd %%xmm7,%%xmm1                   \n"
+    "pmulld    %%xmm5,%%xmm1                   \n"
+    "paddd     %%xmm4,%%xmm3                   \n"
+    "paddd     %%xmm2,%%xmm1                   \n"
+    "sub       $0x10,%1                        \n"
+    "paddd     %%xmm3,%%xmm1                   \n"
+    "pshufd    $0xe,%%xmm1,%%xmm2              \n"
+    "paddd     %%xmm2,%%xmm1                   \n"
+    "pshufd    $0x1,%%xmm1,%%xmm2              \n"
+    "paddd     %%xmm2,%%xmm1                   \n"
+    "paddd     %%xmm1,%%xmm0                   \n"
+    "jg        1b                              \n"
+    "movd      %%xmm0,%3                       \n"
+  : "+r"(src),        // %0
+    "+r"(count),      // %1
+    "+rm"(seed),      // %2
+    "=g"(hash)        // %3
+  : "m"(kHash16x33),  // %4
+    "m"(kHashMul0),   // %5
+    "m"(kHashMul1),   // %6
+    "m"(kHashMul2),   // %7
+    "m"(kHashMul3)    // %8
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+  );  // NOLINT
+  return hash;
+}
+#endif  // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
--- /dev/null
+++ b/third_party/libyuv/source/compare_win.cc
@@ -1,0 +1,232 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+
+__declspec(naked) __declspec(align(16))
+uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
+  __asm {
+    mov        eax, [esp + 4]    // src_a
+    mov        edx, [esp + 8]    // src_b
+    mov        ecx, [esp + 12]   // count
+    pxor       xmm0, xmm0
+    pxor       xmm5, xmm5
+
+    align      4
+  wloop:
+    movdqa     xmm1, [eax]
+    lea        eax,  [eax + 16]
+    movdqa     xmm2, [edx]
+    lea        edx,  [edx + 16]
+    sub        ecx, 16
+    movdqa     xmm3, xmm1  // abs trick
+    psubusb    xmm1, xmm2
+    psubusb    xmm2, xmm3
+    por        xmm1, xmm2
+    movdqa     xmm2, xmm1
+    punpcklbw  xmm1, xmm5
+    punpckhbw  xmm2, xmm5
+    pmaddwd    xmm1, xmm1
+    pmaddwd    xmm2, xmm2
+    paddd      xmm0, xmm1
+    paddd      xmm0, xmm2
+    jg         wloop
+
+    pshufd     xmm1, xmm0, 0xee
+    paddd      xmm0, xmm1
+    pshufd     xmm1, xmm0, 0x01
+    paddd      xmm0, xmm1
+    movd       eax, xmm0
+    ret
+  }
+}
+
+// Visual C 2012 required for AVX2.
+#if _MSC_VER >= 1700
+// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
+#pragma warning(disable: 4752)
+__declspec(naked) __declspec(align(16))
+uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
+  __asm {
+    mov        eax, [esp + 4]    // src_a
+    mov        edx, [esp + 8]    // src_b
+    mov        ecx, [esp + 12]   // count
+    vpxor      ymm0, ymm0, ymm0  // sum
+    vpxor      ymm5, ymm5, ymm5  // constant 0 for unpck
+    sub        edx, eax
+
+    align      4
+  wloop:
+    vmovdqu    ymm1, [eax]
+    vmovdqu    ymm2, [eax + edx]
+    lea        eax,  [eax + 32]
+    sub        ecx, 32
+    vpsubusb   ymm3, ymm1, ymm2  // abs difference trick
+    vpsubusb   ymm2, ymm2, ymm1
+    vpor       ymm1, ymm2, ymm3
+    vpunpcklbw ymm2, ymm1, ymm5  // u16.  mutates order.
+    vpunpckhbw ymm1, ymm1, ymm5
+    vpmaddwd   ymm2, ymm2, ymm2  // square + hadd to u32.
+    vpmaddwd   ymm1, ymm1, ymm1
+    vpaddd     ymm0, ymm0, ymm1
+    vpaddd     ymm0, ymm0, ymm2
+    jg         wloop
+
+    vpshufd    ymm1, ymm0, 0xee  // 3, 2 + 1, 0 both lanes.
+    vpaddd     ymm0, ymm0, ymm1
+    vpshufd    ymm1, ymm0, 0x01  // 1 + 0 both lanes.
+    vpaddd     ymm0, ymm0, ymm1
+    vpermq     ymm1, ymm0, 0x02  // high + low lane.
+    vpaddd     ymm0, ymm0, ymm1
+    vmovd      eax, xmm0
+    vzeroupper
+    ret
+  }
+}
+#endif  // _MSC_VER >= 1700
+
+#define HAS_HASHDJB2_SSE41
+static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16
+static uvec32 kHashMul0 = {
+  0x0c3525e1,  // 33 ^ 15
+  0xa3476dc1,  // 33 ^ 14
+  0x3b4039a1,  // 33 ^ 13
+  0x4f5f0981,  // 33 ^ 12
+};
+static uvec32 kHashMul1 = {
+  0x30f35d61,  // 33 ^ 11
+  0x855cb541,  // 33 ^ 10
+  0x040a9121,  // 33 ^ 9
+  0x747c7101,  // 33 ^ 8
+};
+static uvec32 kHashMul2 = {
+  0xec41d4e1,  // 33 ^ 7
+  0x4cfa3cc1,  // 33 ^ 6
+  0x025528a1,  // 33 ^ 5
+  0x00121881,  // 33 ^ 4
+};
+static uvec32 kHashMul3 = {
+  0x00008c61,  // 33 ^ 3
+  0x00000441,  // 33 ^ 2
+  0x00000021,  // 33 ^ 1
+  0x00000001,  // 33 ^ 0
+};
+
+// 27: 66 0F 38 40 C6     pmulld      xmm0,xmm6
+// 44: 66 0F 38 40 DD     pmulld      xmm3,xmm5
+// 59: 66 0F 38 40 E5     pmulld      xmm4,xmm5
+// 72: 66 0F 38 40 D5     pmulld      xmm2,xmm5
+// 83: 66 0F 38 40 CD     pmulld      xmm1,xmm5
+#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
+    _asm _emit 0x40 _asm _emit reg
+
+__declspec(naked) __declspec(align(16))
+uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
+  __asm {
+    mov        eax, [esp + 4]    // src
+    mov        ecx, [esp + 8]    // count
+    movd       xmm0, [esp + 12]  // seed
+
+    pxor       xmm7, xmm7        // constant 0 for unpck
+    movdqa     xmm6, kHash16x33
+
+    align      4
+  wloop:
+    movdqu     xmm1, [eax]       // src[0-15]
+    lea        eax, [eax + 16]
+    pmulld(0xc6)                 // pmulld      xmm0,xmm6  hash *= 33 ^ 16
+    movdqa     xmm5, kHashMul0
+    movdqa     xmm2, xmm1
+    punpcklbw  xmm2, xmm7        // src[0-7]
+    movdqa     xmm3, xmm2
+    punpcklwd  xmm3, xmm7        // src[0-3]
+    pmulld(0xdd)                 // pmulld     xmm3, xmm5
+    movdqa     xmm5, kHashMul1
+    movdqa     xmm4, xmm2
+    punpckhwd  xmm4, xmm7        // src[4-7]
+    pmulld(0xe5)                 // pmulld     xmm4, xmm5
+    movdqa     xmm5, kHashMul2
+    punpckhbw  xmm1, xmm7        // src[8-15]
+    movdqa     xmm2, xmm1
+    punpcklwd  xmm2, xmm7        // src[8-11]
+    pmulld(0xd5)                 // pmulld     xmm2, xmm5
+    movdqa     xmm5, kHashMul3
+    punpckhwd  xmm1, xmm7        // src[12-15]
+    pmulld(0xcd)                 // pmulld     xmm1, xmm5
+    paddd      xmm3, xmm4        // add 16 results
+    paddd      xmm1, xmm2
+    sub        ecx, 16
+    paddd      xmm1, xmm3
+
+    pshufd     xmm2, xmm1, 0x0e  // upper 2 dwords
+    paddd      xmm1, xmm2
+    pshufd     xmm2, xmm1, 0x01
+    paddd      xmm1, xmm2
+    paddd      xmm0, xmm1
+    jg         wloop
+
+    movd       eax, xmm0         // return hash
+    ret
+  }
+}
+
+// Visual C 2012 required for AVX2.
+#if _MSC_VER >= 1700
+__declspec(naked) __declspec(align(16))
+uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
+  __asm {
+    mov        eax, [esp + 4]    // src
+    mov        ecx, [esp + 8]    // count
+    movd       xmm0, [esp + 12]  // seed
+    movdqa     xmm6, kHash16x33
+
+    align      4
+  wloop:
+    vpmovzxbd  xmm3, dword ptr [eax]  // src[0-3]
+    pmulld     xmm0, xmm6  // hash *= 33 ^ 16
+    vpmovzxbd  xmm4, dword ptr [eax + 4]  // src[4-7]
+    pmulld     xmm3, kHashMul0
+    vpmovzxbd  xmm2, dword ptr [eax + 8]  // src[8-11]
+    pmulld     xmm4, kHashMul1
+    vpmovzxbd  xmm1, dword ptr [eax + 12]  // src[12-15]
+    pmulld     xmm2, kHashMul2
+    lea        eax, [eax + 16]
+    pmulld     xmm1, kHashMul3
+    paddd      xmm3, xmm4        // add 16 results
+    paddd      xmm1, xmm2
+    sub        ecx, 16
+    paddd      xmm1, xmm3
+    pshufd     xmm2, xmm1, 0x0e  // upper 2 dwords
+    paddd      xmm1, xmm2
+    pshufd     xmm2, xmm1, 0x01
+    paddd      xmm1, xmm2
+    paddd      xmm0, xmm1
+    jg         wloop
+
+    movd       eax, xmm0         // return hash
+    ret
+  }
+}
+#endif  // _MSC_VER >= 1700
+
+#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
--- /dev/null
+++ b/third_party/libyuv/source/convert.cc
@@ -1,0 +1,1513 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert.h"
+
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "libyuv/scale.h"  // For ScalePlane()
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
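+// SUBSAMPLE rounds away from zero: e.g. SUBSAMPLE(5, 1, 1) = 3 and
+// SUBSAMPLE(-5, 1, 1) = -3, i.e. ceiling division for plane dimensions.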
+static __inline int Abs(int v) {
+  return v >= 0 ? v : -v;
+}
+
+// Any I4xx To I420 format with mirroring.
+static int I4xxToI420(const uint8* src_y, int src_stride_y,
+                      const uint8* src_u, int src_stride_u,
+                      const uint8* src_v, int src_stride_v,
+                      uint8* dst_y, int dst_stride_y,
+                      uint8* dst_u, int dst_stride_u,
+                      uint8* dst_v, int dst_stride_v,
+                      int src_y_width, int src_y_height,
+                      int src_uv_width, int src_uv_height) {
+  const int dst_y_width = Abs(src_y_width);
+  const int dst_y_height = Abs(src_y_height);
+  const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1);
+  const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1);
+  if (src_y_width == 0 || src_y_height == 0 ||
+      src_uv_width == 0 || src_uv_height == 0) {
+    return -1;
+  }
+  ScalePlane(src_y, src_stride_y, src_y_width, src_y_height,
+             dst_y, dst_stride_y, dst_y_width, dst_y_height,
+             kFilterBilinear);
+  ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height,
+             dst_u, dst_stride_u, dst_uv_width, dst_uv_height,
+             kFilterBilinear);
+  ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height,
+             dst_v, dst_stride_v, dst_uv_width, dst_uv_height,
+             kFilterBilinear);
+  return 0;
+}
+
+// Copy I420 with optional flipping
+// TODO(fbarchard): Use Scale plane which supports mirroring, but ensure
+// it does row coalescing.
+LIBYUV_API
+int I420Copy(const uint8* src_y, int src_stride_y,
+             const uint8* src_u, int src_stride_u,
+             const uint8* src_v, int src_stride_v,
+             uint8* dst_y, int dst_stride_y,
+             uint8* dst_u, int dst_stride_u,
+             uint8* dst_v, int dst_stride_v,
+             int width, int height) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if (!src_y || !src_u || !src_v ||
+      !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+
+  if (dst_y) {
+    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  }
+  // Copy UV planes.
+  CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
+  CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
+  return 0;
+}
+
+// 422 chroma is 1/2 width, 1x height
+// 420 chroma is 1/2 width, 1/2 height
+LIBYUV_API
+int I422ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  const int src_uv_width = SUBSAMPLE(width, 1, 1);
+  return I4xxToI420(src_y, src_stride_y,
+                    src_u, src_stride_u,
+                    src_v, src_stride_v,
+                    dst_y, dst_stride_y,
+                    dst_u, dst_stride_u,
+                    dst_v, dst_stride_v,
+                    width, height,
+                    src_uv_width, height);
+}
+
+// 444 chroma is 1x width, 1x height
+// 420 chroma is 1/2 width, 1/2 height
+LIBYUV_API
+int I444ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  return I4xxToI420(src_y, src_stride_y,
+                    src_u, src_stride_u,
+                    src_v, src_stride_v,
+                    dst_y, dst_stride_y,
+                    dst_u, dst_stride_u,
+                    dst_v, dst_stride_v,
+                    width, height,
+                    width, height);
+}
+
+// 411 chroma is 1/4 width, 1x height
+// 420 chroma is 1/2 width, 1/2 height
+LIBYUV_API
+int I411ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  const int src_uv_width = SUBSAMPLE(width, 3, 2);
+  return I4xxToI420(src_y, src_stride_y,
+                    src_u, src_stride_u,
+                    src_v, src_stride_v,
+                    dst_y, dst_stride_y,
+                    dst_u, dst_stride_u,
+                    dst_v, dst_stride_v,
+                    width, height,
+                    src_uv_width, height);
+}
+
+// I400 is greyscale, typically used in MJPG.
+LIBYUV_API
+int I400ToI420(const uint8* src_y, int src_stride_y,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if (!src_y || !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  SetPlane(dst_u, dst_stride_u, halfwidth, halfheight, 128);
+  SetPlane(dst_v, dst_stride_v, halfwidth, halfheight, 128);
+  return 0;
+}
+
+static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
+                       uint8* dst, int dst_stride,
+                       int width, int height) {
+  int y;
+  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+#if defined(HAS_COPYROW_X86)
+  if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
+    CopyRow = CopyRow_X86;
+  }
+#endif
+#if defined(HAS_COPYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
+      IS_ALIGNED(src, 16) &&
+      IS_ALIGNED(src_stride_0, 16) && IS_ALIGNED(src_stride_1, 16) &&
+      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+    CopyRow = CopyRow_SSE2;
+  }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+  if (TestCpuFlag(kCpuHasERMS)) {
+    CopyRow = CopyRow_ERMS;
+  }
+#endif
+#if defined(HAS_COPYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
+    CopyRow = CopyRow_NEON;
+  }
+#endif
+#if defined(HAS_COPYROW_MIPS)
+  if (TestCpuFlag(kCpuHasMIPS)) {
+    CopyRow = CopyRow_MIPS;
+  }
+#endif
+
+  // Copy plane
+  for (y = 0; y < height - 1; y += 2) {
+    CopyRow(src, dst, width);
+    CopyRow(src + src_stride_0, dst + dst_stride, width);
+    src += src_stride_0 + src_stride_1;
+    dst += dst_stride * 2;
+  }
+  if (height & 1) {
+    CopyRow(src, dst, width);
+  }
+}
+
+// Support converting from FOURCC_M420
+// Useful for bandwidth-constrained transports like USB 1.0 and 2.0 and for
+// easy conversion to I420.
+// M420 format description:
+// M420 is row biplanar 420: 2 rows of Y and 1 row of UV.
+// Chroma is half width / half height. (420)
+// src_stride_m420 is row planar. Normally this will be the width in pixels.
+//   The UV plane is half width, but 2 values, so src_stride_m420 applies to
+//   this as well as the two Y planes.
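+// Layout, repeating every 3 rows of src_stride_m420 bytes:
+//   Y row 0, Y row 1, interleaved UV row (U0 V0 U1 V1 ...), Y row 2, ...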
+static int X420ToI420(const uint8* src_y,
+                      int src_stride_y0, int src_stride_y1,
+                      const uint8* src_uv, int src_stride_uv,
+                      uint8* dst_y, int dst_stride_y,
+                      uint8* dst_u, int dst_stride_u,
+                      uint8* dst_v, int dst_stride_v,
+                      int width, int height) {
+  int y;
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
+      SplitUVRow_C;
+  if (!src_y || !src_uv ||
+      !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_u = dst_u + (halfheight - 1) * dst_stride_u;
+    dst_v = dst_v + (halfheight - 1) * dst_stride_v;
+    dst_stride_y = -dst_stride_y;
+    dst_stride_u = -dst_stride_u;
+    dst_stride_v = -dst_stride_v;
+  }
+  // Coalesce rows.
+  if (src_stride_y0 == width &&
+      src_stride_y1 == width &&
+      dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y0 = src_stride_y1 = dst_stride_y = 0;
+  }
+  // Coalesce rows.
+  if (src_stride_uv == halfwidth * 2 &&
+      dst_stride_u == halfwidth &&
+      dst_stride_v == halfwidth) {
+    halfwidth *= halfheight;
+    halfheight = 1;
+    src_stride_uv = dst_stride_u = dst_stride_v = 0;
+  }
+#if defined(HAS_SPLITUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
+    SplitUVRow = SplitUVRow_Any_SSE2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      SplitUVRow = SplitUVRow_Unaligned_SSE2;
+      if (IS_ALIGNED(src_uv, 16) && IS_ALIGNED(src_stride_uv, 16) &&
+          IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
+          IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
+        SplitUVRow = SplitUVRow_SSE2;
+      }
+    }
+  }
+#endif
+#if defined(HAS_SPLITUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
+    SplitUVRow = SplitUVRow_Any_AVX2;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      SplitUVRow = SplitUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_SPLITUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
+    SplitUVRow = SplitUVRow_Any_NEON;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      SplitUVRow = SplitUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SPLITUVROW_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && halfwidth >= 16) {
+    SplitUVRow = SplitUVRow_Any_MIPS_DSPR2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      SplitUVRow = SplitUVRow_Unaligned_MIPS_DSPR2;
+      if (IS_ALIGNED(src_uv, 4) && IS_ALIGNED(src_stride_uv, 4) &&
+          IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) &&
+          IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) {
+        SplitUVRow = SplitUVRow_MIPS_DSPR2;
+      }
+    }
+  }
+#endif
+
+  if (dst_y) {
+    if (src_stride_y0 == src_stride_y1) {
+      CopyPlane(src_y, src_stride_y0, dst_y, dst_stride_y, width, height);
+    } else {
+      CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y,
+                 width, height);
+    }
+  }
+
+  for (y = 0; y < halfheight; ++y) {
+    // Copy a row of UV.
+    SplitUVRow(src_uv, dst_u, dst_v, halfwidth);
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+    src_uv += src_stride_uv;
+  }
+  return 0;
+}
+
+// Convert NV12 to I420.
+LIBYUV_API
+int NV12ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_uv, int src_stride_uv,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  return X420ToI420(src_y, src_stride_y, src_stride_y,
+                    src_uv, src_stride_uv,
+                    dst_y, dst_stride_y,
+                    dst_u, dst_stride_u,
+                    dst_v, dst_stride_v,
+                    width, height);
+}
+
+// Convert NV21 to I420.  Same as NV12 but u and v pointers swapped.
+LIBYUV_API
+int NV21ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_vu, int src_stride_vu,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  return X420ToI420(src_y, src_stride_y, src_stride_y,
+                    src_vu, src_stride_vu,
+                    dst_y, dst_stride_y,
+                    dst_v, dst_stride_v,
+                    dst_u, dst_stride_u,
+                    width, height);
+}
+
+// Convert M420 to I420.
+LIBYUV_API
+int M420ToI420(const uint8* src_m420, int src_stride_m420,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2,
+                    src_m420 + src_stride_m420 * 2, src_stride_m420 * 3,
+                    dst_y, dst_stride_y,
+                    dst_u, dst_stride_u,
+                    dst_v, dst_stride_v,
+                    width, height);
+}
+
+// Convert Q420 to I420.
+// Format is rows of YY/YUYV
+LIBYUV_API
+int Q420ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_yuy2, int src_stride_yuy2,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int y;
+  int halfheight = (height + 1) >> 1;
+  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+  void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
+      int pix) = YUY2ToUV422Row_C;
+  void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int pix) =
+      YUY2ToYRow_C;
+  if (!src_y || !src_yuy2 ||
+      !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_u = dst_u + (halfheight - 1) * dst_stride_u;
+    dst_v = dst_v + (halfheight - 1) * dst_stride_v;
+    dst_stride_y = -dst_stride_y;
+    dst_stride_u = -dst_stride_u;
+    dst_stride_v = -dst_stride_v;
+  }
+  // Use CopyRow for the Y-only rows of Q420, copying them to the I420 Y plane.
+#if defined(HAS_COPYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
+    CopyRow = CopyRow_NEON;
+  }
+#endif
+#if defined(HAS_COPYROW_X86)
+  if (IS_ALIGNED(width, 4)) {
+    CopyRow = CopyRow_X86;
+  }
+#endif
+#if defined(HAS_COPYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
+      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
+      IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+    CopyRow = CopyRow_SSE2;
+  }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+  if (TestCpuFlag(kCpuHasERMS)) {
+    CopyRow = CopyRow_ERMS;
+  }
+#endif
+#if defined(HAS_COPYROW_MIPS)
+  if (TestCpuFlag(kCpuHasMIPS)) {
+    CopyRow = CopyRow_MIPS;
+  }
+#endif
+
+#if defined(HAS_YUY2TOYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+    YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
+    YUY2ToYRow = YUY2ToYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
+      YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
+      if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
+        YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
+        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+          YUY2ToYRow = YUY2ToYRow_SSE2;
+        }
+      }
+    }
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+    YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
+    YUY2ToYRow = YUY2ToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      YUY2ToUV422Row = YUY2ToUV422Row_AVX2;
+      YUY2ToYRow = YUY2ToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    YUY2ToYRow = YUY2ToYRow_Any_NEON;
+    if (width >= 16) {
+      YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
+    }
+    if (IS_ALIGNED(width, 16)) {
+      YUY2ToYRow = YUY2ToYRow_NEON;
+      YUY2ToUV422Row = YUY2ToUV422Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height - 1; y += 2) {
+    CopyRow(src_y, dst_y, width);
+    src_y += src_stride_y;
+    dst_y += dst_stride_y;
+
+    YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
+    YUY2ToYRow(src_yuy2, dst_y, width);
+    src_yuy2 += src_stride_yuy2;
+    dst_y += dst_stride_y;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  if (height & 1) {
+    CopyRow(src_y, dst_y, width);
+    YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
+  }
+  return 0;
+}
+
+// Convert YUY2 to I420.
+LIBYUV_API
+int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int y;
+  void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2,
+      uint8* dst_u, uint8* dst_v, int pix) = YUY2ToUVRow_C;
+  void (*YUY2ToYRow)(const uint8* src_yuy2,
+      uint8* dst_y, int pix) = YUY2ToYRow_C;
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+    src_stride_yuy2 = -src_stride_yuy2;
+  }
+#if defined(HAS_YUY2TOYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+    YUY2ToUVRow = YUY2ToUVRow_Any_SSE2;
+    YUY2ToYRow = YUY2ToYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      YUY2ToUVRow = YUY2ToUVRow_Unaligned_SSE2;
+      YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
+      if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
+        YUY2ToUVRow = YUY2ToUVRow_SSE2;
+        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+          YUY2ToYRow = YUY2ToYRow_SSE2;
+        }
+      }
+    }
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+    YUY2ToUVRow = YUY2ToUVRow_Any_AVX2;
+    YUY2ToYRow = YUY2ToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      YUY2ToUVRow = YUY2ToUVRow_AVX2;
+      YUY2ToYRow = YUY2ToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    YUY2ToYRow = YUY2ToYRow_Any_NEON;
+    if (width >= 16) {
+      YUY2ToUVRow = YUY2ToUVRow_Any_NEON;
+    }
+    if (IS_ALIGNED(width, 16)) {
+      YUY2ToYRow = YUY2ToYRow_NEON;
+      YUY2ToUVRow = YUY2ToUVRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height - 1; y += 2) {
+    YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width);
+    YUY2ToYRow(src_yuy2, dst_y, width);
+    YUY2ToYRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width);
+    src_yuy2 += src_stride_yuy2 * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  if (height & 1) {
+    YUY2ToUVRow(src_yuy2, 0, dst_u, dst_v, width);
+    YUY2ToYRow(src_yuy2, dst_y, width);
+  }
+  return 0;
+}
+
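+// Illustrative call for the converter above; the buffer names and the
+// 640x480 frame size are assumptions, not part of this file:
+//
+//   int w = 640, h = 480;
+//   YUY2ToI420(yuy2, w * 2,          // packed source, 2 bytes per pixel
+//              dst_y, w,             // full-resolution luma plane
+//              dst_u, (w + 1) / 2,   // 2x2-subsampled chroma planes
+//              dst_v, (w + 1) / 2,
+//              w, h);
+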
+// Convert UYVY to I420.
+LIBYUV_API
+int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int y;
+  void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy,
+      uint8* dst_u, uint8* dst_v, int pix) = UYVYToUVRow_C;
+  void (*UYVYToYRow)(const uint8* src_uyvy,
+      uint8* dst_y, int pix) = UYVYToYRow_C;
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+    src_stride_uyvy = -src_stride_uyvy;
+  }
+#if defined(HAS_UYVYTOYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+    UYVYToUVRow = UYVYToUVRow_Any_SSE2;
+    UYVYToYRow = UYVYToYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      UYVYToUVRow = UYVYToUVRow_Unaligned_SSE2;
+      UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
+      if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
+        UYVYToUVRow = UYVYToUVRow_SSE2;
+        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+          UYVYToYRow = UYVYToYRow_SSE2;
+        }
+      }
+    }
+  }
+#endif
+#if defined(HAS_UYVYTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+    UYVYToUVRow = UYVYToUVRow_Any_AVX2;
+    UYVYToYRow = UYVYToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      UYVYToUVRow = UYVYToUVRow_AVX2;
+      UYVYToYRow = UYVYToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_UYVYTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    UYVYToYRow = UYVYToYRow_Any_NEON;
+    if (width >= 16) {
+      UYVYToUVRow = UYVYToUVRow_Any_NEON;
+    }
+    if (IS_ALIGNED(width, 16)) {
+      UYVYToYRow = UYVYToYRow_NEON;
+      UYVYToUVRow = UYVYToUVRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height - 1; y += 2) {
+    UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
+    UYVYToYRow(src_uyvy, dst_y, width);
+    UYVYToYRow(src_uyvy + src_stride_uyvy, dst_y + dst_stride_y, width);
+    src_uyvy += src_stride_uyvy * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  if (height & 1) {
+    UYVYToUVRow(src_uyvy, 0, dst_u, dst_v, width);
+    UYVYToYRow(src_uyvy, dst_y, width);
+  }
+  return 0;
+}
+
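+// UYVYToI420 is structurally identical to YUY2ToI420; the two formats
+// differ only in byte order within each two-pixel group:
+//
+//   YUY2: Y0 U0 Y1 V0 | Y2 U1 Y3 V1 | ...   (luma first)
+//   UYVY: U0 Y0 V0 Y1 | U1 Y2 V1 Y3 | ...   (chroma first)
+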
+// Convert ARGB to I420.
+LIBYUV_API
+int ARGBToI420(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int y;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+  if (!src_argb ||
+      !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
+      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+        ARGBToUVRow = ARGBToUVRow_SSSE3;
+        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+          ARGBToYRow = ARGBToYRow_SSSE3;
+        }
+      }
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+    if (width >= 16) {
+      ARGBToUVRow = ARGBToUVRow_Any_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        ARGBToUVRow = ARGBToUVRow_NEON;
+      }
+    }
+  }
+#endif
+
+  for (y = 0; y < height - 1; y += 2) {
+    ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width);
+    ARGBToYRow(src_argb, dst_y, width);
+    ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+    src_argb += src_stride_argb * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  if (height & 1) {
+    ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
+    ARGBToYRow(src_argb, dst_y, width);
+  }
+  return 0;
+}
+
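+// Worked sizing example for the I420 output of the converter above (the
+// 1280x720 frame is an illustrative assumption): the Y plane is
+// 1280 x 720 bytes and each chroma plane is
+// ((1280 + 1) / 2) x ((720 + 1) / 2) = 640 x 360 bytes, because U and V
+// are subsampled 2x2. Rounding up covers odd widths and heights, which
+// is why callers compute chroma strides as (width + 1) / 2:
+//
+//   ARGBToI420(argb, width * 4,
+//              dst_y, width,
+//              dst_u, (width + 1) / 2,
+//              dst_v, (width + 1) / 2,
+//              width, height);
+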
+// Convert BGRA to I420.
+LIBYUV_API
+int BGRAToI420(const uint8* src_bgra, int src_stride_bgra,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int y;
+  void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra,
+      uint8* dst_u, uint8* dst_v, int width) = BGRAToUVRow_C;
+  void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int pix) =
+      BGRAToYRow_C;
+  if (!src_bgra ||
+      !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_bgra = src_bgra + (height - 1) * src_stride_bgra;
+    src_stride_bgra = -src_stride_bgra;
+  }
+#if defined(HAS_BGRATOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    BGRAToUVRow = BGRAToUVRow_Any_SSSE3;
+    BGRAToYRow = BGRAToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      BGRAToUVRow = BGRAToUVRow_Unaligned_SSSE3;
+      BGRAToYRow = BGRAToYRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16)) {
+        BGRAToUVRow = BGRAToUVRow_SSSE3;
+        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+          BGRAToYRow = BGRAToYRow_SSSE3;
+        }
+      }
+    }
+  }
+#elif defined(HAS_BGRATOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    BGRAToYRow = BGRAToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      BGRAToYRow = BGRAToYRow_NEON;
+    }
+    if (width >= 16) {
+      BGRAToUVRow = BGRAToUVRow_Any_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        BGRAToUVRow = BGRAToUVRow_NEON;
+      }
+    }
+  }
+#endif
+
+  for (y = 0; y < height - 1; y += 2) {
+    BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width);
+    BGRAToYRow(src_bgra, dst_y, width);
+    BGRAToYRow(src_bgra + src_stride_bgra, dst_y + dst_stride_y, width);
+    src_bgra += src_stride_bgra * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  if (height & 1) {
+    BGRAToUVRow(src_bgra, 0, dst_u, dst_v, width);
+    BGRAToYRow(src_bgra, dst_y, width);
+  }
+  return 0;
+}
+
+// Convert ABGR to I420.
+LIBYUV_API
+int ABGRToI420(const uint8* src_abgr, int src_stride_abgr,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int y;
+  void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr,
+      uint8* dst_u, uint8* dst_v, int width) = ABGRToUVRow_C;
+  void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int pix) =
+      ABGRToYRow_C;
+  if (!src_abgr ||
+      !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+    src_stride_abgr = -src_stride_abgr;
+  }
+#if defined(HAS_ABGRTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
+    ABGRToYRow = ABGRToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToUVRow = ABGRToUVRow_Unaligned_SSSE3;
+      ABGRToYRow = ABGRToYRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_abgr, 16) && IS_ALIGNED(src_stride_abgr, 16)) {
+        ABGRToUVRow = ABGRToUVRow_SSSE3;
+        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+          ABGRToYRow = ABGRToYRow_SSSE3;
+        }
+      }
+    }
+  }
+#elif defined(HAS_ABGRTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ABGRToYRow = ABGRToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ABGRToYRow = ABGRToYRow_NEON;
+    }
+    if (width >= 16) {
+      ABGRToUVRow = ABGRToUVRow_Any_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        ABGRToUVRow = ABGRToUVRow_NEON;
+      }
+    }
+  }
+#endif
+
+  for (y = 0; y < height - 1; y += 2) {
+    ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width);
+    ABGRToYRow(src_abgr, dst_y, width);
+    ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width);
+    src_abgr += src_stride_abgr * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  if (height & 1) {
+    ABGRToUVRow(src_abgr, 0, dst_u, dst_v, width);
+    ABGRToYRow(src_abgr, dst_y, width);
+  }
+  return 0;
+}
+
+// Convert RGBA to I420.
+LIBYUV_API
+int RGBAToI420(const uint8* src_rgba, int src_stride_rgba,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int y;
+  void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba,
+      uint8* dst_u, uint8* dst_v, int width) = RGBAToUVRow_C;
+  void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int pix) =
+      RGBAToYRow_C;
+  if (!src_rgba ||
+      !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_rgba = src_rgba + (height - 1) * src_stride_rgba;
+    src_stride_rgba = -src_stride_rgba;
+  }
+#if defined(HAS_RGBATOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    RGBAToUVRow = RGBAToUVRow_Any_SSSE3;
+    RGBAToYRow = RGBAToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      RGBAToUVRow = RGBAToUVRow_Unaligned_SSSE3;
+      RGBAToYRow = RGBAToYRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_rgba, 16) && IS_ALIGNED(src_stride_rgba, 16)) {
+        RGBAToUVRow = RGBAToUVRow_SSSE3;
+        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+          RGBAToYRow = RGBAToYRow_SSSE3;
+        }
+      }
+    }
+  }
+#elif defined(HAS_RGBATOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    RGBAToYRow = RGBAToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RGBAToYRow = RGBAToYRow_NEON;
+    }
+    if (width >= 16) {
+      RGBAToUVRow = RGBAToUVRow_Any_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        RGBAToUVRow = RGBAToUVRow_NEON;
+      }
+    }
+  }
+#endif
+
+  for (y = 0; y < height - 1; y += 2) {
+    RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width);
+    RGBAToYRow(src_rgba, dst_y, width);
+    RGBAToYRow(src_rgba + src_stride_rgba, dst_y + dst_stride_y, width);
+    src_rgba += src_stride_rgba * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  if (height & 1) {
+    RGBAToUVRow(src_rgba, 0, dst_u, dst_v, width);
+    RGBAToYRow(src_rgba, dst_y, width);
+  }
+  return 0;
+}
+
+// Convert RGB24 to I420.
+LIBYUV_API
+int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
+                uint8* dst_y, int dst_stride_y,
+                uint8* dst_u, int dst_stride_u,
+                uint8* dst_v, int dst_stride_v,
+                int width, int height) {
+  int y;
+#if defined(HAS_RGB24TOYROW_NEON)
+  void (*RGB24ToUVRow)(const uint8* src_rgb24, int src_stride_rgb24,
+      uint8* dst_u, uint8* dst_v, int width) = RGB24ToUVRow_C;
+  void (*RGB24ToYRow)(const uint8* src_rgb24, uint8* dst_y, int pix) =
+      RGB24ToYRow_C;
+#else
+  void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+      RGB24ToARGBRow_C;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+  // Allocate 2 rows of ARGB.
+  const int kRowSize = (width * 4 + 15) & ~15;
+  align_buffer_64(row, kRowSize * 2);
+#endif
+  if (!src_rgb24 || !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+    src_stride_rgb24 = -src_stride_rgb24;
+  }
+
+#if defined(HAS_RGB24TOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    RGB24ToYRow = RGB24ToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RGB24ToYRow = RGB24ToYRow_NEON;
+    }
+    if (width >= 16) {
+      RGB24ToUVRow = RGB24ToUVRow_Any_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        RGB24ToUVRow = RGB24ToUVRow_NEON;
+      }
+    }
+  }
+#else  // HAS_RGB24TOYROW_NEON
+
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+        ARGBToYRow = ARGBToYRow_SSSE3;
+      }
+    }
+  }
+#endif  // HAS_ARGBTOYROW_SSSE3
+#endif  // HAS_RGB24TOYROW_NEON
+
+  for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_RGB24TOYROW_NEON)
+    RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
+    RGB24ToYRow(src_rgb24, dst_y, width);
+    RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
+#else
+    RGB24ToARGBRow(src_rgb24, row, width);
+    RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
+    ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+    ARGBToYRow(row, dst_y, width);
+    ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+    src_rgb24 += src_stride_rgb24 * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  if (height & 1) {
+#if defined(HAS_RGB24TOYROW_NEON)
+    RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width);
+    RGB24ToYRow(src_rgb24, dst_y, width);
+#else
+    RGB24ToARGBRow(src_rgb24, row, width);
+    ARGBToUVRow(row, 0, dst_u, dst_v, width);
+    ARGBToYRow(row, dst_y, width);
+#endif
+  }
+#if !defined(HAS_RGB24TOYROW_NEON)
+  free_aligned_buffer_64(row);
+#endif
+  return 0;
+}
+
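+// The two-row staging buffer above rounds each converted ARGB row up to
+// a 16-byte multiple so the SSSE3 row functions see aligned rows. A
+// worked example: width = 101 pixels -> 101 * 4 = 404 bytes, and
+// (404 + 15) & ~15 = 416, so each staged row occupies 416 bytes and
+// align_buffer_64 reserves 832 bytes for the pair of rows.
+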
+// Convert RAW to I420.
+LIBYUV_API
+int RAWToI420(const uint8* src_raw, int src_stride_raw,
+              uint8* dst_y, int dst_stride_y,
+              uint8* dst_u, int dst_stride_u,
+              uint8* dst_v, int dst_stride_v,
+              int width, int height) {
+  int y;
+#if defined(HAS_RAWTOYROW_NEON)
+  void (*RAWToUVRow)(const uint8* src_raw, int src_stride_raw,
+      uint8* dst_u, uint8* dst_v, int width) = RAWToUVRow_C;
+  void (*RAWToYRow)(const uint8* src_raw, uint8* dst_y, int pix) =
+      RAWToYRow_C;
+#else
+  void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+      RAWToARGBRow_C;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+  // Allocate 2 rows of ARGB.
+  const int kRowSize = (width * 4 + 15) & ~15;
+  align_buffer_64(row, kRowSize * 2);
+#endif
+  if (!src_raw || !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_raw = src_raw + (height - 1) * src_stride_raw;
+    src_stride_raw = -src_stride_raw;
+  }
+
+#if defined(HAS_RAWTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    RAWToYRow = RAWToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RAWToYRow = RAWToYRow_NEON;
+    }
+    if (width >= 16) {
+      RAWToUVRow = RAWToUVRow_Any_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        RAWToUVRow = RAWToUVRow_NEON;
+      }
+    }
+  }
+#else  // HAS_RAWTOYROW_NEON
+
+#if defined(HAS_RAWTOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      RAWToARGBRow = RAWToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+        ARGBToYRow = ARGBToYRow_SSSE3;
+      }
+    }
+  }
+#endif  // HAS_ARGBTOYROW_SSSE3
+#endif  // HAS_RAWTOYROW_NEON
+
+  for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_RAWTOYROW_NEON)
+    RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
+    RAWToYRow(src_raw, dst_y, width);
+    RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
+#else
+    RAWToARGBRow(src_raw, row, width);
+    RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width);
+    ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+    ARGBToYRow(row, dst_y, width);
+    ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+    src_raw += src_stride_raw * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  if (height & 1) {
+#if defined(HAS_RAWTOYROW_NEON)
+    RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
+    RAWToYRow(src_raw, dst_y, width);
+#else
+    RAWToARGBRow(src_raw, row, width);
+    ARGBToUVRow(row, 0, dst_u, dst_v, width);
+    ARGBToYRow(row, dst_y, width);
+#endif
+  }
+#if !defined(HAS_RAWTOYROW_NEON)
+  free_aligned_buffer_64(row);
+#endif
+  return 0;
+}
+
+// Convert RGB565 to I420.
+LIBYUV_API
+int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
+                 uint8* dst_y, int dst_stride_y,
+                 uint8* dst_u, int dst_stride_u,
+                 uint8* dst_v, int dst_stride_v,
+                 int width, int height) {
+  int y;
+#if defined(HAS_RGB565TOYROW_NEON)
+  void (*RGB565ToUVRow)(const uint8* src_rgb565, int src_stride_rgb565,
+      uint8* dst_u, uint8* dst_v, int width) = RGB565ToUVRow_C;
+  void (*RGB565ToYRow)(const uint8* src_rgb565, uint8* dst_y, int pix) =
+      RGB565ToYRow_C;
+#else
+  void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+      RGB565ToARGBRow_C;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+  // Allocate 2 rows of ARGB.
+  const int kRowSize = (width * 4 + 15) & ~15;
+  align_buffer_64(row, kRowSize * 2);
+#endif
+  if (!src_rgb565 || !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565;
+    src_stride_rgb565 = -src_stride_rgb565;
+  }
+
+#if defined(HAS_RGB565TOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    RGB565ToYRow = RGB565ToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RGB565ToYRow = RGB565ToYRow_NEON;
+    }
+    if (width >= 16) {
+      RGB565ToUVRow = RGB565ToUVRow_Any_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        RGB565ToUVRow = RGB565ToUVRow_NEON;
+      }
+    }
+  }
+#else  // HAS_RGB565TOYROW_NEON
+
+#if defined(HAS_RGB565TOARGBROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
+    RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+        ARGBToYRow = ARGBToYRow_SSSE3;
+      }
+    }
+  }
+#endif  // HAS_ARGBTOYROW_SSSE3
+#endif  // HAS_RGB565TOYROW_NEON
+
+  for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_RGB565TOYROW_NEON)
+    RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width);
+    RGB565ToYRow(src_rgb565, dst_y, width);
+    RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width);
+#else
+    RGB565ToARGBRow(src_rgb565, row, width);
+    RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width);
+    ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+    ARGBToYRow(row, dst_y, width);
+    ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+    src_rgb565 += src_stride_rgb565 * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  if (height & 1) {
+#if defined(HAS_RGB565TOYROW_NEON)
+    RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width);
+    RGB565ToYRow(src_rgb565, dst_y, width);
+#else
+    RGB565ToARGBRow(src_rgb565, row, width);
+    ARGBToUVRow(row, 0, dst_u, dst_v, width);
+    ARGBToYRow(row, dst_y, width);
+#endif
+  }
+#if !defined(HAS_RGB565TOYROW_NEON)
+  free_aligned_buffer_64(row);
+#endif
+  return 0;
+}
+
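+// On the non-NEON path above, each 16-bit RGB565 pixel is first widened
+// to ARGB. A sketch of the standard 565 expansion, which replicates the
+// high bits into the low bits so full-scale values map to 0xff:
+//
+//   uint8 b = src[0] & 0x1f;                           // 5 bits
+//   uint8 g = (src[0] >> 5) | ((src[1] & 0x07) << 3);  // 6 bits
+//   uint8 r = src[1] >> 3;                             // 5 bits
+//   dst[0] = (b << 3) | (b >> 2);  // blue,  5 -> 8 bits
+//   dst[1] = (g << 2) | (g >> 4);  // green, 6 -> 8 bits
+//   dst[2] = (r << 3) | (r >> 2);  // red,   5 -> 8 bits
+//   dst[3] = 255u;                 // alpha
+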
+// Convert ARGB1555 to I420.
+LIBYUV_API
+int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
+                   uint8* dst_y, int dst_stride_y,
+                   uint8* dst_u, int dst_stride_u,
+                   uint8* dst_v, int dst_stride_v,
+                   int width, int height) {
+  int y;
+#if defined(HAS_ARGB1555TOYROW_NEON)
+  void (*ARGB1555ToUVRow)(const uint8* src_argb1555, int src_stride_argb1555,
+      uint8* dst_u, uint8* dst_v, int width) = ARGB1555ToUVRow_C;
+  void (*ARGB1555ToYRow)(const uint8* src_argb1555, uint8* dst_y, int pix) =
+      ARGB1555ToYRow_C;
+#else
+  void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+      ARGB1555ToARGBRow_C;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+  // Allocate 2 rows of ARGB.
+  const int kRowSize = (width * 4 + 15) & ~15;
+  align_buffer_64(row, kRowSize * 2);
+#endif
+  if (!src_argb1555 || !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555;
+    src_stride_argb1555 = -src_stride_argb1555;
+  }
+
+#if defined(HAS_ARGB1555TOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGB1555ToYRow = ARGB1555ToYRow_NEON;
+    }
+    if (width >= 16) {
+      ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        ARGB1555ToUVRow = ARGB1555ToUVRow_NEON;
+      }
+    }
+  }
+#else  // HAS_ARGB1555TOYROW_NEON
+
+#if defined(HAS_ARGB1555TOARGBROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
+    ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+        ARGBToYRow = ARGBToYRow_SSSE3;
+      }
+    }
+  }
+#endif  // HAS_ARGBTOYROW_SSSE3
+#endif  // HAS_ARGB1555TOYROW_NEON
+
+  for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_ARGB1555TOYROW_NEON)
+    ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
+    ARGB1555ToYRow(src_argb1555, dst_y, width);
+    ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y,
+                   width);
+#else
+    ARGB1555ToARGBRow(src_argb1555, row, width);
+    ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize,
+                      width);
+    ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+    ARGBToYRow(row, dst_y, width);
+    ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+    src_argb1555 += src_stride_argb1555 * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  if (height & 1) {
+#if defined(HAS_ARGB1555TOYROW_NEON)
+    ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width);
+    ARGB1555ToYRow(src_argb1555, dst_y, width);
+#else
+    ARGB1555ToARGBRow(src_argb1555, row, width);
+    ARGBToUVRow(row, 0, dst_u, dst_v, width);
+    ARGBToYRow(row, dst_y, width);
+#endif
+  }
+#if !defined(HAS_ARGB1555TOYROW_NEON)
+  free_aligned_buffer_64(row);
+#endif
+  return 0;
+}
+
+// Convert ARGB4444 to I420.
+LIBYUV_API
+int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
+                   uint8* dst_y, int dst_stride_y,
+                   uint8* dst_u, int dst_stride_u,
+                   uint8* dst_v, int dst_stride_v,
+                   int width, int height) {
+  int y;
+#if defined(HAS_ARGB4444TOYROW_NEON)
+  void (*ARGB4444ToUVRow)(const uint8* src_argb4444, int src_stride_argb4444,
+      uint8* dst_u, uint8* dst_v, int width) = ARGB4444ToUVRow_C;
+  void (*ARGB4444ToYRow)(const uint8* src_argb4444, uint8* dst_y, int pix) =
+      ARGB4444ToYRow_C;
+#else
+  void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+      ARGB4444ToARGBRow_C;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+  // Allocate 2 rows of ARGB.
+  const int kRowSize = (width * 4 + 15) & ~15;
+  align_buffer_64(row, kRowSize * 2);
+#endif
+  if (!src_argb4444 || !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444;
+    src_stride_argb4444 = -src_stride_argb4444;
+  }
+
+#if defined(HAS_ARGB4444TOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGB4444ToYRow = ARGB4444ToYRow_NEON;
+    }
+    if (width >= 16) {
+      ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        ARGB4444ToUVRow = ARGB4444ToUVRow_NEON;
+      }
+    }
+  }
+#else  // HAS_ARGB4444TOYROW_NEON
+
+#if defined(HAS_ARGB4444TOARGBROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
+    ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+        ARGBToYRow = ARGBToYRow_SSSE3;
+      }
+    }
+  }
+#endif  // HAS_ARGBTOYROW_SSSE3
+#endif  // HAS_ARGB4444TOYROW_NEON
+
+  for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_ARGB4444TOYROW_NEON)
+    ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width);
+    ARGB4444ToYRow(src_argb4444, dst_y, width);
+    ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y,
+                   width);
+#else
+    ARGB4444ToARGBRow(src_argb4444, row, width);
+    ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize,
+                      width);
+    ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+    ARGBToYRow(row, dst_y, width);
+    ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+    src_argb4444 += src_stride_argb4444 * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  if (height & 1) {
+#if defined(HAS_ARGB4444TOYROW_NEON)
+    ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width);
+    ARGB4444ToYRow(src_argb4444, dst_y, width);
+#else
+    ARGB4444ToARGBRow(src_argb4444, row, width);
+    ARGBToUVRow(row, 0, dst_u, dst_v, width);
+    ARGBToYRow(row, dst_y, width);
+#endif
+  }
+#if !defined(HAS_ARGB4444TOYROW_NEON)
+  free_aligned_buffer_64(row);
+#endif
+  return 0;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
--- /dev/null
+++ b/third_party/libyuv/source/convert_argb.cc
@@ -1,0 +1,938 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_argb.h"
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/format_conversion.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "libyuv/rotate_argb.h"
+#include "libyuv/row.h"
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Copy ARGB with optional vertical flipping.
+LIBYUV_API
+int ARGBCopy(const uint8* src_argb, int src_stride_argb,
+             uint8* dst_argb, int dst_stride_argb,
+             int width, int height) {
+  if (!src_argb || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+
+  CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+            width * 4, height);
+  return 0;
+}
+
+// Convert I444 to ARGB.
+LIBYUV_API
+int I444ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  int y;
+  void (*I444ToARGBRow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = I444ToARGBRow_C;
+  if (!src_y || !src_u || !src_v ||
+      !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_y == width &&
+      src_stride_u == width &&
+      src_stride_v == width &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
+  }
+#if defined(HAS_I444TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+    I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I444ToARGBRow = I444ToARGBRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+        I444ToARGBRow = I444ToARGBRow_SSSE3;
+      }
+    }
+  }
+#elif defined(HAS_I444TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    I444ToARGBRow = I444ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I444ToARGBRow = I444ToARGBRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I444ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+  }
+  return 0;
+}
+
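+// The "coalesce rows" test above recurs throughout this file: when every
+// stride exactly equals its row width in bytes, the planes are
+// contiguous, so the whole image can be handled as a single long row.
+// For example, a packed 64x48 I444 image becomes one call on a "row" of
+// 64 * 48 = 3072 pixels, removing per-row loop overhead and letting the
+// SIMD row function run longer between setups.
+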
+// Convert I422 to ARGB.
+LIBYUV_API
+int I422ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  int y;
+  void (*I422ToARGBRow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = I422ToARGBRow_C;
+  if (!src_y || !src_u || !src_v ||
+      !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_y == width &&
+      src_stride_u * 2 == width &&
+      src_stride_v * 2 == width &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
+  }
+#if defined(HAS_I422TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+        I422ToARGBRow = I422ToARGBRow_SSSE3;
+      }
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
+    I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToARGBRow = I422ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    I422ToARGBRow = I422ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
+      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+    I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+  }
+  return 0;
+}
+
+// Convert I411 to ARGB.
+LIBYUV_API
+int I411ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  int y;
+  void (*I411ToARGBRow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = I411ToARGBRow_C;
+  if (!src_y || !src_u || !src_v ||
+      !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_y == width &&
+      src_stride_u * 4 == width &&
+      src_stride_v * 4 == width &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
+  }
+#if defined(HAS_I411TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+    I411ToARGBRow = I411ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I411ToARGBRow = I411ToARGBRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+        I411ToARGBRow = I411ToARGBRow_SSSE3;
+      }
+    }
+  }
+#elif defined(HAS_I411TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    I411ToARGBRow = I411ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I411ToARGBRow = I411ToARGBRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I411ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+  }
+  return 0;
+}
+
+// Convert I400 to ARGB using the YUV conversion path (reference version).
+LIBYUV_API
+int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
+                         uint8* dst_argb, int dst_stride_argb,
+                         int width, int height) {
+  int y;
+  void (*YToARGBRow)(const uint8* y_buf,
+                     uint8* rgb_buf,
+                     int width) = YToARGBRow_C;
+  if (!src_y || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_y == width &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_argb = 0;
+  }
+#if defined(HAS_YTOARGBROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+    YToARGBRow = YToARGBRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      YToARGBRow = YToARGBRow_SSE2;
+    }
+  }
+#elif defined(HAS_YTOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    YToARGBRow = YToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      YToARGBRow = YToARGBRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    YToARGBRow(src_y, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+  }
+  return 0;
+}
+
+// Convert I400 to ARGB by replicating Y into each RGB channel.
+LIBYUV_API
+int I400ToARGB(const uint8* src_y, int src_stride_y,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  int y;
+  void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) =
+      I400ToARGBRow_C;
+  if (!src_y || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+  // Coalesce rows.
+  if (src_stride_y == width &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_argb = 0;
+  }
+#if defined(HAS_I400TOARGBROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
+    I400ToARGBRow = I400ToARGBRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      I400ToARGBRow = I400ToARGBRow_Unaligned_SSE2;
+      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+        I400ToARGBRow = I400ToARGBRow_SSE2;
+      }
+    }
+  }
+#elif defined(HAS_I400TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    I400ToARGBRow = I400ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I400ToARGBRow = I400ToARGBRow_NEON;
+    }
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    I400ToARGBRow(src_y, dst_argb, width);
+    src_y += src_stride_y;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Shuffle table for converting BGRA to ARGB.
+static uvec8 kShuffleMaskBGRAToARGB = {
+  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
+};
+
+// Shuffle table for converting ABGR to ARGB.
+static uvec8 kShuffleMaskABGRToARGB = {
+  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
+};
+
+// Shuffle table for converting RGBA to ARGB.
+static uvec8 kShuffleMaskRGBAToARGB = {
+  1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
+};
+
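+// These tables drive ARGBShuffle with pshufb-style semantics: within
+// each 16-byte group, dst[i] = src[mask[i]]. kShuffleMaskBGRAToARGB
+// ({3, 2, 1, 0, ...}) simply reverses the four bytes of every pixel;
+// since BGRA and ARGB are byte-reversed views of each other, the same
+// mask converts in both directions, which is why ARGBToBGRA below
+// reuses it. Example for one pixel:
+//
+//   src bytes:  s0 s1 s2 s3
+//   mask:        3  2  1  0
+//   dst bytes:  s3 s2 s1 s0
+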
+// Convert BGRA to ARGB.
+LIBYUV_API
+int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return ARGBShuffle(src_bgra, src_stride_bgra,
+                     dst_argb, dst_stride_argb,
+                     (const uint8*)(&kShuffleMaskBGRAToARGB),
+                     width, height);
+}
+
+// Convert ARGB to BGRA (same as BGRAToARGB).
+LIBYUV_API
+int ARGBToBGRA(const uint8* src_bgra, int src_stride_bgra,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return ARGBShuffle(src_bgra, src_stride_bgra,
+                     dst_argb, dst_stride_argb,
+                     (const uint8*)(&kShuffleMaskBGRAToARGB),
+                     width, height);
+}
+
+// Convert ABGR to ARGB.
+LIBYUV_API
+int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return ARGBShuffle(src_abgr, src_stride_abgr,
+                     dst_argb, dst_stride_argb,
+                     (const uint8*)(&kShuffleMaskABGRToARGB),
+                     width, height);
+}
+
+// Convert ARGB to ABGR (same as ABGRToARGB).
+LIBYUV_API
+int ARGBToABGR(const uint8* src_abgr, int src_stride_abgr,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return ARGBShuffle(src_abgr, src_stride_abgr,
+                     dst_argb, dst_stride_argb,
+                     (const uint8*)(&kShuffleMaskABGRToARGB),
+                     width, height);
+}
+
+// Convert RGBA to ARGB.
+LIBYUV_API
+int RGBAToARGB(const uint8* src_rgba, int src_stride_rgba,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return ARGBShuffle(src_rgba, src_stride_rgba,
+                     dst_argb, dst_stride_argb,
+                     (const uint8*)(&kShuffleMaskRGBAToARGB),
+                     width, height);
+}
+
+// Convert RGB24 to ARGB.
+LIBYUV_API
+int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
+                uint8* dst_argb, int dst_stride_argb,
+                int width, int height) {
+  int y;
+  void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+      RGB24ToARGBRow_C;
+  if (!src_rgb24 || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+    src_stride_rgb24 = -src_stride_rgb24;
+  }
+  // Coalesce rows.
+  if (src_stride_rgb24 == width * 3 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_rgb24 = dst_stride_argb = 0;
+  }
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+    RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+    }
+  }
+#elif defined(HAS_RGB24TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RGB24ToARGBRow = RGB24ToARGBRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    RGB24ToARGBRow(src_rgb24, dst_argb, width);
+    src_rgb24 += src_stride_rgb24;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Convert RAW to ARGB.
+LIBYUV_API
+int RAWToARGB(const uint8* src_raw, int src_stride_raw,
+              uint8* dst_argb, int dst_stride_argb,
+              int width, int height) {
+  int y;
+  void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+      RAWToARGBRow_C;
+  if (!src_raw || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_raw = src_raw + (height - 1) * src_stride_raw;
+    src_stride_raw = -src_stride_raw;
+  }
+  // Coalesce rows.
+  if (src_stride_raw == width * 3 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_raw = dst_stride_argb = 0;
+  }
+#if defined(HAS_RAWTOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+    RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      RAWToARGBRow = RAWToARGBRow_SSSE3;
+    }
+  }
+#elif defined(HAS_RAWTOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    RAWToARGBRow = RAWToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RAWToARGBRow = RAWToARGBRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    RAWToARGBRow(src_raw, dst_argb, width);
+    src_raw += src_stride_raw;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Convert RGB565 to ARGB.
+LIBYUV_API
+int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565,
+                 uint8* dst_argb, int dst_stride_argb,
+                 int width, int height) {
+  int y;
+  void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int pix) =
+      RGB565ToARGBRow_C;
+  if (!src_rgb565 || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565;
+    src_stride_rgb565 = -src_stride_rgb565;
+  }
+  // Coalesce rows.
+  if (src_stride_rgb565 == width * 2 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_rgb565 = dst_stride_argb = 0;
+  }
+#if defined(HAS_RGB565TOARGBROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+    RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
+    }
+  }
+#elif defined(HAS_RGB565TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RGB565ToARGBRow = RGB565ToARGBRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    RGB565ToARGBRow(src_rgb565, dst_argb, width);
+    src_rgb565 += src_stride_rgb565;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Convert ARGB1555 to ARGB.
+LIBYUV_API
+int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
+                   uint8* dst_argb, int dst_stride_argb,
+                   int width, int height) {
+  int y;
+  void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb,
+      int pix) = ARGB1555ToARGBRow_C;
+  if (!src_argb1555 || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555;
+    src_stride_argb1555 = -src_stride_argb1555;
+  }
+  // Coalesce rows.
+  if (src_stride_argb1555 == width * 2 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb1555 = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGB1555TOARGBROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+    ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
+    }
+  }
+#elif defined(HAS_ARGB1555TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGB1555ToARGBRow = ARGB1555ToARGBRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGB1555ToARGBRow(src_argb1555, dst_argb, width);
+    src_argb1555 += src_stride_argb1555;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Convert ARGB4444 to ARGB.
+LIBYUV_API
+int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
+                   uint8* dst_argb, int dst_stride_argb,
+                   int width, int height) {
+  int y;
+  void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb,
+      int pix) = ARGB4444ToARGBRow_C;
+  if (!src_argb4444 || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444;
+    src_stride_argb4444 = -src_stride_argb4444;
+  }
+  // Coalesce rows.
+  if (src_stride_argb4444 == width * 2 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb4444 = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGB4444TOARGBROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+    ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
+    }
+  }
+#elif defined(HAS_ARGB4444TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGB4444ToARGBRow = ARGB4444ToARGBRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGB4444ToARGBRow(src_argb4444, dst_argb, width);
+    src_argb4444 += src_stride_argb4444;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Convert NV12 to ARGB.
+LIBYUV_API
+int NV12ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_uv, int src_stride_uv,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  int y;
+  void (*NV12ToARGBRow)(const uint8* y_buf,
+                        const uint8* uv_buf,
+                        uint8* rgb_buf,
+                        int width) = NV12ToARGBRow_C;
+  if (!src_y || !src_uv || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+#if defined(HAS_NV12TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+    NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+        NV12ToARGBRow = NV12ToARGBRow_SSSE3;
+      }
+    }
+  }
+#elif defined(HAS_NV12TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToARGBRow = NV12ToARGBRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    NV12ToARGBRow(src_y, src_uv, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_uv += src_stride_uv;
+    }
+  }
+  return 0;
+}
+
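+// Note the chroma stepping above: NV12 carries one interleaved UV row
+// for every two Y rows, so src_uv advances only after odd rows (y & 1).
+// Illustrative call; buffer names and the 320x240 size are assumptions:
+//
+//   NV12ToARGB(y_plane, 320,    // luma, full resolution
+//              uv_plane, 320,   // one UV row (160 U/V pairs) per 2 Y rows
+//              argb, 320 * 4,
+//              320, 240);
+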
+// Convert NV21 to ARGB.
+LIBYUV_API
+int NV21ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_uv, int src_stride_uv,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  int y;
+  void (*NV21ToARGBRow)(const uint8* y_buf,
+                        const uint8* uv_buf,
+                        uint8* rgb_buf,
+                        int width) = NV21ToARGBRow_C;
+  if (!src_y || !src_uv || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+#if defined(HAS_NV21TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+    NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      NV21ToARGBRow = NV21ToARGBRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+        NV21ToARGBRow = NV21ToARGBRow_SSSE3;
+      }
+    }
+  }
+#endif
+#if defined(HAS_NV21TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    NV21ToARGBRow = NV21ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      NV21ToARGBRow = NV21ToARGBRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    NV21ToARGBRow(src_y, src_uv, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_uv += src_stride_uv;
+    }
+  }
+  return 0;
+}
+
+// Convert M420 to ARGB.
+LIBYUV_API
+int M420ToARGB(const uint8* src_m420, int src_stride_m420,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  int y;
+  void (*NV12ToARGBRow)(const uint8* y_buf,
+                        const uint8* uv_buf,
+                        uint8* rgb_buf,
+                        int width) = NV12ToARGBRow_C;
+  if (!src_m420 || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+#if defined(HAS_NV12TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+    NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+        NV12ToARGBRow = NV12ToARGBRow_SSSE3;
+      }
+    }
+  }
+#elif defined(HAS_NV12TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToARGBRow = NV12ToARGBRow_NEON;
+    }
+  }
+#endif
+
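+  // M420 layout: two rows of Y followed by one row of interleaved UV, so
+  // each pass consumes three source rows and produces two ARGB rows.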
+  for (y = 0; y < height - 1; y += 2) {
+    NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width);
+    NV12ToARGBRow(src_m420 + src_stride_m420, src_m420 + src_stride_m420 * 2,
+                  dst_argb + dst_stride_argb, width);
+    dst_argb += dst_stride_argb * 2;
+    src_m420 += src_stride_m420 * 3;
+  }
+  if (height & 1) {
+    NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width);
+  }
+  return 0;
+}
+
+// Convert YUY2 to ARGB.
+LIBYUV_API
+int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  int y;
+  void (*YUY2ToARGBRow)(const uint8* src_yuy2, uint8* dst_argb, int pix) =
+      YUY2ToARGBRow_C;
+  if (!src_yuy2 || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+    src_stride_yuy2 = -src_stride_yuy2;
+  }
+  // Coalesce rows.
+  if (src_stride_yuy2 == width * 2 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_yuy2 = dst_stride_argb = 0;
+  }
+#if defined(HAS_YUY2TOARGBROW_SSSE3)
+  // Minimum width: the POSIX (GCC) version needs 16, the Windows version 8.
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    YUY2ToARGBRow = YUY2ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      YUY2ToARGBRow = YUY2ToARGBRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16) &&
+          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+        YUY2ToARGBRow = YUY2ToARGBRow_SSSE3;
+      }
+    }
+  }
+#elif defined(HAS_YUY2TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    YUY2ToARGBRow = YUY2ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      YUY2ToARGBRow = YUY2ToARGBRow_NEON;
+    }
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    YUY2ToARGBRow(src_yuy2, dst_argb, width);
+    src_yuy2 += src_stride_yuy2;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Convert UYVY to ARGB.
+LIBYUV_API
+int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  int y;
+  void (*UYVYToARGBRow)(const uint8* src_uyvy, uint8* dst_argb, int pix) =
+      UYVYToARGBRow_C;
+  if (!src_uyvy || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+    src_stride_uyvy = -src_stride_uyvy;
+  }
+  // Coalesce rows.
+  if (src_stride_uyvy == width * 2 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_uyvy = dst_stride_argb = 0;
+  }
+#if defined(HAS_UYVYTOARGBROW_SSSE3)
+  // Minimum width: the POSIX (GCC) version needs 16, the Windows version 8.
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    UYVYToARGBRow = UYVYToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      UYVYToARGBRow = UYVYToARGBRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16) &&
+          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+        UYVYToARGBRow = UYVYToARGBRow_SSSE3;
+      }
+    }
+  }
+#elif defined(HAS_UYVYTOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    UYVYToARGBRow = UYVYToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      UYVYToARGBRow = UYVYToARGBRow_NEON;
+    }
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    UYVYToARGBRow(src_uyvy, dst_argb, width);
+    src_uyvy += src_stride_uyvy;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
--- /dev/null
+++ b/third_party/libyuv/source/convert_from.cc
@@ -1,0 +1,1210 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_from.h"
+
+#include "libyuv/basic_types.h"
+#include "libyuv/convert.h"  // For I420Copy
+#include "libyuv/cpu_id.h"
+#include "libyuv/format_conversion.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "libyuv/scale.h"  // For ScalePlane()
+#include "libyuv/video_common.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define SUBSAMPLE(v, a, s) \
+  (((v) < 0) ? -((-(v) + (a)) >> (s)) : (((v) + (a)) >> (s)))
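+// For example, SUBSAMPLE(7, 1, 1) == 4 and SUBSAMPLE(-7, 1, 1) == -4, so odd
+// dimensions round up in magnitude for top-down and inverted images alike.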
+static __inline int Abs(int v) {
+  return v >= 0 ? v : -v;
+}
+
+// I420 To any I4xx YUV format with mirroring.
+static int I420ToI4xx(const uint8* src_y, int src_stride_y,
+                      const uint8* src_u, int src_stride_u,
+                      const uint8* src_v, int src_stride_v,
+                      uint8* dst_y, int dst_stride_y,
+                      uint8* dst_u, int dst_stride_u,
+                      uint8* dst_v, int dst_stride_v,
+                      int src_y_width, int src_y_height,
+                      int dst_uv_width, int dst_uv_height) {
+  const int dst_y_width = Abs(src_y_width);
+  const int dst_y_height = Abs(src_y_height);
+  const int src_uv_width = SUBSAMPLE(src_y_width, 1, 1);
+  const int src_uv_height = SUBSAMPLE(src_y_height, 1, 1);
+  if (src_y_width == 0 || src_y_height == 0 ||
+      dst_uv_width <= 0 || dst_uv_height <= 0) {
+    return -1;
+  }
+  ScalePlane(src_y, src_stride_y, src_y_width, src_y_height,
+             dst_y, dst_stride_y, dst_y_width, dst_y_height,
+             kFilterBilinear);
+  ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height,
+             dst_u, dst_stride_u, dst_uv_width, dst_uv_height,
+             kFilterBilinear);
+  ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height,
+             dst_v, dst_stride_v, dst_uv_width, dst_uv_height,
+             kFilterBilinear);
+  return 0;
+}
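+// For example, the wrappers below pass dst_uv_width = (Abs(width) + 1) / 2
+// for I422, Abs(width) for I444, and (Abs(width) + 3) / 4 for I411, each
+// with Abs(height) as dst_uv_height.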
+
+// 420 chroma is 1/2 width, 1/2 height
+// 422 chroma is 1/2 width, 1x height
+LIBYUV_API
+int I420ToI422(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  const int dst_uv_width = (Abs(width) + 1) >> 1;
+  const int dst_uv_height = Abs(height);
+  return I420ToI4xx(src_y, src_stride_y,
+                    src_u, src_stride_u,
+                    src_v, src_stride_v,
+                    dst_y, dst_stride_y,
+                    dst_u, dst_stride_u,
+                    dst_v, dst_stride_v,
+                    width, height,
+                    dst_uv_width, dst_uv_height);
+}
+
+// 420 chroma is 1/2 width, 1/2 height
+// 444 chroma is 1x width, 1x height
+LIBYUV_API
+int I420ToI444(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  const int dst_uv_width = Abs(width);
+  const int dst_uv_height = Abs(height);
+  return I420ToI4xx(src_y, src_stride_y,
+                    src_u, src_stride_u,
+                    src_v, src_stride_v,
+                    dst_y, dst_stride_y,
+                    dst_u, dst_stride_u,
+                    dst_v, dst_stride_v,
+                    width, height,
+                    dst_uv_width, dst_uv_height);
+}
+
+// 420 chroma is 1/2 width, 1/2 height
+// 411 chroma is 1/4 width, 1x height
+LIBYUV_API
+int I420ToI411(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  const int dst_uv_width = (Abs(width) + 3) >> 2;
+  const int dst_uv_height = Abs(height);
+  return I420ToI4xx(src_y, src_stride_y,
+                    src_u, src_stride_u,
+                    src_v, src_stride_v,
+                    dst_y, dst_stride_y,
+                    dst_u, dst_stride_u,
+                    dst_v, dst_stride_v,
+                    width, height,
+                    dst_uv_width, dst_uv_height);
+}
+
+// Copy to I400. The source can be I420, I422, I444, I400, NV12 or NV21.
+LIBYUV_API
+int I400Copy(const uint8* src_y, int src_stride_y,
+             uint8* dst_y, int dst_stride_y,
+             int width, int height) {
+  if (!src_y || !dst_y ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  return 0;
+}
+
+LIBYUV_API
+int I422ToYUY2(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_yuy2, int dst_stride_yuy2,
+               int width, int height) {
+  int y;
+  void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
+                        const uint8* src_v, uint8* dst_yuy2, int width) =
+      I422ToYUY2Row_C;
+  if (!src_y || !src_u || !src_v || !dst_yuy2 ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
+    dst_stride_yuy2 = -dst_stride_yuy2;
+  }
+  // Coalesce rows.
+  if (src_stride_y == width &&
+      src_stride_u * 2 == width &&
+      src_stride_v * 2 == width &&
+      dst_stride_yuy2 == width * 2) {
+    width *= height;
+    height = 1;
+    src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0;
+  }
+#if defined(HAS_I422TOYUY2ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToYUY2Row = I422ToYUY2Row_SSE2;
+    }
+  }
+#elif defined(HAS_I422TOYUY2ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToYUY2Row = I422ToYUY2Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+    dst_yuy2 += dst_stride_yuy2;
+  }
+  return 0;
+}
+
+LIBYUV_API
+int I420ToYUY2(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_yuy2, int dst_stride_yuy2,
+               int width, int height) {
+  int y;
+  void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
+                        const uint8* src_v, uint8* dst_yuy2, int width) =
+      I422ToYUY2Row_C;
+  if (!src_y || !src_u || !src_v || !dst_yuy2 ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
+    dst_stride_yuy2 = -dst_stride_yuy2;
+  }
+#if defined(HAS_I422TOYUY2ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToYUY2Row = I422ToYUY2Row_SSE2;
+    }
+  }
+#elif defined(HAS_I422TOYUY2ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToYUY2Row = I422ToYUY2Row_NEON;
+    }
+  }
+#endif
+
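+  // I420 shares one chroma row between two luma rows, so each pass packs two
+  // YUY2 rows from the same row of U and V.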
+  for (y = 0; y < height - 1; y += 2) {
+    I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
+    I422ToYUY2Row(src_y + src_stride_y, src_u, src_v,
+                  dst_yuy2 + dst_stride_yuy2, width);
+    src_y += src_stride_y * 2;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+    dst_yuy2 += dst_stride_yuy2 * 2;
+  }
+  if (height & 1) {
+    I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
+  }
+  return 0;
+}
+
+LIBYUV_API
+int I422ToUYVY(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_uyvy, int dst_stride_uyvy,
+               int width, int height) {
+  int y;
+  void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
+                        const uint8* src_v, uint8* dst_uyvy, int width) =
+      I422ToUYVYRow_C;
+  if (!src_y || !src_u || !src_v || !dst_uyvy ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
+    dst_stride_uyvy = -dst_stride_uyvy;
+  }
+  // Coalesce rows.
+  if (src_stride_y == width &&
+      src_stride_u * 2 == width &&
+      src_stride_v * 2 == width &&
+      dst_stride_uyvy == width * 2) {
+    width *= height;
+    height = 1;
+    src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0;
+  }
+#if defined(HAS_I422TOUYVYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToUYVYRow = I422ToUYVYRow_SSE2;
+    }
+  }
+#elif defined(HAS_I422TOUYVYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToUYVYRow = I422ToUYVYRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+    dst_uyvy += dst_stride_uyvy;
+  }
+  return 0;
+}
+
+LIBYUV_API
+int I420ToUYVY(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_uyvy, int dst_stride_uyvy,
+               int width, int height) {
+  int y;
+  void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
+                        const uint8* src_v, uint8* dst_uyvy, int width) =
+      I422ToUYVYRow_C;
+  if (!src_y || !src_u || !src_v || !dst_uyvy ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
+    dst_stride_uyvy = -dst_stride_uyvy;
+  }
+#if defined(HAS_I422TOUYVYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToUYVYRow = I422ToUYVYRow_SSE2;
+    }
+  }
+#elif defined(HAS_I422TOUYVYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToUYVYRow = I422ToUYVYRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height - 1; y += 2) {
+    I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
+    I422ToUYVYRow(src_y + src_stride_y, src_u, src_v,
+                  dst_uyvy + dst_stride_uyvy, width);
+    src_y += src_stride_y * 2;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+    dst_uyvy += dst_stride_uyvy * 2;
+  }
+  if (height & 1) {
+    I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
+  }
+  return 0;
+}
+
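+// Convert I420 to NV12: copy the Y plane and interleave U and V into one
+// UV plane.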
+LIBYUV_API
+int I420ToNV12(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height) {
+  int y;
+  void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+      int width) = MergeUVRow_C;
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if (!src_y || !src_u || !src_v || !dst_y || !dst_uv ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_uv = dst_uv + (halfheight - 1) * dst_stride_uv;
+    dst_stride_y = -dst_stride_y;
+    dst_stride_uv = -dst_stride_uv;
+  }
+  // Coalesce rows.
+  if (src_stride_y == width &&
+      dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_y = 0;
+  }
+  // Coalesce rows.
+  if (src_stride_u == halfwidth &&
+      src_stride_v == halfwidth &&
+      dst_stride_uv == halfwidth * 2) {
+    halfwidth *= halfheight;
+    halfheight = 1;
+    src_stride_u = src_stride_v = dst_stride_uv = 0;
+  }
+#if defined(HAS_MERGEUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
+    MergeUVRow_ = MergeUVRow_Any_SSE2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_Unaligned_SSE2;
+      if (IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
+          IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
+          IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
+        MergeUVRow_ = MergeUVRow_SSE2;
+      }
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
+    MergeUVRow_ = MergeUVRow_Any_AVX2;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      MergeUVRow_ = MergeUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
+    MergeUVRow_ = MergeUVRow_Any_NEON;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_NEON;
+    }
+  }
+#endif
+
+  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  for (y = 0; y < halfheight; ++y) {
+    // Merge a row of U and V into a row of UV.
+    MergeUVRow_(src_u, src_v, dst_uv, halfwidth);
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+    dst_uv += dst_stride_uv;
+  }
+  return 0;
+}
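+
+// Caller-side sketch (illustrative assumptions): for a 640x480 frame the U
+// and V strides are 320 and the interleaved UV stride is 640:
+//   I420ToNV12(y, 640, u, 320, v, 320, dst_y, 640, dst_uv, 640, 640, 480);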
+
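+// Convert I420 to NV21 by reusing I420ToNV12 with the U and V planes swapped.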
+LIBYUV_API
+int I420ToNV21(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_vu, int dst_stride_vu,
+               int width, int height) {
+  return I420ToNV12(src_y, src_stride_y,
+                    src_v, src_stride_v,
+                    src_u, src_stride_u,
+                    dst_y, dst_stride_y,
+                    dst_vu, dst_stride_vu,
+                    width, height);
+}
+
+// Convert I420 to ARGB.
+LIBYUV_API
+int I420ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  int y;
+  void (*I422ToARGBRow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = I422ToARGBRow_C;
+  if (!src_y || !src_u || !src_v || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+#if defined(HAS_I422TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+        I422ToARGBRow = I422ToARGBRow_SSSE3;
+      }
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
+    I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToARGBRow = I422ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    I422ToARGBRow = I422ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
+      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+    I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I420 to BGRA.
+LIBYUV_API
+int I420ToBGRA(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_bgra, int dst_stride_bgra,
+               int width, int height) {
+  int y;
+  void (*I422ToBGRARow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = I422ToBGRARow_C;
+  if (!src_y || !src_u || !src_v || !dst_bgra ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra;
+    dst_stride_bgra = -dst_stride_bgra;
+  }
+#if defined(HAS_I422TOBGRAROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+    I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3;
+      if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) {
+        I422ToBGRARow = I422ToBGRARow_SSSE3;
+      }
+    }
+  }
+#elif defined(HAS_I422TOBGRAROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    I422ToBGRARow = I422ToBGRARow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToBGRARow = I422ToBGRARow_NEON;
+    }
+  }
+#elif defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
+      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+      IS_ALIGNED(dst_bgra, 4) && IS_ALIGNED(dst_stride_bgra, 4)) {
+    I422ToBGRARow = I422ToBGRARow_MIPS_DSPR2;
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width);
+    dst_bgra += dst_stride_bgra;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I420 to ABGR.
+LIBYUV_API
+int I420ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_abgr, int dst_stride_abgr,
+               int width, int height) {
+  int y;
+  void (*I422ToABGRRow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = I422ToABGRRow_C;
+  if (!src_y || !src_u || !src_v || !dst_abgr ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr;
+    dst_stride_abgr = -dst_stride_abgr;
+  }
+#if defined(HAS_I422TOABGRROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+    I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) {
+        I422ToABGRRow = I422ToABGRRow_SSSE3;
+      }
+    }
+  }
+#elif defined(HAS_I422TOABGRROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    I422ToABGRRow = I422ToABGRRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToABGRRow = I422ToABGRRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width);
+    dst_abgr += dst_stride_abgr;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I420 to RGBA.
+LIBYUV_API
+int I420ToRGBA(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_rgba, int dst_stride_rgba,
+               int width, int height) {
+  int y;
+  void (*I422ToRGBARow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = I422ToRGBARow_C;
+  if (!src_y || !src_u || !src_v || !dst_rgba ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
+    dst_stride_rgba = -dst_stride_rgba;
+  }
+#if defined(HAS_I422TORGBAROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+    I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3;
+      if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) {
+        I422ToRGBARow = I422ToRGBARow_SSSE3;
+      }
+    }
+  }
+#elif defined(HAS_I422TORGBAROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    I422ToRGBARow = I422ToRGBARow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGBARow = I422ToRGBARow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToRGBARow(src_y, src_u, src_v, dst_rgba, width);
+    dst_rgba += dst_stride_rgba;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I420 to RGB24.
+LIBYUV_API
+int I420ToRGB24(const uint8* src_y, int src_stride_y,
+                const uint8* src_u, int src_stride_u,
+                const uint8* src_v, int src_stride_v,
+                uint8* dst_rgb24, int dst_stride_rgb24,
+                int width, int height) {
+  int y;
+  void (*I422ToRGB24Row)(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* rgb_buf,
+                         int width) = I422ToRGB24Row_C;
+  if (!src_y || !src_u || !src_v || !dst_rgb24 ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
+    dst_stride_rgb24 = -dst_stride_rgb24;
+  }
+#if defined(HAS_I422TORGB24ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+    I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB24Row = I422ToRGB24Row_SSSE3;
+    }
+  }
+#elif defined(HAS_I422TORGB24ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB24Row = I422ToRGB24Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, width);
+    dst_rgb24 += dst_stride_rgb24;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I420 to RAW.
+LIBYUV_API
+int I420ToRAW(const uint8* src_y, int src_stride_y,
+              const uint8* src_u, int src_stride_u,
+              const uint8* src_v, int src_stride_v,
+              uint8* dst_raw, int dst_stride_raw,
+              int width, int height) {
+  int y;
+  void (*I422ToRAWRow)(const uint8* y_buf,
+                       const uint8* u_buf,
+                       const uint8* v_buf,
+                       uint8* rgb_buf,
+                       int width) = I422ToRAWRow_C;
+  if (!src_y || !src_u || !src_v || !dst_raw ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_raw = dst_raw + (height - 1) * dst_stride_raw;
+    dst_stride_raw = -dst_stride_raw;
+  }
+#if defined(HAS_I422TORAWROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+    I422ToRAWRow = I422ToRAWRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRAWRow = I422ToRAWRow_SSSE3;
+    }
+  }
+#elif defined(HAS_I422TORAWROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    I422ToRAWRow = I422ToRAWRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRAWRow = I422ToRAWRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToRAWRow(src_y, src_u, src_v, dst_raw, width);
+    dst_raw += dst_stride_raw;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I420 to ARGB1555.
+LIBYUV_API
+int I420ToARGB1555(const uint8* src_y, int src_stride_y,
+                   const uint8* src_u, int src_stride_u,
+                   const uint8* src_v, int src_stride_v,
+                   uint8* dst_argb1555, int dst_stride_argb1555,
+                   int width, int height) {
+  int y;
+  void (*I422ToARGB1555Row)(const uint8* y_buf,
+                            const uint8* u_buf,
+                            const uint8* v_buf,
+                            uint8* rgb_buf,
+                            int width) = I422ToARGB1555Row_C;
+  if (!src_y || !src_u || !src_v || !dst_argb1555 ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555;
+    dst_stride_argb1555 = -dst_stride_argb1555;
+  }
+#if defined(HAS_I422TOARGB1555ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+    I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGB1555Row = I422ToARGB1555Row_SSSE3;
+    }
+  }
+#elif defined(HAS_I422TOARGB1555ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGB1555Row = I422ToARGB1555Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, width);
+    dst_argb1555 += dst_stride_argb1555;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I420 to ARGB4444.
+LIBYUV_API
+int I420ToARGB4444(const uint8* src_y, int src_stride_y,
+                   const uint8* src_u, int src_stride_u,
+                   const uint8* src_v, int src_stride_v,
+                   uint8* dst_argb4444, int dst_stride_argb4444,
+                   int width, int height) {
+  int y;
+  void (*I422ToARGB4444Row)(const uint8* y_buf,
+                            const uint8* u_buf,
+                            const uint8* v_buf,
+                            uint8* rgb_buf,
+                            int width) = I422ToARGB4444Row_C;
+  if (!src_y || !src_u || !src_v || !dst_argb4444 ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444;
+    dst_stride_argb4444 = -dst_stride_argb4444;
+  }
+#if defined(HAS_I422TOARGB4444ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+    I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGB4444Row = I422ToARGB4444Row_SSSE3;
+    }
+  }
+#elif defined(HAS_I422TOARGB4444ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGB4444Row = I422ToARGB4444Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, width);
+    dst_argb4444 += dst_stride_argb4444;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I420 to RGB565.
+LIBYUV_API
+int I420ToRGB565(const uint8* src_y, int src_stride_y,
+                 const uint8* src_u, int src_stride_u,
+                 const uint8* src_v, int src_stride_v,
+                 uint8* dst_rgb565, int dst_stride_rgb565,
+                 int width, int height) {
+  int y;
+  void (*I422ToRGB565Row)(const uint8* y_buf,
+                          const uint8* u_buf,
+                          const uint8* v_buf,
+                          uint8* rgb_buf,
+                          int width) = I422ToRGB565Row_C;
+  if (!src_y || !src_u || !src_v || !dst_rgb565 ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+    dst_stride_rgb565 = -dst_stride_rgb565;
+  }
+#if defined(HAS_I422TORGB565ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB565Row = I422ToRGB565Row_SSSE3;
+    }
+  }
+#elif defined(HAS_I422TORGB565ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB565Row = I422ToRGB565Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, width);
+    dst_rgb565 += dst_stride_rgb565;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I420 to specified format
+LIBYUV_API
+int ConvertFromI420(const uint8* y, int y_stride,
+                    const uint8* u, int u_stride,
+                    const uint8* v, int v_stride,
+                    uint8* dst_sample, int dst_sample_stride,
+                    int width, int height,
+                    uint32 fourcc) {
+  uint32 format = CanonicalFourCC(fourcc);
+  int r = 0;
+  if (!y || !u || !v || !dst_sample ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  switch (format) {
+    // Single plane formats
+    case FOURCC_YUY2:
+      r = I420ToYUY2(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 2,
+                     width, height);
+      break;
+    case FOURCC_UYVY:
+      r = I420ToUYVY(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 2,
+                     width, height);
+      break;
+    case FOURCC_RGBP:
+      r = I420ToRGB565(y, y_stride,
+                       u, u_stride,
+                       v, v_stride,
+                       dst_sample,
+                       dst_sample_stride ? dst_sample_stride : width * 2,
+                       width, height);
+      break;
+    case FOURCC_RGBO:
+      r = I420ToARGB1555(y, y_stride,
+                         u, u_stride,
+                         v, v_stride,
+                         dst_sample,
+                         dst_sample_stride ? dst_sample_stride : width * 2,
+                         width, height);
+      break;
+    case FOURCC_R444:
+      r = I420ToARGB4444(y, y_stride,
+                         u, u_stride,
+                         v, v_stride,
+                         dst_sample,
+                         dst_sample_stride ? dst_sample_stride : width * 2,
+                         width, height);
+      break;
+    case FOURCC_24BG:
+      r = I420ToRGB24(y, y_stride,
+                      u, u_stride,
+                      v, v_stride,
+                      dst_sample,
+                      dst_sample_stride ? dst_sample_stride : width * 3,
+                      width, height);
+      break;
+    case FOURCC_RAW:
+      r = I420ToRAW(y, y_stride,
+                    u, u_stride,
+                    v, v_stride,
+                    dst_sample,
+                    dst_sample_stride ? dst_sample_stride : width * 3,
+                    width, height);
+      break;
+    case FOURCC_ARGB:
+      r = I420ToARGB(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 4,
+                     width, height);
+      break;
+    case FOURCC_BGRA:
+      r = I420ToBGRA(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 4,
+                     width, height);
+      break;
+    case FOURCC_ABGR:
+      r = I420ToABGR(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 4,
+                     width, height);
+      break;
+    case FOURCC_RGBA:
+      r = I420ToRGBA(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 4,
+                     width, height);
+      break;
+    case FOURCC_BGGR:
+      r = I420ToBayerBGGR(y, y_stride,
+                          u, u_stride,
+                          v, v_stride,
+                          dst_sample,
+                          dst_sample_stride ? dst_sample_stride : width,
+                          width, height);
+      break;
+    case FOURCC_GBRG:
+      r = I420ToBayerGBRG(y, y_stride,
+                          u, u_stride,
+                          v, v_stride,
+                          dst_sample,
+                          dst_sample_stride ? dst_sample_stride : width,
+                          width, height);
+      break;
+    case FOURCC_GRBG:
+      r = I420ToBayerGRBG(y, y_stride,
+                          u, u_stride,
+                          v, v_stride,
+                          dst_sample,
+                          dst_sample_stride ? dst_sample_stride : width,
+                          width, height);
+      break;
+    case FOURCC_RGGB:
+      r = I420ToBayerRGGB(y, y_stride,
+                          u, u_stride,
+                          v, v_stride,
+                          dst_sample,
+                          dst_sample_stride ? dst_sample_stride : width,
+                          width, height);
+      break;
+    case FOURCC_I400:
+      r = I400Copy(y, y_stride,
+                   dst_sample,
+                   dst_sample_stride ? dst_sample_stride : width,
+                   width, height);
+      break;
+    case FOURCC_NV12: {
+      uint8* dst_uv = dst_sample + width * height;
+      r = I420ToNV12(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width,
+                     dst_uv,
+                     dst_sample_stride ? dst_sample_stride : width,
+                     width, height);
+      break;
+    }
+    case FOURCC_NV21: {
+      uint8* dst_vu = dst_sample + width * height;
+      r = I420ToNV21(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width,
+                     dst_vu,
+                     dst_sample_stride ? dst_sample_stride : width,
+                     width, height);
+      break;
+    }
+    // TODO(fbarchard): Add M420 and Q420.
+    // Triplanar formats
+    // TODO(fbarchard): halfstride instead of halfwidth
+    case FOURCC_I420:
+    case FOURCC_YU12:
+    case FOURCC_YV12: {
+      int halfwidth = (width + 1) / 2;
+      int halfheight = (height + 1) / 2;
+      uint8* dst_u;
+      uint8* dst_v;
+      if (format == FOURCC_YV12) {
+        dst_v = dst_sample + width * height;
+        dst_u = dst_v + halfwidth * halfheight;
+      } else {
+        dst_u = dst_sample + width * height;
+        dst_v = dst_u + halfwidth * halfheight;
+      }
+      r = I420Copy(y, y_stride,
+                   u, u_stride,
+                   v, v_stride,
+                   dst_sample, width,
+                   dst_u, halfwidth,
+                   dst_v, halfwidth,
+                   width, height);
+      break;
+    }
+    case FOURCC_I422:
+    case FOURCC_YV16: {
+      int halfwidth = (width + 1) / 2;
+      uint8* dst_u;
+      uint8* dst_v;
+      if (format == FOURCC_YV16) {
+        dst_v = dst_sample + width * height;
+        dst_u = dst_v + halfwidth * height;
+      } else {
+        dst_u = dst_sample + width * height;
+        dst_v = dst_u + halfwidth * height;
+      }
+      r = I420ToI422(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample, width,
+                     dst_u, halfwidth,
+                     dst_v, halfwidth,
+                     width, height);
+      break;
+    }
+    case FOURCC_I444:
+    case FOURCC_YV24: {
+      uint8* dst_u;
+      uint8* dst_v;
+      if (format == FOURCC_YV24) {
+        dst_v = dst_sample + width * height;
+        dst_u = dst_v + width * height;
+      } else {
+        dst_u = dst_sample + width * height;
+        dst_v = dst_u + width * height;
+      }
+      r = I420ToI444(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample, width,
+                     dst_u, width,
+                     dst_v, width,
+                     width, height);
+      break;
+    }
+    case FOURCC_I411: {
+      int quarterwidth = (width + 3) / 4;
+      uint8* dst_u = dst_sample + width * height;
+      uint8* dst_v = dst_u + quarterwidth * height;
+      r = I420ToI411(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample, width,
+                     dst_u, quarterwidth,
+                     dst_v, quarterwidth,
+                     width, height);
+      break;
+    }
+
+    // Formats not supported - MJPG, biplanar, some RGB formats.
+    default:
+      return -1;  // unknown fourcc - return failure code.
+  }
+  return r;
+}
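+
+// Caller-side sketch (illustrative): pack an I420 frame into a YUY2 buffer;
+// passing 0 for dst_sample_stride selects the default of width * 2 bytes:
+//   ConvertFromI420(y, w, u, (w + 1) / 2, v, (w + 1) / 2,
+//                   yuy2, 0, w, h, FOURCC_YUY2);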
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
--- /dev/null
+++ b/third_party/libyuv/source/convert_from_argb.cc
@@ -1,0 +1,1113 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_from_argb.h"
+
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/format_conversion.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// ARGB little endian (bgra in memory) to I444
+LIBYUV_API
+int ARGBToI444(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int y;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+  void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+      int pix) = ARGBToUV444Row_C;
+  if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_y == width &&
+      dst_stride_u == width &&
+      dst_stride_v == width) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+  }
+#if defined(HAS_ARGBTOUV444ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUV444Row = ARGBToUV444Row_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+        ARGBToUV444Row = ARGBToUV444Row_SSSE3;
+      }
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+          IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+        ARGBToYRow = ARGBToYRow_SSSE3;
+      }
+    }
+  }
+#elif defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    ARGBToUV444Row = ARGBToUV444Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+      ARGBToUV444Row = ARGBToUV444Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBToUV444Row(src_argb, dst_u, dst_v, width);
+    ARGBToYRow(src_argb, dst_y, width);
+    src_argb += src_stride_argb;
+    dst_y += dst_stride_y;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  return 0;
+}
+
+// ARGB little endian (bgra in memory) to I422
+LIBYUV_API
+int ARGBToI422(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int y;
+  void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+      int pix) = ARGBToUV422Row_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+  if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_y == width &&
+      dst_stride_u * 2 == width &&
+      dst_stride_v * 2 == width) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+  }
+#if defined(HAS_ARGBTOUV422ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+        ARGBToUV422Row = ARGBToUV422Row_SSSE3;
+      }
+    }
+  }
+#endif
+
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+          IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+        ARGBToYRow = ARGBToYRow_SSSE3;
+      }
+    }
+  }
+#elif defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+    if (width >= 16) {
+      ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        ARGBToUV422Row = ARGBToUV422Row_NEON;
+      }
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBToUV422Row(src_argb, dst_u, dst_v, width);
+    ARGBToYRow(src_argb, dst_y, width);
+    src_argb += src_stride_argb;
+    dst_y += dst_stride_y;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  return 0;
+}
+
+// ARGB little endian (bgra in memory) to I411
+LIBYUV_API
+int ARGBToI411(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int y;
+  void (*ARGBToUV411Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+      int pix) = ARGBToUV411Row_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+  if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_y == width &&
+      dst_stride_u * 4 == width &&
+      dst_stride_v * 4 == width) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+  }
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+          IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+        ARGBToYRow = ARGBToYRow_SSSE3;
+      }
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+    if (width >= 32) {
+      ARGBToUV411Row = ARGBToUV411Row_Any_NEON;
+      if (IS_ALIGNED(width, 32)) {
+        ARGBToUV411Row = ARGBToUV411Row_NEON;
+      }
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBToUV411Row(src_argb, dst_u, dst_v, width);
+    ARGBToYRow(src_argb, dst_y, width);
+    src_argb += src_stride_argb;
+    dst_y += dst_stride_y;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  return 0;
+}
+
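+// Convert ARGB to NV12.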
+LIBYUV_API
+int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height) {
+  int y;
+  int halfwidth = (width + 1) >> 1;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+  void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                      int width) = MergeUVRow_C;
+  if (!src_argb ||
+      !dst_y || !dst_uv ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Allocate one row of U and one of V, each padded to a multiple of 16.
+  align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
+  uint8* row_v = row_u + ((halfwidth + 15) & ~15);
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
+      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+        ARGBToUVRow = ARGBToUVRow_SSSE3;
+        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+          ARGBToYRow = ARGBToYRow_SSSE3;
+        }
+      }
+    }
+  }
+#elif defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+    if (width >= 16) {
+      ARGBToUVRow = ARGBToUVRow_Any_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        ARGBToUVRow = ARGBToUVRow_NEON;
+      }
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
+    MergeUVRow_ = MergeUVRow_Any_SSE2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_Unaligned_SSE2;
+      if (IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
+        MergeUVRow_ = MergeUVRow_SSE2;
+      }
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
+    MergeUVRow_ = MergeUVRow_Any_AVX2;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      MergeUVRow_ = MergeUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
+    MergeUVRow_ = MergeUVRow_Any_NEON;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_NEON;
+    }
+  }
+#endif
+
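+  // Each pass converts two ARGB rows: U and V are sampled once across the
+  // row pair into row_u and row_v, then merged into the interleaved UV plane.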
+  for (y = 0; y < height - 1; y += 2) {
+    ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
+    MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+    ARGBToYRow(src_argb, dst_y, width);
+    ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+    src_argb += src_stride_argb * 2;
+    dst_y += dst_stride_y * 2;
+    dst_uv += dst_stride_uv;
+  }
+  if (height & 1) {
+    ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+    MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+    ARGBToYRow(src_argb, dst_y, width);
+  }
+  free_aligned_buffer_64(row_u);
+  return 0;
+}
+
+// Same as NV12 but U and V swapped.
+LIBYUV_API
+int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height) {
+  int y;
+  int halfwidth = (width + 1) >> 1;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+  void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                      int width) = MergeUVRow_C;
+  // Allocate 2 rows of uv.
+  align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
+  uint8* row_v = row_u + ((halfwidth + 15) & ~15);
+  if (!src_argb ||
+      !dst_y || !dst_uv ||
+      width <= 0 || height == 0) {
+    free_aligned_buffer_64(row_u);
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
+      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+        ARGBToUVRow = ARGBToUVRow_SSSE3;
+        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+          ARGBToYRow = ARGBToYRow_SSSE3;
+        }
+      }
+    }
+  }
+#elif defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+    if (width >= 16) {
+      ARGBToUVRow = ARGBToUVRow_Any_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        ARGBToUVRow = ARGBToUVRow_NEON;
+      }
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
+    MergeUVRow_ = MergeUVRow_Any_SSE2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_Unaligned_SSE2;
+      if (IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
+        MergeUVRow_ = MergeUVRow_SSE2;
+      }
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
+    MergeUVRow_ = MergeUVRow_Any_AVX2;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      MergeUVRow_ = MergeUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
+    MergeUVRow_ = MergeUVRow_Any_NEON;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height - 1; y += 2) {
+    ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
+    MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
+    ARGBToYRow(src_argb, dst_y, width);
+    ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+    src_argb += src_stride_argb * 2;
+    dst_y += dst_stride_y * 2;
+    dst_uv += dst_stride_uv;
+  }
+  if (height & 1) {
+    ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+    MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
+    ARGBToYRow(src_argb, dst_y, width);
+  }
+  free_aligned_buffer_64(row_u);
+  return 0;
+}
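+
+// Example (hypothetical caller, not part of this file): convert a packed
+// 640x480 ARGB frame to NV21, assuming no row padding:
+//   uint8 argb[640 * 480 * 4];
+//   uint8 y[640 * 480];
+//   uint8 vu[320 * 240 * 2];  // Interleaved V/U plane at half resolution.
+//   ARGBToNV21(argb, 640 * 4, y, 640, vu, 640, 640, 480);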
+
+// Convert ARGB to YUY2.
+LIBYUV_API
+int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_yuy2, int dst_stride_yuy2,
+               int width, int height) {
+  int y;
+  void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+      int pix) = ARGBToUV422Row_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+  void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
+      const uint8* src_v, uint8* dst_yuy2, int width) = I422ToYUY2Row_C;
+
+  if (!src_argb || !dst_yuy2 ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
+    dst_stride_yuy2 = -dst_stride_yuy2;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_yuy2 == width * 2) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_yuy2 = 0;
+  }
+#if defined(HAS_ARGBTOUV422ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+        ARGBToUV422Row = ARGBToUV422Row_SSSE3;
+      }
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+        ARGBToYRow = ARGBToYRow_SSSE3;
+      }
+    }
+  }
+#elif defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+    if (width >= 16) {
+      ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        ARGBToUV422Row = ARGBToUV422Row_NEON;
+      }
+    }
+  }
+#endif
+
+#if defined(HAS_I422TOYUY2ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToYUY2Row = I422ToYUY2Row_SSE2;
+    }
+  }
+#elif defined(HAS_I422TOYUY2ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToYUY2Row = I422ToYUY2Row_NEON;
+    }
+  }
+#endif
+
+  {
+    // Allocate a row of y, plus half-width rows of u and v.
+    align_buffer_64(row_y, ((width + 63) & ~63) * 2);
+    uint8* row_u = row_y + ((width + 63) & ~63);
+    uint8* row_v = row_u + ((width + 63) & ~63) / 2;
+
+    for (y = 0; y < height; ++y) {
+      ARGBToUV422Row(src_argb, row_u, row_v, width);
+      ARGBToYRow(src_argb, row_y, width);
+      I422ToYUY2Row(row_y, row_u, row_v, dst_yuy2, width);
+      src_argb += src_stride_argb;
+      dst_yuy2 += dst_stride_yuy2;
+    }
+
+    free_aligned_buffer_64(row_y);
+  }
+  return 0;
+}
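+
+// Example (hypothetical caller): YUY2 packs 2 pixels into 4 bytes, so a
+// packed destination stride is width * 2:
+//   uint8 yuy2[640 * 480 * 2];
+//   ARGBToYUY2(argb, 640 * 4, yuy2, 640 * 2, 640, 480);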
+
+// Convert ARGB to UYVY.
+LIBYUV_API
+int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_uyvy, int dst_stride_uyvy,
+               int width, int height) {
+  int y;
+  void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+      int pix) = ARGBToUV422Row_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+  void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
+      const uint8* src_v, uint8* dst_uyvy, int width) = I422ToUYVYRow_C;
+
+  if (!src_argb || !dst_uyvy ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
+    dst_stride_uyvy = -dst_stride_uyvy;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_uyvy == width * 2) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_uyvy = 0;
+  }
+#if defined(HAS_ARGBTOUV422ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+        ARGBToUV422Row = ARGBToUV422Row_SSSE3;
+      }
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+        ARGBToYRow = ARGBToYRow_SSSE3;
+      }
+    }
+  }
+#elif defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+    if (width >= 16) {
+      ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        ARGBToUV422Row = ARGBToUV422Row_NEON;
+      }
+    }
+  }
+#endif
+
+#if defined(HAS_I422TOUYVYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToUYVYRow = I422ToUYVYRow_SSE2;
+    }
+  }
+#elif defined(HAS_I422TOUYVYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToUYVYRow = I422ToUYVYRow_NEON;
+    }
+  }
+#endif
+
+  {
+    // Allocate a row of y, plus half-width rows of u and v.
+    align_buffer_64(row_y, ((width + 63) & ~63) * 2);
+    uint8* row_u = row_y + ((width + 63) & ~63);
+    uint8* row_v = row_u + ((width + 63) & ~63) / 2;
+
+    for (y = 0; y < height; ++y) {
+      ARGBToUV422Row(src_argb, row_u, row_v, width);
+      ARGBToYRow(src_argb, row_y, width);
+      I422ToUYVYRow(row_y, row_u, row_v, dst_uyvy, width);
+      src_argb += src_stride_argb;
+      dst_uyvy += dst_stride_uyvy;
+    }
+
+    free_aligned_buffer_64(row_y);
+  }
+  return 0;
+}
+
+// Convert ARGB to I400.
+LIBYUV_API
+int ARGBToI400(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height) {
+  int y;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+  if (!src_argb || !dst_y || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_y = 0;
+  }
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+          IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+        ARGBToYRow = ARGBToYRow_SSSE3;
+      }
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBToYRow(src_argb, dst_y, width);
+    src_argb += src_stride_argb;
+    dst_y += dst_stride_y;
+  }
+  return 0;
+}
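+
+// Example (hypothetical caller extracting just the luma plane):
+//   uint8 gray[640 * 480];
+//   ARGBToI400(argb, 640 * 4, gray, 640, 640, 480);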
+
+// Shuffle table for converting ARGB to RGBA.
+static uvec8 kShuffleMaskARGBToRGBA = {
+  3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
+};
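+
+// ARGBShuffle emits output byte i of each group from input byte
+// kShuffleMaskARGBToRGBA[i], so a pixel's memory bytes {B, G, R, A}
+// (little-endian ARGB) come out as {A, B, G, R} (little-endian RGBA).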
+
+// Convert ARGB to RGBA.
+LIBYUV_API
+int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_rgba, int dst_stride_rgba,
+               int width, int height) {
+  return ARGBShuffle(src_argb, src_stride_argb,
+                     dst_rgba, dst_stride_rgba,
+                     (const uint8*)(&kShuffleMaskARGBToRGBA),
+                     width, height);
+}
+
+// Convert ARGB to RGB24.
+LIBYUV_API
+int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
+                uint8* dst_rgb24, int dst_stride_rgb24,
+                int width, int height) {
+  int y;
+  void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+      ARGBToRGB24Row_C;
+  if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_rgb24 == width * 3) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_rgb24 = 0;
+  }
+#if defined(HAS_ARGBTORGB24ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToRGB24Row = ARGBToRGB24Row_SSSE3;
+    }
+  }
+#elif defined(HAS_ARGBTORGB24ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToRGB24Row = ARGBToRGB24Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBToRGB24Row(src_argb, dst_rgb24, width);
+    src_argb += src_stride_argb;
+    dst_rgb24 += dst_stride_rgb24;
+  }
+  return 0;
+}
+
+// Convert ARGB to RAW.
+LIBYUV_API
+int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
+              uint8* dst_raw, int dst_stride_raw,
+              int width, int height) {
+  int y;
+  void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+      ARGBToRAWRow_C;
+  if (!src_argb || !dst_raw || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_raw == width * 3) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_raw = 0;
+  }
+#if defined(HAS_ARGBTORAWROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToRAWRow = ARGBToRAWRow_SSSE3;
+    }
+  }
+#elif defined(HAS_ARGBTORAWROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBToRAWRow = ARGBToRAWRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToRAWRow = ARGBToRAWRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBToRAWRow(src_argb, dst_raw, width);
+    src_argb += src_stride_argb;
+    dst_raw += dst_stride_raw;
+  }
+  return 0;
+}
+
+// Convert ARGB to RGB565.
+LIBYUV_API
+int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
+                 uint8* dst_rgb565, int dst_stride_rgb565,
+                 int width, int height) {
+  int y;
+  void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+      ARGBToRGB565Row_C;
+  if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_rgb565 == width * 2) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_rgb565 = 0;
+  }
+#if defined(HAS_ARGBTORGB565ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
+      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+    ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
+    }
+  }
+#elif defined(HAS_ARGBTORGB565ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBToRGB565Row = ARGBToRGB565Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToRGB565Row = ARGBToRGB565Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBToRGB565Row(src_argb, dst_rgb565, width);
+    src_argb += src_stride_argb;
+    dst_rgb565 += dst_stride_rgb565;
+  }
+  return 0;
+}
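+
+// RGB565 packs a pixel into 16 bits as R5 G6 B5, truncating the low bits,
+// e.g. (hypothetical helper, equivalent to one pixel of the C row function):
+//   uint16 PackRGB565(uint8 r, uint8 g, uint8 b) {
+//     return (uint16)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
+//   }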
+
+// Convert ARGB to ARGB1555.
+LIBYUV_API
+int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
+                   uint8* dst_argb1555, int dst_stride_argb1555,
+                   int width, int height) {
+  int y;
+  void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+      ARGBToARGB1555Row_C;
+  if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_argb1555 == width * 2) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb1555 = 0;
+  }
+#if defined(HAS_ARGBTOARGB1555ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
+      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+    ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2;
+    }
+  }
+#elif defined(HAS_ARGBTOARGB1555ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBToARGB1555Row = ARGBToARGB1555Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToARGB1555Row = ARGBToARGB1555Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBToARGB1555Row(src_argb, dst_argb1555, width);
+    src_argb += src_stride_argb;
+    dst_argb1555 += dst_stride_argb1555;
+  }
+  return 0;
+}
+
+// Convert ARGB to ARGB4444.
+LIBYUV_API
+int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
+                   uint8* dst_argb4444, int dst_stride_argb4444,
+                   int width, int height) {
+  int y;
+  void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+      ARGBToARGB4444Row_C;
+  if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_argb4444 == width * 2) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb4444 = 0;
+  }
+#if defined(HAS_ARGBTOARGB4444ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
+      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+    ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2;
+    }
+  }
+#elif defined(HAS_ARGBTOARGB4444ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBToARGB4444Row = ARGBToARGB4444Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToARGB4444Row = ARGBToARGB4444Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBToARGB4444Row(src_argb, dst_argb4444, width);
+    src_argb += src_stride_argb;
+    dst_argb4444 += dst_stride_argb4444;
+  }
+  return 0;
+}
+
+// Convert ARGB to J420 (JPEG full-range I420).
+LIBYUV_API
+int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_yj, int dst_stride_yj,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int y;
+  void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
+  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) =
+      ARGBToYJRow_C;
+  if (!src_argb ||
+      !dst_yj || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
+    ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVJRow = ARGBToUVJRow_Unaligned_SSSE3;
+      ARGBToYJRow = ARGBToYJRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+        ARGBToUVJRow = ARGBToUVJRow_SSSE3;
+        if (IS_ALIGNED(dst_yj, 16) && IS_ALIGNED(dst_stride_yj, 16)) {
+          ARGBToYJRow = ARGBToYJRow_SSSE3;
+        }
+      }
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2) && defined(HAS_ARGBTOUVJROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+    ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYJRow = ARGBToYJRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYJROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBToYJRow = ARGBToYJRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYJRow = ARGBToYJRow_NEON;
+    }
+    if (width >= 16) {
+      ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        ARGBToUVJRow = ARGBToUVJRow_NEON;
+      }
+    }
+  }
+#endif
+
+  for (y = 0; y < height - 1; y += 2) {
+    ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width);
+    ARGBToYJRow(src_argb, dst_yj, width);
+    ARGBToYJRow(src_argb + src_stride_argb, dst_yj + dst_stride_yj, width);
+    src_argb += src_stride_argb * 2;
+    dst_yj += dst_stride_yj * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  if (height & 1) {
+    ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width);
+    ARGBToYJRow(src_argb, dst_yj, width);
+  }
+  return 0;
+}
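+
+// Example (hypothetical caller): J420 output uses full-range 0..255 luma and
+// chroma rather than the 16..235 / 16..240 ranges of I420:
+//   uint8 yj[640 * 480], uj[320 * 240], vj[320 * 240];
+//   ARGBToJ420(argb, 640 * 4, yj, 640, uj, 320, vj, 320, 640, 480);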
+
+// Convert ARGB to J400.
+LIBYUV_API
+int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_yj, int dst_stride_yj,
+               int width, int height) {
+  int y;
+  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) =
+      ARGBToYJRow_C;
+  if (!src_argb || !dst_yj || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_yj == width) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_yj = 0;
+  }
+#if defined(HAS_ARGBTOYJROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYJRow = ARGBToYJRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+          IS_ALIGNED(dst_yj, 16) && IS_ALIGNED(dst_stride_yj, 16)) {
+        ARGBToYJRow = ARGBToYJRow_SSSE3;
+      }
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+    ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYJRow = ARGBToYJRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYJROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBToYJRow = ARGBToYJRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYJRow = ARGBToYJRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBToYJRow(src_argb, dst_yj, width);
+    src_argb += src_stride_argb;
+    dst_yj += dst_stride_yj;
+  }
+  return 0;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
--- /dev/null
+++ b/third_party/libyuv/source/convert_jpeg.cc
@@ -1,0 +1,392 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert.h"
+
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#ifdef HAVE_JPEG
+struct I420Buffers {
+  uint8* y;
+  int y_stride;
+  uint8* u;
+  int u_stride;
+  uint8* v;
+  int v_stride;
+  int w;
+  int h;
+};
+
+static void JpegCopyI420(void* opaque,
+                         const uint8* const* data,
+                         const int* strides,
+                         int rows) {
+  I420Buffers* dest = (I420Buffers*)(opaque);
+  I420Copy(data[0], strides[0],
+           data[1], strides[1],
+           data[2], strides[2],
+           dest->y, dest->y_stride,
+           dest->u, dest->u_stride,
+           dest->v, dest->v_stride,
+           dest->w, rows);
+  dest->y += rows * dest->y_stride;
+  dest->u += ((rows + 1) >> 1) * dest->u_stride;
+  dest->v += ((rows + 1) >> 1) * dest->v_stride;
+  dest->h -= rows;
+}
+
+static void JpegI422ToI420(void* opaque,
+                           const uint8* const* data,
+                           const int* strides,
+                           int rows) {
+  I420Buffers* dest = (I420Buffers*)(opaque);
+  I422ToI420(data[0], strides[0],
+             data[1], strides[1],
+             data[2], strides[2],
+             dest->y, dest->y_stride,
+             dest->u, dest->u_stride,
+             dest->v, dest->v_stride,
+             dest->w, rows);
+  dest->y += rows * dest->y_stride;
+  dest->u += ((rows + 1) >> 1) * dest->u_stride;
+  dest->v += ((rows + 1) >> 1) * dest->v_stride;
+  dest->h -= rows;
+}
+
+static void JpegI444ToI420(void* opaque,
+                           const uint8* const* data,
+                           const int* strides,
+                           int rows) {
+  I420Buffers* dest = (I420Buffers*)(opaque);
+  I444ToI420(data[0], strides[0],
+             data[1], strides[1],
+             data[2], strides[2],
+             dest->y, dest->y_stride,
+             dest->u, dest->u_stride,
+             dest->v, dest->v_stride,
+             dest->w, rows);
+  dest->y += rows * dest->y_stride;
+  dest->u += ((rows + 1) >> 1) * dest->u_stride;
+  dest->v += ((rows + 1) >> 1) * dest->v_stride;
+  dest->h -= rows;
+}
+
+static void JpegI411ToI420(void* opaque,
+                           const uint8* const* data,
+                           const int* strides,
+                           int rows) {
+  I420Buffers* dest = (I420Buffers*)(opaque);
+  I411ToI420(data[0], strides[0],
+             data[1], strides[1],
+             data[2], strides[2],
+             dest->y, dest->y_stride,
+             dest->u, dest->u_stride,
+             dest->v, dest->v_stride,
+             dest->w, rows);
+  dest->y += rows * dest->y_stride;
+  dest->u += ((rows + 1) >> 1) * dest->u_stride;
+  dest->v += ((rows + 1) >> 1) * dest->v_stride;
+  dest->h -= rows;
+}
+
+static void JpegI400ToI420(void* opaque,
+                           const uint8* const* data,
+                           const int* strides,
+                           int rows) {
+  I420Buffers* dest = (I420Buffers*)(opaque);
+  I400ToI420(data[0], strides[0],
+             dest->y, dest->y_stride,
+             dest->u, dest->u_stride,
+             dest->v, dest->v_stride,
+             dest->w, rows);
+  dest->y += rows * dest->y_stride;
+  dest->u += ((rows + 1) >> 1) * dest->u_stride;
+  dest->v += ((rows + 1) >> 1) * dest->v_stride;
+  dest->h -= rows;
+}
+
+// Query size of MJPG in pixels.
+LIBYUV_API
+int MJPGSize(const uint8* sample, size_t sample_size,
+             int* width, int* height) {
+  MJpegDecoder mjpeg_decoder;
+  LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+  if (ret) {
+    *width = mjpeg_decoder.GetWidth();
+    *height = mjpeg_decoder.GetHeight();
+  }
+  mjpeg_decoder.UnloadFrame();
+  return ret ? 0 : -1;  // -1 for runtime failure.
+}
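+
+// Example (hypothetical caller pairing MJPGSize with MJPGToI420 below,
+// assuming packed, unpadded I420 planes):
+//   int w = 0;
+//   int h = 0;
+//   if (MJPGSize(sample, sample_size, &w, &h) == 0) {
+//     // Allocate y as w * h and u, v as ((w + 1) / 2) * ((h + 1) / 2) ...
+//     MJPGToI420(sample, sample_size,
+//                y, w,
+//                u, (w + 1) / 2,
+//                v, (w + 1) / 2,
+//                w, h, w, h);
+//   }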
+
+// MJPG (Motion JPEG) to I420.
+// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
+LIBYUV_API
+int MJPGToI420(const uint8* sample,
+               size_t sample_size,
+               uint8* y, int y_stride,
+               uint8* u, int u_stride,
+               uint8* v, int v_stride,
+               int w, int h,
+               int dw, int dh) {
+  if (sample_size == kUnknownDataSize) {
+    // ERROR: MJPEG frame size unknown
+    return -1;
+  }
+
+  // TODO(fbarchard): Port MJpeg to C.
+  MJpegDecoder mjpeg_decoder;
+  LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+  if (ret && (mjpeg_decoder.GetWidth() != w ||
+              mjpeg_decoder.GetHeight() != h)) {
+    // ERROR: MJPEG frame has unexpected dimensions
+    mjpeg_decoder.UnloadFrame();
+    return 1;  // runtime failure
+  }
+  if (ret) {
+    I420Buffers bufs = { y, y_stride, u, u_stride, v, v_stride, dw, dh };
+    // YUV420
+    if (mjpeg_decoder.GetColorSpace() ==
+            MJpegDecoder::kColorSpaceYCbCr &&
+        mjpeg_decoder.GetNumComponents() == 3 &&
+        mjpeg_decoder.GetVertSampFactor(0) == 2 &&
+        mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+        mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+        mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+        mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+        mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dw, dh);
+    // YUV422
+    } else if (mjpeg_decoder.GetColorSpace() ==
+                   MJpegDecoder::kColorSpaceYCbCr &&
+               mjpeg_decoder.GetNumComponents() == 3 &&
+               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dw, dh);
+    // YUV444
+    } else if (mjpeg_decoder.GetColorSpace() ==
+                   MJpegDecoder::kColorSpaceYCbCr &&
+               mjpeg_decoder.GetNumComponents() == 3 &&
+               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
+               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dw, dh);
+    // YUV411
+    } else if (mjpeg_decoder.GetColorSpace() ==
+                   MJpegDecoder::kColorSpaceYCbCr &&
+               mjpeg_decoder.GetNumComponents() == 3 &&
+               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
+               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToI420, &bufs, dw, dh);
+    // YUV400
+    } else if (mjpeg_decoder.GetColorSpace() ==
+                   MJpegDecoder::kColorSpaceGrayscale &&
+               mjpeg_decoder.GetNumComponents() == 1 &&
+               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(0) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dw, dh);
+    } else {
+      // TODO(fbarchard): Implement conversion for any other colorspace/sample
+      // factors that occur in practice. 411 is supported by libjpeg.
+      // ERROR: Unable to convert MJPEG frame because format is not supported
+      mjpeg_decoder.UnloadFrame();
+      return 1;
+    }
+  }
+  return ret ? 0 : 1;
+}
+
+#ifdef HAVE_JPEG
+struct ARGBBuffers {
+  uint8* argb;
+  int argb_stride;
+  int w;
+  int h;
+};
+
+static void JpegI420ToARGB(void* opaque,
+                           const uint8* const* data,
+                           const int* strides,
+                           int rows) {
+  ARGBBuffers* dest = (ARGBBuffers*)(opaque);
+  I420ToARGB(data[0], strides[0],
+             data[1], strides[1],
+             data[2], strides[2],
+             dest->argb, dest->argb_stride,
+             dest->w, rows);
+  dest->argb += rows * dest->argb_stride;
+  dest->h -= rows;
+}
+
+static void JpegI422ToARGB(void* opaque,
+                           const uint8* const* data,
+                           const int* strides,
+                           int rows) {
+  ARGBBuffers* dest = (ARGBBuffers*)(opaque);
+  I422ToARGB(data[0], strides[0],
+             data[1], strides[1],
+             data[2], strides[2],
+             dest->argb, dest->argb_stride,
+             dest->w, rows);
+  dest->argb += rows * dest->argb_stride;
+  dest->h -= rows;
+}
+
+static void JpegI444ToARGB(void* opaque,
+                           const uint8* const* data,
+                           const int* strides,
+                           int rows) {
+  ARGBBuffers* dest = (ARGBBuffers*)(opaque);
+  I444ToARGB(data[0], strides[0],
+             data[1], strides[1],
+             data[2], strides[2],
+             dest->argb, dest->argb_stride,
+             dest->w, rows);
+  dest->argb += rows * dest->argb_stride;
+  dest->h -= rows;
+}
+
+static void JpegI411ToARGB(void* opaque,
+                           const uint8* const* data,
+                           const int* strides,
+                           int rows) {
+  ARGBBuffers* dest = (ARGBBuffers*)(opaque);
+  I411ToARGB(data[0], strides[0],
+             data[1], strides[1],
+             data[2], strides[2],
+             dest->argb, dest->argb_stride,
+             dest->w, rows);
+  dest->argb += rows * dest->argb_stride;
+  dest->h -= rows;
+}
+
+static void JpegI400ToARGB(void* opaque,
+                           const uint8* const* data,
+                           const int* strides,
+                           int rows) {
+  ARGBBuffers* dest = (ARGBBuffers*)(opaque);
+  I400ToARGB(data[0], strides[0],
+             dest->argb, dest->argb_stride,
+             dest->w, rows);
+  dest->argb += rows * dest->argb_stride;
+  dest->h -= rows;
+}
+
+// MJPG (Motion JPEG) to ARGB.
+// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
+LIBYUV_API
+int MJPGToARGB(const uint8* sample,
+               size_t sample_size,
+               uint8* argb, int argb_stride,
+               int w, int h,
+               int dw, int dh) {
+  if (sample_size == kUnknownDataSize) {
+    // ERROR: MJPEG frame size unknown
+    return -1;
+  }
+
+  // TODO(fbarchard): Port MJpeg to C.
+  MJpegDecoder mjpeg_decoder;
+  LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+  if (ret && (mjpeg_decoder.GetWidth() != w ||
+              mjpeg_decoder.GetHeight() != h)) {
+    // ERROR: MJPEG frame has unexpected dimensions
+    mjpeg_decoder.UnloadFrame();
+    return 1;  // runtime failure
+  }
+  if (ret) {
+    ARGBBuffers bufs = { argb, argb_stride, dw, dh };
+    // YUV420
+    if (mjpeg_decoder.GetColorSpace() ==
+            MJpegDecoder::kColorSpaceYCbCr &&
+        mjpeg_decoder.GetNumComponents() == 3 &&
+        mjpeg_decoder.GetVertSampFactor(0) == 2 &&
+        mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+        mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+        mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+        mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+        mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dw, dh);
+    // YUV422
+    } else if (mjpeg_decoder.GetColorSpace() ==
+                   MJpegDecoder::kColorSpaceYCbCr &&
+               mjpeg_decoder.GetNumComponents() == 3 &&
+               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dw, dh);
+    // YUV444
+    } else if (mjpeg_decoder.GetColorSpace() ==
+                   MJpegDecoder::kColorSpaceYCbCr &&
+               mjpeg_decoder.GetNumComponents() == 3 &&
+               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
+               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dw, dh);
+    // YUV411
+    } else if (mjpeg_decoder.GetColorSpace() ==
+                   MJpegDecoder::kColorSpaceYCbCr &&
+               mjpeg_decoder.GetNumComponents() == 3 &&
+               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
+               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToARGB, &bufs, dw, dh);
+    // YUV400
+    } else if (mjpeg_decoder.GetColorSpace() ==
+                   MJpegDecoder::kColorSpaceGrayscale &&
+               mjpeg_decoder.GetNumComponents() == 1 &&
+               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(0) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dw, dh);
+    } else {
+      // TODO(fbarchard): Implement conversion for any other colorspace/sample
+      // factors that occur in practice. 411 is supported by libjpeg.
+      // ERROR: Unable to convert MJPEG frame because format is not supported
+      mjpeg_decoder.UnloadFrame();
+      return 1;
+    }
+  }
+  return ret ? 0 : 1;
+}
+#endif
+
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
--- /dev/null
+++ b/third_party/libyuv/source/convert_to_argb.cc
@@ -1,0 +1,327 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_argb.h"
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/format_conversion.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "libyuv/rotate_argb.h"
+#include "libyuv/row.h"
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Convert camera sample to ARGB with cropping, rotation and vertical flip.
+// src_width is used for source stride computation
+// src_height is used to compute location of planes, and indicate inversion
+// sample_size is measured in bytes and is the size of the frame.
+//   With MJPEG it is the compressed size of the frame.
+LIBYUV_API
+int ConvertToARGB(const uint8* sample, size_t sample_size,
+                  uint8* crop_argb, int argb_stride,
+                  int crop_x, int crop_y,
+                  int src_width, int src_height,
+                  int crop_width, int crop_height,
+                  enum RotationMode rotation,
+                  uint32 fourcc) {
+  uint32 format = CanonicalFourCC(fourcc);
+  int aligned_src_width = (src_width + 1) & ~1;
+  const uint8* src;
+  const uint8* src_uv;
+  int abs_src_height = (src_height < 0) ? -src_height : src_height;
+  int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+  int r = 0;
+
+  // One-pass rotation is available for some formats. For the rest, convert
+  // to ARGB (with optional vertical flipping) into a temporary ARGB buffer,
+  // and then rotate the ARGB to the final destination buffer.
+  // For in-place conversion, if the destination crop_argb is the same as the
+  // source sample, also enable the temporary buffer.
+  LIBYUV_BOOL need_buf = (rotation && format != FOURCC_ARGB) ||
+      crop_argb == sample;
+  uint8* tmp_argb = crop_argb;
+  int tmp_argb_stride = argb_stride;
+  uint8* rotate_buffer = NULL;
+  int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+
+  if (crop_argb == NULL || sample == NULL ||
+      src_width <= 0 || crop_width <= 0 ||
+      src_height == 0 || crop_height == 0) {
+    return -1;
+  }
+  if (src_height < 0) {
+    inv_crop_height = -inv_crop_height;
+  }
+
+  if (need_buf) {
+    int argb_size = crop_width * abs_crop_height * 4;
+    rotate_buffer = (uint8*)malloc(argb_size);
+    if (!rotate_buffer) {
+      return 1;  // Out of memory runtime error.
+    }
+    crop_argb = rotate_buffer;
+    argb_stride = crop_width;
+  }
+
+  switch (format) {
+    // Single plane formats
+    case FOURCC_YUY2:
+      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+      r = YUY2ToARGB(src, aligned_src_width * 2,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_UYVY:
+      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+      r = UYVYToARGB(src, aligned_src_width * 2,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_24BG:
+      src = sample + (src_width * crop_y + crop_x) * 3;
+      r = RGB24ToARGB(src, src_width * 3,
+                      crop_argb, argb_stride,
+                      crop_width, inv_crop_height);
+      break;
+    case FOURCC_RAW:
+      src = sample + (src_width * crop_y + crop_x) * 3;
+      r = RAWToARGB(src, src_width * 3,
+                    crop_argb, argb_stride,
+                    crop_width, inv_crop_height);
+      break;
+    case FOURCC_ARGB:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = ARGBToARGB(src, src_width * 4,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_BGRA:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = BGRAToARGB(src, src_width * 4,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_ABGR:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = ABGRToARGB(src, src_width * 4,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_RGBA:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = RGBAToARGB(src, src_width * 4,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_RGBP:
+      src = sample + (src_width * crop_y + crop_x) * 2;
+      r = RGB565ToARGB(src, src_width * 2,
+                       crop_argb, argb_stride,
+                       crop_width, inv_crop_height);
+      break;
+    case FOURCC_RGBO:
+      src = sample + (src_width * crop_y + crop_x) * 2;
+      r = ARGB1555ToARGB(src, src_width * 2,
+                         crop_argb, argb_stride,
+                         crop_width, inv_crop_height);
+      break;
+    case FOURCC_R444:
+      src = sample + (src_width * crop_y + crop_x) * 2;
+      r = ARGB4444ToARGB(src, src_width * 2,
+                         crop_argb, argb_stride,
+                         crop_width, inv_crop_height);
+      break;
+    // TODO(fbarchard): Support cropping Bayer by odd numbers
+    // by adjusting fourcc.
+    case FOURCC_BGGR:
+      src = sample + (src_width * crop_y + crop_x);
+      r = BayerBGGRToARGB(src, src_width,
+                          crop_argb, argb_stride,
+                          crop_width, inv_crop_height);
+      break;
+
+    case FOURCC_GBRG:
+      src = sample + (src_width * crop_y + crop_x);
+      r = BayerGBRGToARGB(src, src_width,
+                          crop_argb, argb_stride,
+                          crop_width, inv_crop_height);
+      break;
+
+    case FOURCC_GRBG:
+      src = sample + (src_width * crop_y + crop_x);
+      r = BayerGRBGToARGB(src, src_width,
+                          crop_argb, argb_stride,
+                          crop_width, inv_crop_height);
+      break;
+
+    case FOURCC_RGGB:
+      src = sample + (src_width * crop_y + crop_x);
+      r = BayerRGGBToARGB(src, src_width,
+                          crop_argb, argb_stride,
+                          crop_width, inv_crop_height);
+      break;
+
+    case FOURCC_I400:
+      src = sample + src_width * crop_y + crop_x;
+      r = I400ToARGB(src, src_width,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+
+    // Biplanar formats
+    case FOURCC_NV12:
+      src = sample + (src_width * crop_y + crop_x);
+      src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+      r = NV12ToARGB(src, src_width,
+                     src_uv, aligned_src_width,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_NV21:
+      src = sample + (src_width * crop_y + crop_x);
+      src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+      // NV21ToARGB handles the swapped V/U chroma ordering internally.
+      r = NV21ToARGB(src, src_width,
+                     src_uv, aligned_src_width,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_M420:
+      src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
+      r = M420ToARGB(src, src_width,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+//    case FOURCC_Q420:
+//      src = sample + (src_width + aligned_src_width * 2) * crop_y + crop_x;
+//      src_uv = sample + (src_width + aligned_src_width * 2) * crop_y +
+//               src_width + crop_x * 2;
+//      r = Q420ToARGB(src, src_width * 3,
+//                    src_uv, src_width * 3,
+//                    crop_argb, argb_stride,
+//                    crop_width, inv_crop_height);
+//      break;
+    // Triplanar formats
+    case FOURCC_I420:
+    case FOURCC_YU12:
+    case FOURCC_YV12: {
+      const uint8* src_y = sample + (src_width * crop_y + crop_x);
+      const uint8* src_u;
+      const uint8* src_v;
+      int halfwidth = (src_width + 1) / 2;
+      int halfheight = (abs_src_height + 1) / 2;
+      if (format == FOURCC_YV12) {
+        src_v = sample + src_width * abs_src_height +
+            (halfwidth * crop_y + crop_x) / 2;
+        src_u = sample + src_width * abs_src_height +
+            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+      } else {
+        src_u = sample + src_width * abs_src_height +
+            (halfwidth * crop_y + crop_x) / 2;
+        src_v = sample + src_width * abs_src_height +
+            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+      }
+      r = I420ToARGB(src_y, src_width,
+                     src_u, halfwidth,
+                     src_v, halfwidth,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    }
+    case FOURCC_I422:
+    case FOURCC_YV16: {
+      const uint8* src_y = sample + src_width * crop_y + crop_x;
+      const uint8* src_u;
+      const uint8* src_v;
+      int halfwidth = (src_width + 1) / 2;
+      if (format == FOURCC_YV16) {
+        src_v = sample + src_width * abs_src_height +
+            halfwidth * crop_y + crop_x / 2;
+        src_u = sample + src_width * abs_src_height +
+            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+      } else {
+        src_u = sample + src_width * abs_src_height +
+            halfwidth * crop_y + crop_x / 2;
+        src_v = sample + src_width * abs_src_height +
+            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+      }
+      r = I422ToARGB(src_y, src_width,
+                     src_u, halfwidth,
+                     src_v, halfwidth,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    }
+    case FOURCC_I444:
+    case FOURCC_YV24: {
+      const uint8* src_y = sample + src_width * crop_y + crop_x;
+      const uint8* src_u;
+      const uint8* src_v;
+      if (format == FOURCC_YV24) {
+        src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
+        src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+      } else {
+        src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+        src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+      }
+      r = I444ToARGB(src_y, src_width,
+                     src_u, src_width,
+                     src_v, src_width,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    }
+    case FOURCC_I411: {
+      int quarterwidth = (src_width + 3) / 4;
+      const uint8* src_y = sample + src_width * crop_y + crop_x;
+      const uint8* src_u = sample + src_width * abs_src_height +
+          quarterwidth * crop_y + crop_x / 4;
+      const uint8* src_v = sample + src_width * abs_src_height +
+          quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
+      r = I411ToARGB(src_y, src_width,
+                     src_u, quarterwidth,
+                     src_v, quarterwidth,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    }
+#ifdef HAVE_JPEG
+    case FOURCC_MJPG:
+      r = MJPGToARGB(sample, sample_size,
+                     crop_argb, argb_stride,
+                     src_width, abs_src_height, crop_width, inv_crop_height);
+      break;
+#endif
+    default:
+      r = -1;  // unknown fourcc - return failure code.
+  }
+
+  if (need_buf) {
+    if (!r) {
+      r = ARGBRotate(crop_argb, argb_stride,
+                     tmp_argb, tmp_argb_stride,
+                     crop_width, abs_crop_height, rotation);
+    }
+    free(rotate_buffer);
+  }
+
+  return r;
+}
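+
+// Example (hypothetical caller converting a full YUY2 frame, no cropping or
+// rotation; kRotate0 comes from libyuv/rotate.h):
+//   uint8 argb[640 * 480 * 4];
+//   ConvertToARGB(yuy2_frame, 640 * 480 * 2,
+//                 argb, 640 * 4,
+//                 0, 0,      // crop_x, crop_y
+//                 640, 480,  // src_width, src_height
+//                 640, 480,  // crop_width, crop_height
+//                 kRotate0, FOURCC_YUY2);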
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
--- /dev/null
+++ b/third_party/libyuv/source/convert_to_i420.cc
@@ -1,0 +1,383 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "libyuv/convert.h"
+
+#include "libyuv/format_conversion.h"
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Convert camera sample to I420 with cropping, rotation and vertical flip.
+// src_width is used for source stride computation
+// src_height is used to compute location of planes, and indicate inversion
+// sample_size is measured in bytes and is the size of the frame.
+//   With MJPEG it is the compressed size of the frame.
+LIBYUV_API
+int ConvertToI420(const uint8* sample,
+                  size_t sample_size,
+                  uint8* y, int y_stride,
+                  uint8* u, int u_stride,
+                  uint8* v, int v_stride,
+                  int crop_x, int crop_y,
+                  int src_width, int src_height,
+                  int crop_width, int crop_height,
+                  enum RotationMode rotation,
+                  uint32 fourcc) {
+  uint32 format = CanonicalFourCC(fourcc);
+  int aligned_src_width = (src_width + 1) & ~1;
+  const uint8* src;
+  const uint8* src_uv;
+  int abs_src_height = (src_height < 0) ? -src_height : src_height;
+  int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+  int r = 0;
+  LIBYUV_BOOL need_buf = (rotation && format != FOURCC_I420 &&
+      format != FOURCC_NV12 && format != FOURCC_NV21 &&
+      format != FOURCC_YU12 && format != FOURCC_YV12) || y == sample;
+  uint8* tmp_y = y;
+  uint8* tmp_u = u;
+  uint8* tmp_v = v;
+  int tmp_y_stride = y_stride;
+  int tmp_u_stride = u_stride;
+  int tmp_v_stride = v_stride;
+  uint8* rotate_buffer = NULL;
+  int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+
+  if (!y || !u || !v || !sample ||
+      src_width <= 0 || crop_width <= 0  ||
+      src_height == 0 || crop_height == 0) {
+    return -1;
+  }
+  if (src_height < 0) {
+    inv_crop_height = -inv_crop_height;
+  }
+
+  // One-pass rotation is available for some formats. For the rest, convert
+  // to I420 (with optional vertical flipping) into a temporary I420 buffer,
+  // and then rotate the I420 to the final destination buffer.
+  // For in-place conversion, if the destination y is the same as the source
+  // sample, also enable the temporary buffer.
+  if (need_buf) {
+    int y_size = crop_width * abs_crop_height;
+    int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2);
+    rotate_buffer = (uint8*)malloc(y_size + uv_size * 2);
+    if (!rotate_buffer) {
+      return 1;  // Out of memory runtime error.
+    }
+    y = rotate_buffer;
+    u = y + y_size;
+    v = u + uv_size;
+    y_stride = crop_width;
+    u_stride = v_stride = ((crop_width + 1) / 2);
+  }
+
+  switch (format) {
+    // Single plane formats
+    case FOURCC_YUY2:
+      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+      r = YUY2ToI420(src, aligned_src_width * 2,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_UYVY:
+      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+      r = UYVYToI420(src, aligned_src_width * 2,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_RGBP:
+      src = sample + (src_width * crop_y + crop_x) * 2;
+      r = RGB565ToI420(src, src_width * 2,
+                       y, y_stride,
+                       u, u_stride,
+                       v, v_stride,
+                       crop_width, inv_crop_height);
+      break;
+    case FOURCC_RGBO:
+      src = sample + (src_width * crop_y + crop_x) * 2;
+      r = ARGB1555ToI420(src, src_width * 2,
+                         y, y_stride,
+                         u, u_stride,
+                         v, v_stride,
+                         crop_width, inv_crop_height);
+      break;
+    case FOURCC_R444:
+      src = sample + (src_width * crop_y + crop_x) * 2;
+      r = ARGB4444ToI420(src, src_width * 2,
+                         y, y_stride,
+                         u, u_stride,
+                         v, v_stride,
+                         crop_width, inv_crop_height);
+      break;
+    case FOURCC_24BG:
+      src = sample + (src_width * crop_y + crop_x) * 3;
+      r = RGB24ToI420(src, src_width * 3,
+                      y, y_stride,
+                      u, u_stride,
+                      v, v_stride,
+                      crop_width, inv_crop_height);
+      break;
+    case FOURCC_RAW:
+      src = sample + (src_width * crop_y + crop_x) * 3;
+      r = RAWToI420(src, src_width * 3,
+                    y, y_stride,
+                    u, u_stride,
+                    v, v_stride,
+                    crop_width, inv_crop_height);
+      break;
+    case FOURCC_ARGB:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = ARGBToI420(src, src_width * 4,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_BGRA:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = BGRAToI420(src, src_width * 4,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_ABGR:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = ABGRToI420(src, src_width * 4,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_RGBA:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = RGBAToI420(src, src_width * 4,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    // TODO(fbarchard): Support cropping Bayer by odd numbers
+    // by adjusting fourcc.
+    case FOURCC_BGGR:
+      src = sample + (src_width * crop_y + crop_x);
+      r = BayerBGGRToI420(src, src_width,
+                          y, y_stride,
+                          u, u_stride,
+                          v, v_stride,
+                          crop_width, inv_crop_height);
+      break;
+    case FOURCC_GBRG:
+      src = sample + (src_width * crop_y + crop_x);
+      r = BayerGBRGToI420(src, src_width,
+                          y, y_stride,
+                          u, u_stride,
+                          v, v_stride,
+                          crop_width, inv_crop_height);
+      break;
+    case FOURCC_GRBG:
+      src = sample + (src_width * crop_y + crop_x);
+      r = BayerGRBGToI420(src, src_width,
+                          y, y_stride,
+                          u, u_stride,
+                          v, v_stride,
+                          crop_width, inv_crop_height);
+      break;
+    case FOURCC_RGGB:
+      src = sample + (src_width * crop_y + crop_x);
+      r = BayerRGGBToI420(src, src_width,
+                          y, y_stride,
+                          u, u_stride,
+                          v, v_stride,
+                          crop_width, inv_crop_height);
+      break;
+    case FOURCC_I400:
+      src = sample + src_width * crop_y + crop_x;
+      r = I400ToI420(src, src_width,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    // Biplanar formats
+    case FOURCC_NV12:
+      src = sample + (src_width * crop_y + crop_x);
+      src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+      r = NV12ToI420Rotate(src, src_width,
+                           src_uv, aligned_src_width,
+                           y, y_stride,
+                           u, u_stride,
+                           v, v_stride,
+                           crop_width, inv_crop_height, rotation);
+      break;
+    case FOURCC_NV21:
+      src = sample + (src_width * crop_y + crop_x);
+      src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+      // Call NV12 but with u and v parameters swapped.
+      r = NV12ToI420Rotate(src, src_width,
+                           src_uv, aligned_src_width,
+                           y, y_stride,
+                           v, v_stride,
+                           u, u_stride,
+                           crop_width, inv_crop_height, rotation);
+      break;
+    case FOURCC_M420:
+      src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
+      r = M420ToI420(src, src_width,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_Q420:
+      src = sample + (src_width + aligned_src_width * 2) * crop_y + crop_x;
+      src_uv = sample + (src_width + aligned_src_width * 2) * crop_y +
+               src_width + crop_x * 2;
+      r = Q420ToI420(src, src_width * 3,
+                     src_uv, src_width * 3,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    // Triplanar formats
+    case FOURCC_I420:
+    case FOURCC_YU12:
+    case FOURCC_YV12: {
+      const uint8* src_y = sample + (src_width * crop_y + crop_x);
+      const uint8* src_u;
+      const uint8* src_v;
+      int halfwidth = (src_width + 1) / 2;
+      int halfheight = (abs_src_height + 1) / 2;
+      if (format == FOURCC_YV12) {
+        src_v = sample + src_width * abs_src_height +
+            (halfwidth * crop_y + crop_x) / 2;
+        src_u = sample + src_width * abs_src_height +
+            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+      } else {
+        src_u = sample + src_width * abs_src_height +
+            (halfwidth * crop_y + crop_x) / 2;
+        src_v = sample + src_width * abs_src_height +
+            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+      }
+      r = I420Rotate(src_y, src_width,
+                     src_u, halfwidth,
+                     src_v, halfwidth,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height, rotation);
+      break;
+    }
+    case FOURCC_I422:
+    case FOURCC_YV16: {
+      const uint8* src_y = sample + src_width * crop_y + crop_x;
+      const uint8* src_u;
+      const uint8* src_v;
+      int halfwidth = (src_width + 1) / 2;
+      if (format == FOURCC_YV16) {
+        src_v = sample + src_width * abs_src_height +
+            halfwidth * crop_y + crop_x / 2;
+        src_u = sample + src_width * abs_src_height +
+            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+      } else {
+        src_u = sample + src_width * abs_src_height +
+            halfwidth * crop_y + crop_x / 2;
+        src_v = sample + src_width * abs_src_height +
+            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+      }
+      r = I422ToI420(src_y, src_width,
+                     src_u, halfwidth,
+                     src_v, halfwidth,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    }
+    case FOURCC_I444:
+    case FOURCC_YV24: {
+      const uint8* src_y = sample + src_width * crop_y + crop_x;
+      const uint8* src_u;
+      const uint8* src_v;
+      if (format == FOURCC_YV24) {
+        src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
+        src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+      } else {
+        src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+        src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+      }
+      r = I444ToI420(src_y, src_width,
+                     src_u, src_width,
+                     src_v, src_width,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    }
+    case FOURCC_I411: {
+      int quarterwidth = (src_width + 3) / 4;
+      const uint8* src_y = sample + src_width * crop_y + crop_x;
+      const uint8* src_u = sample + src_width * abs_src_height +
+          quarterwidth * crop_y + crop_x / 4;
+      const uint8* src_v = sample + src_width * abs_src_height +
+          quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
+      r = I411ToI420(src_y, src_width,
+                     src_u, quarterwidth,
+                     src_v, quarterwidth,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    }
+#ifdef HAVE_JPEG
+    case FOURCC_MJPG:
+      r = MJPGToI420(sample, sample_size,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     src_width, abs_src_height, crop_width, inv_crop_height);
+      break;
+#endif
+    default:
+      r = -1;  // unknown fourcc - return failure code.
+  }
+
+  if (need_buf) {
+    if (!r) {
+      r = I420Rotate(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     tmp_y, tmp_y_stride,
+                     tmp_u, tmp_u_stride,
+                     tmp_v, tmp_v_stride,
+                     crop_width, abs_crop_height, rotation);
+    }
+    free(rotate_buffer);
+  }
+
+  return r;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
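
All of the packed-format cases above compute the crop origin the same way:
sample + (row width in pixels * crop_y + crop_x) * bytes per pixel, with
aligned_src_width standing in for the row width where YUY2/UYVY macropixels
force even alignment. A minimal sketch of that arithmetic (names are
illustrative, not libyuv API):

    #include <stdint.h>

    // Byte address of pixel (crop_x, crop_y) in a packed image whose rows
    // hold row_width pixels of bpp bytes each.
    static const uint8_t* CropOrigin(const uint8_t* sample, int row_width,
                                     int bpp, int crop_x, int crop_y) {
      return sample + (row_width * crop_y + crop_x) * bpp;
    }
    // FOURCC_YUY2 above: CropOrigin(sample, aligned_src_width, 2, crop_x, crop_y)
    // FOURCC_ARGB above: CropOrigin(sample, src_width, 4, crop_x, crop_y)
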
--- a/third_party/libyuv/source/cpu_id.cc
+++ b/third_party/libyuv/source/cpu_id.cc
@@ -8,9 +8,9 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "third_party/libyuv/include/libyuv/cpu_id.h"
+#include "libyuv/cpu_id.h"
 
-#ifdef _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
 #include <intrin.h>  // For __cpuidex()
 #endif
 #if !defined(__pnacl__) && !defined(__CLR_VER) && \
@@ -27,7 +27,7 @@
 #include <stdio.h>
 #include <string.h>
 
-#include "third_party/libyuv/include/libyuv/basic_types.h"  // For CPU_X86
+#include "libyuv/basic_types.h"  // For CPU_X86
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -48,7 +48,7 @@
     defined(__i386__) || defined(__x86_64__))
 LIBYUV_API
 void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) && !defined(__clang__)
 #if (_MSC_FULL_VER >= 160040219)
   __cpuidex((int*)(cpu_info), info_eax, info_ecx);
 #elif defined(_M_IX86)
@@ -188,10 +188,14 @@
 int InitCpuFlags(void) {
 #if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86)
 
+  uint32 cpu_info0[4] = { 0, 0, 0, 0 };
   uint32 cpu_info1[4] = { 0, 0, 0, 0 };
   uint32 cpu_info7[4] = { 0, 0, 0, 0 };
+  CpuId(0, 0, cpu_info0);
   CpuId(1, 0, cpu_info1);
-  CpuId(7, 0, cpu_info7);
+  if (cpu_info0[0] >= 7) {
+    CpuId(7, 0, cpu_info7);
+  }
   cpu_info_ = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
               ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
               ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
@@ -199,6 +203,7 @@
               ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) |
               ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
               kCpuHasX86;
+
 #ifdef HAS_XGETBV
   if ((cpu_info1[2] & 0x18000000) == 0x18000000 &&  // AVX and OSSave
       TestOsSaveYmm()) {  // Saves YMM.
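
The cpu_id.cc change above adds a guard before querying CPUID leaf 7: leaf 0
returns the highest supported standard leaf in EAX, and querying a leaf beyond
that can return stale or garbage data on older CPUs, which would misreport the
AVX2/ERMS feature bits. The pattern, using the CpuId() wrapper declared in
this file:

    uint32 cpu_info0[4] = { 0, 0, 0, 0 };
    uint32 cpu_info7[4] = { 0, 0, 0, 0 };
    CpuId(0, 0, cpu_info0);     // cpu_info0[0] = highest supported leaf.
    if (cpu_info0[0] >= 7) {    // Leaf 7 (AVX2/ERMS feature bits) exists.
      CpuId(7, 0, cpu_info7);
    }
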
--- /dev/null
+++ b/third_party/libyuv/source/format_conversion.cc
@@ -1,0 +1,552 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/format_conversion.h"
+
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/video_common.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Generate a selector mask useful for pshufb.
+static uint32 GenerateSelector(int select0, int select1) {
+  return (uint32)(select0) |
+         (uint32)((select1 + 4) << 8) |
+         (uint32)((select0 + 8) << 16) |
+         (uint32)((select1 + 12) << 24);
+}
+
+static int MakeSelectors(const int blue_index,
+                         const int green_index,
+                         const int red_index,
+                         uint32 dst_fourcc_bayer,
+                         uint32* index_map) {
+  // Now build a lookup table containing the indices for the four pixels in each
+  // 2x2 Bayer grid.
+  switch (dst_fourcc_bayer) {
+    case FOURCC_BGGR:
+      index_map[0] = GenerateSelector(blue_index, green_index);
+      index_map[1] = GenerateSelector(green_index, red_index);
+      break;
+    case FOURCC_GBRG:
+      index_map[0] = GenerateSelector(green_index, blue_index);
+      index_map[1] = GenerateSelector(red_index, green_index);
+      break;
+    case FOURCC_RGGB:
+      index_map[0] = GenerateSelector(red_index, green_index);
+      index_map[1] = GenerateSelector(green_index, blue_index);
+      break;
+    case FOURCC_GRBG:
+      index_map[0] = GenerateSelector(green_index, red_index);
+      index_map[1] = GenerateSelector(blue_index, green_index);
+      break;
+    default:
+      return -1;  // Bad FourCC
+  }
+  return 0;
+}
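
Worked example, assuming the ARGB byte offsets used below (B=0, G=1, R=2): for
FOURCC_BGGR, index_map[0] is GenerateSelector(0, 1):

    //   (0) | ((1 + 4) << 8) | ((0 + 8) << 16) | ((1 + 12) << 24) = 0x0D080500
    // Little-endian bytes: 0x00 0x05 0x08 0x0D -- byte indices 0, 5, 8, 13 of
    // four packed ARGB pixels, i.e. B of pixel 0, G of pixel 1, B of pixel 2,
    // G of pixel 3: the B,G,B,G pattern of a BGGR even row. Each 4-byte
    // selector thus picks 4 Bayer bytes out of every 16 ARGB bytes (4 pixels).
    uint32 selector = GenerateSelector(0, 1);  // == 0x0D080500
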
+
+// Converts 32 bit ARGB to Bayer RGB formats.
+LIBYUV_API
+int ARGBToBayer(const uint8* src_argb, int src_stride_argb,
+                uint8* dst_bayer, int dst_stride_bayer,
+                int width, int height,
+                uint32 dst_fourcc_bayer) {
+  int y;
+  const int blue_index = 0;  // Offsets for ARGB format
+  const int green_index = 1;
+  const int red_index = 2;
+  uint32 index_map[2];
+  void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer,
+                         uint32 selector, int pix) = ARGBToBayerRow_C;
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+#if defined(HAS_ARGBTOBAYERROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 &&
+      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+    ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToBayerRow = ARGBToBayerRow_SSSE3;
+    }
+  }
+#elif defined(HAS_ARGBTOBAYERROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBToBayerRow = ARGBToBayerRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToBayerRow = ARGBToBayerRow_NEON;
+    }
+  }
+#endif
+  if (MakeSelectors(blue_index, green_index, red_index,
+                    dst_fourcc_bayer, index_map)) {
+    return -1;  // Bad FourCC
+  }
+
+  for (y = 0; y < height; ++y) {
+    ARGBToBayerRow(src_argb, dst_bayer, index_map[y & 1], width);
+    src_argb += src_stride_argb;
+    dst_bayer += dst_stride_bayer;
+  }
+  return 0;
+}
+
+#define AVG(a, b) (((a) + (b)) >> 1)
+
+static void BayerRowBG(const uint8* src_bayer0, int src_stride_bayer,
+                       uint8* dst_argb, int pix) {
+  const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
+  uint8 g = src_bayer0[1];
+  uint8 r = src_bayer1[1];
+  int x;
+  for (x = 0; x < pix - 2; x += 2) {
+    dst_argb[0] = src_bayer0[0];
+    dst_argb[1] = AVG(g, src_bayer0[1]);
+    dst_argb[2] = AVG(r, src_bayer1[1]);
+    dst_argb[3] = 255U;
+    dst_argb[4] = AVG(src_bayer0[0], src_bayer0[2]);
+    dst_argb[5] = src_bayer0[1];
+    dst_argb[6] = src_bayer1[1];
+    dst_argb[7] = 255U;
+    g = src_bayer0[1];
+    r = src_bayer1[1];
+    src_bayer0 += 2;
+    src_bayer1 += 2;
+    dst_argb += 8;
+  }
+  dst_argb[0] = src_bayer0[0];
+  dst_argb[1] = AVG(g, src_bayer0[1]);
+  dst_argb[2] = AVG(r, src_bayer1[1]);
+  dst_argb[3] = 255U;
+  if (!(pix & 1)) {
+    dst_argb[4] = src_bayer0[0];
+    dst_argb[5] = src_bayer0[1];
+    dst_argb[6] = src_bayer1[1];
+    dst_argb[7] = 255U;
+  }
+}
+
+static void BayerRowRG(const uint8* src_bayer0, int src_stride_bayer,
+                       uint8* dst_argb, int pix) {
+  const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
+  uint8 g = src_bayer0[1];
+  uint8 b = src_bayer1[1];
+  int x;
+  for (x = 0; x < pix - 2; x += 2) {
+    dst_argb[0] = AVG(b, src_bayer1[1]);
+    dst_argb[1] = AVG(g, src_bayer0[1]);
+    dst_argb[2] = src_bayer0[0];
+    dst_argb[3] = 255U;
+    dst_argb[4] = src_bayer1[1];
+    dst_argb[5] = src_bayer0[1];
+    dst_argb[6] = AVG(src_bayer0[0], src_bayer0[2]);
+    dst_argb[7] = 255U;
+    g = src_bayer0[1];
+    b = src_bayer1[1];
+    src_bayer0 += 2;
+    src_bayer1 += 2;
+    dst_argb += 8;
+  }
+  dst_argb[0] = AVG(b, src_bayer1[1]);
+  dst_argb[1] = AVG(g, src_bayer0[1]);
+  dst_argb[2] = src_bayer0[0];
+  dst_argb[3] = 255U;
+  if (!(pix & 1)) {
+    dst_argb[4] = src_bayer1[1];
+    dst_argb[5] = src_bayer0[1];
+    dst_argb[6] = src_bayer0[0];
+    dst_argb[7] = 255U;
+  }
+}
+
+static void BayerRowGB(const uint8* src_bayer0, int src_stride_bayer,
+                       uint8* dst_argb, int pix) {
+  const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
+  uint8 b = src_bayer0[1];
+  int x;
+  for (x = 0; x < pix - 2; x += 2) {
+    dst_argb[0] = AVG(b, src_bayer0[1]);
+    dst_argb[1] = src_bayer0[0];
+    dst_argb[2] = src_bayer1[0];
+    dst_argb[3] = 255U;
+    dst_argb[4] = src_bayer0[1];
+    dst_argb[5] = AVG(src_bayer0[0], src_bayer0[2]);
+    dst_argb[6] = AVG(src_bayer1[0], src_bayer1[2]);
+    dst_argb[7] = 255U;
+    b = src_bayer0[1];
+    src_bayer0 += 2;
+    src_bayer1 += 2;
+    dst_argb += 8;
+  }
+  dst_argb[0] = AVG(b, src_bayer0[1]);
+  dst_argb[1] = src_bayer0[0];
+  dst_argb[2] = src_bayer1[0];
+  dst_argb[3] = 255U;
+  if (!(pix & 1)) {
+    dst_argb[4] = src_bayer0[1];
+    dst_argb[5] = src_bayer0[0];
+    dst_argb[6] = src_bayer1[0];
+    dst_argb[7] = 255U;
+  }
+}
+
+static void BayerRowGR(const uint8* src_bayer0, int src_stride_bayer,
+                       uint8* dst_argb, int pix) {
+  const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
+  uint8 r = src_bayer0[1];
+  int x;
+  for (x = 0; x < pix - 2; x += 2) {
+    dst_argb[0] = src_bayer1[0];
+    dst_argb[1] = src_bayer0[0];
+    dst_argb[2] = AVG(r, src_bayer0[1]);
+    dst_argb[3] = 255U;
+    dst_argb[4] = AVG(src_bayer1[0], src_bayer1[2]);
+    dst_argb[5] = AVG(src_bayer0[0], src_bayer0[2]);
+    dst_argb[6] = src_bayer0[1];
+    dst_argb[7] = 255U;
+    r = src_bayer0[1];
+    src_bayer0 += 2;
+    src_bayer1 += 2;
+    dst_argb += 8;
+  }
+  dst_argb[0] = src_bayer1[0];
+  dst_argb[1] = src_bayer0[0];
+  dst_argb[2] = AVG(r, src_bayer0[1]);
+  dst_argb[3] = 255U;
+  if (!(pix & 1)) {
+    dst_argb[4] = src_bayer1[0];
+    dst_argb[5] = src_bayer0[0];
+    dst_argb[6] = src_bayer0[1];
+    dst_argb[7] = 255U;
+  }
+}
+
+// Converts any Bayer RGB format to ARGB.
+LIBYUV_API
+int BayerToARGB(const uint8* src_bayer, int src_stride_bayer,
+                uint8* dst_argb, int dst_stride_argb,
+                int width, int height,
+                uint32 src_fourcc_bayer) {
+  int y;
+  void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer,
+                    uint8* dst_argb, int pix);
+  void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer,
+                    uint8* dst_argb, int pix);
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  switch (src_fourcc_bayer) {
+    case FOURCC_BGGR:
+      BayerRow0 = BayerRowBG;
+      BayerRow1 = BayerRowGR;
+      break;
+    case FOURCC_GBRG:
+      BayerRow0 = BayerRowGB;
+      BayerRow1 = BayerRowRG;
+      break;
+    case FOURCC_GRBG:
+      BayerRow0 = BayerRowGR;
+      BayerRow1 = BayerRowBG;
+      break;
+    case FOURCC_RGGB:
+      BayerRow0 = BayerRowRG;
+      BayerRow1 = BayerRowGB;
+      break;
+    default:
+      return -1;    // Bad FourCC
+  }
+
+  for (y = 0; y < height - 1; y += 2) {
+    BayerRow0(src_bayer, src_stride_bayer, dst_argb, width);
+    BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
+              dst_argb + dst_stride_argb, width);
+    src_bayer += src_stride_bayer * 2;
+    dst_argb += dst_stride_argb * 2;
+  }
+  if (height & 1) {
+    BayerRow0(src_bayer, src_stride_bayer, dst_argb, width);
+  }
+  return 0;
+}
+
+// Converts any Bayer RGB format to I420.
+LIBYUV_API
+int BayerToI420(const uint8* src_bayer, int src_stride_bayer,
+                uint8* dst_y, int dst_stride_y,
+                uint8* dst_u, int dst_stride_u,
+                uint8* dst_v, int dst_stride_v,
+                int width, int height,
+                uint32 src_fourcc_bayer) {
+  void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer,
+                    uint8* dst_argb, int pix);
+  void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer,
+                    uint8* dst_argb, int pix);
+
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+  // Negative height means invert the image.
+  if (height < 0) {
+    int halfheight;
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_u = dst_u + (halfheight - 1) * dst_stride_u;
+    dst_v = dst_v + (halfheight - 1) * dst_stride_v;
+    dst_stride_y = -dst_stride_y;
+    dst_stride_u = -dst_stride_u;
+    dst_stride_v = -dst_stride_v;
+  }
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+        ARGBToYRow = ARGBToYRow_SSSE3;
+      }
+    }
+  }
+#elif defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+    if (width >= 16) {
+      ARGBToUVRow = ARGBToUVRow_Any_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        ARGBToUVRow = ARGBToUVRow_NEON;
+      }
+    }
+  }
+#endif
+
+  switch (src_fourcc_bayer) {
+    case FOURCC_BGGR:
+      BayerRow0 = BayerRowBG;
+      BayerRow1 = BayerRowGR;
+      break;
+    case FOURCC_GBRG:
+      BayerRow0 = BayerRowGB;
+      BayerRow1 = BayerRowRG;
+      break;
+    case FOURCC_GRBG:
+      BayerRow0 = BayerRowGR;
+      BayerRow1 = BayerRowBG;
+      break;
+    case FOURCC_RGGB:
+      BayerRow0 = BayerRowRG;
+      BayerRow1 = BayerRowGB;
+      break;
+    default:
+      return -1;  // Bad FourCC
+  }
+
+  {
+    // Allocate 2 rows of ARGB.
+    const int kRowSize = (width * 4 + 15) & ~15;
+    align_buffer_64(row, kRowSize * 2);
+    int y;
+    for (y = 0; y < height - 1; y += 2) {
+      BayerRow0(src_bayer, src_stride_bayer, row, width);
+      BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
+                row + kRowSize, width);
+      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+      src_bayer += src_stride_bayer * 2;
+      dst_y += dst_stride_y * 2;
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    if (height & 1) {
+      BayerRow0(src_bayer, src_stride_bayer, row, width);
+      ARGBToUVRow(row, 0, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+    }
+    free_aligned_buffer_64(row);
+  }
+  return 0;
+}
+
+// Convert I420 to Bayer.
+LIBYUV_API
+int I420ToBayer(const uint8* src_y, int src_stride_y,
+                const uint8* src_u, int src_stride_u,
+                const uint8* src_v, int src_stride_v,
+                uint8* dst_bayer, int dst_stride_bayer,
+                int width, int height,
+                uint32 dst_fourcc_bayer) {
+  void (*I422ToARGBRow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = I422ToARGBRow_C;
+  void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer,
+                         uint32 selector, int pix) = ARGBToBayerRow_C;
+  const int blue_index = 0;  // Offsets for ARGB format
+  const int green_index = 1;
+  const int red_index = 2;
+  uint32 index_map[2];
+  // Negative height means invert the image.
+  if (height < 0) {
+    int halfheight;
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+#if defined(HAS_I422TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
+    I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToARGBRow = I422ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    I422ToARGBRow = I422ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
+      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) {
+    I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
+  }
+#endif
+
+#if defined(HAS_ARGBTOBAYERROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+    ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToBayerRow = ARGBToBayerRow_SSSE3;
+    }
+  }
+#elif defined(HAS_ARGBTOBAYERROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBToBayerRow = ARGBToBayerRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToBayerRow = ARGBToBayerRow_NEON;
+    }
+  }
+#endif
+
+  if (MakeSelectors(blue_index, green_index, red_index,
+                    dst_fourcc_bayer, index_map)) {
+    return -1;  // Bad FourCC
+  }
+  {
+    // Allocate a row of ARGB.
+    align_buffer_64(row, width * 4);
+    int y;
+    for (y = 0; y < height; ++y) {
+      I422ToARGBRow(src_y, src_u, src_v, row, width);
+      ARGBToBayerRow(row, dst_bayer, index_map[y & 1], width);
+      dst_bayer += dst_stride_bayer;
+      src_y += src_stride_y;
+      if (y & 1) {
+        src_u += src_stride_u;
+        src_v += src_stride_v;
+      }
+    }
+    free_aligned_buffer_64(row);
+  }
+  return 0;
+}
+
+#define MAKEBAYERFOURCC(BAYER)                                                 \
+LIBYUV_API                                                                     \
+int Bayer##BAYER##ToI420(const uint8* src_bayer, int src_stride_bayer,         \
+                         uint8* dst_y, int dst_stride_y,                       \
+                         uint8* dst_u, int dst_stride_u,                       \
+                         uint8* dst_v, int dst_stride_v,                       \
+                         int width, int height) {                              \
+  return BayerToI420(src_bayer, src_stride_bayer,                              \
+                     dst_y, dst_stride_y,                                      \
+                     dst_u, dst_stride_u,                                      \
+                     dst_v, dst_stride_v,                                      \
+                     width, height,                                            \
+                     FOURCC_##BAYER);                                          \
+}                                                                              \
+                                                                               \
+LIBYUV_API                                                                     \
+int I420ToBayer##BAYER(const uint8* src_y, int src_stride_y,                   \
+                       const uint8* src_u, int src_stride_u,                   \
+                       const uint8* src_v, int src_stride_v,                   \
+                       uint8* dst_bayer, int dst_stride_bayer,                 \
+                       int width, int height) {                                \
+  return I420ToBayer(src_y, src_stride_y,                                      \
+                     src_u, src_stride_u,                                      \
+                     src_v, src_stride_v,                                      \
+                     dst_bayer, dst_stride_bayer,                              \
+                     width, height,                                            \
+                     FOURCC_##BAYER);                                          \
+}                                                                              \
+                                                                               \
+LIBYUV_API                                                                     \
+int ARGBToBayer##BAYER(const uint8* src_argb, int src_stride_argb,             \
+                       uint8* dst_bayer, int dst_stride_bayer,                 \
+                       int width, int height) {                                \
+  return ARGBToBayer(src_argb, src_stride_argb,                                \
+                     dst_bayer, dst_stride_bayer,                              \
+                     width, height,                                            \
+                     FOURCC_##BAYER);                                          \
+}                                                                              \
+                                                                               \
+LIBYUV_API                                                                     \
+int Bayer##BAYER##ToARGB(const uint8* src_bayer, int src_stride_bayer,         \
+                         uint8* dst_argb, int dst_stride_argb,                 \
+                         int width, int height) {                              \
+  return BayerToARGB(src_bayer, src_stride_bayer,                              \
+                     dst_argb, dst_stride_argb,                                \
+                     width, height,                                            \
+                     FOURCC_##BAYER);                                          \
+}
+
+MAKEBAYERFOURCC(BGGR)
+MAKEBAYERFOURCC(GBRG)
+MAKEBAYERFOURCC(GRBG)
+MAKEBAYERFOURCC(RGGB)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
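
The macro block above stamps out four wrappers per Bayer pattern. A usage
sketch for one of them (buffer allocation and dimensions are illustrative, not
part of the patch):

    #include "libyuv/format_conversion.h"

    // Convert a w x h BGGR mosaic to I420 with conventional strides:
    // full-width luma, half-width chroma. Returns 0 on success.
    int MosaicToI420(const uint8* bayer, int w, int h,
                     uint8* y, uint8* u, uint8* v) {
      return BayerBGGRToI420(bayer, w,
                             y, w,
                             u, (w + 1) / 2,
                             v, (w + 1) / 2,
                             w, h);
    }
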
--- /dev/null
+++ b/third_party/libyuv/source/mjpeg_decoder.cc
@@ -1,0 +1,566 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/mjpeg_decoder.h"
+
+#ifdef HAVE_JPEG
+#include <assert.h>
+
+#if !defined(__pnacl__) && !defined(__CLR_VER) && !defined(COVERAGE_ENABLED) &&\
+    !defined(TARGET_IPHONE_SIMULATOR)
+// Must be included before jpeglib.
+#include <setjmp.h>
+#define HAVE_SETJMP
+#endif
+struct FILE;  // For jpeglib.h.
+
+// C++ build requires extern C for jpeg internals.
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <jpeglib.h>
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#include "libyuv/planar_functions.h"  // For CopyPlane().
+
+namespace libyuv {
+
+#ifdef HAVE_SETJMP
+struct SetJmpErrorMgr {
+  jpeg_error_mgr base;  // Must be at the top
+  jmp_buf setjmp_buffer;
+};
+#endif
+
+const int MJpegDecoder::kColorSpaceUnknown = JCS_UNKNOWN;
+const int MJpegDecoder::kColorSpaceGrayscale = JCS_GRAYSCALE;
+const int MJpegDecoder::kColorSpaceRgb = JCS_RGB;
+const int MJpegDecoder::kColorSpaceYCbCr = JCS_YCbCr;
+const int MJpegDecoder::kColorSpaceCMYK = JCS_CMYK;
+const int MJpegDecoder::kColorSpaceYCCK = JCS_YCCK;
+
+// Methods that are passed to jpeglib.
+boolean fill_input_buffer(jpeg_decompress_struct* cinfo);
+void init_source(jpeg_decompress_struct* cinfo);
+void skip_input_data(jpeg_decompress_struct* cinfo,
+                     long num_bytes);  // NOLINT
+void term_source(jpeg_decompress_struct* cinfo);
+void ErrorHandler(jpeg_common_struct* cinfo);
+
+MJpegDecoder::MJpegDecoder()
+    : has_scanline_padding_(LIBYUV_FALSE),
+      num_outbufs_(0),
+      scanlines_(NULL),
+      scanlines_sizes_(NULL),
+      databuf_(NULL),
+      databuf_strides_(NULL) {
+  decompress_struct_ = new jpeg_decompress_struct;
+  source_mgr_ = new jpeg_source_mgr;
+#ifdef HAVE_SETJMP
+  error_mgr_ = new SetJmpErrorMgr;
+  decompress_struct_->err = jpeg_std_error(&error_mgr_->base);
+  // Override standard exit()-based error handler.
+  error_mgr_->base.error_exit = &ErrorHandler;
+#endif
+  decompress_struct_->client_data = NULL;
+  source_mgr_->init_source = &init_source;
+  source_mgr_->fill_input_buffer = &fill_input_buffer;
+  source_mgr_->skip_input_data = &skip_input_data;
+  source_mgr_->resync_to_restart = &jpeg_resync_to_restart;
+  source_mgr_->term_source = &term_source;
+  jpeg_create_decompress(decompress_struct_);
+  decompress_struct_->src = source_mgr_;
+  buf_vec_.buffers = &buf_;
+  buf_vec_.len = 1;
+}
+
+MJpegDecoder::~MJpegDecoder() {
+  jpeg_destroy_decompress(decompress_struct_);
+  delete decompress_struct_;
+  delete source_mgr_;
+#ifdef HAVE_SETJMP
+  delete error_mgr_;
+#endif
+  DestroyOutputBuffers();
+}
+
+LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) {
+  if (!ValidateJpeg(src, src_len)) {
+    return LIBYUV_FALSE;
+  }
+
+  buf_.data = src;
+  buf_.len = (int)(src_len);
+  buf_vec_.pos = 0;
+  decompress_struct_->client_data = &buf_vec_;
+#ifdef HAVE_SETJMP
+  if (setjmp(error_mgr_->setjmp_buffer)) {
+    // We called jpeg_read_header, it experienced an error, and we called
+    // longjmp() and rewound the stack to here. Return error.
+    return LIBYUV_FALSE;
+  }
+#endif
+  if (jpeg_read_header(decompress_struct_, TRUE) != JPEG_HEADER_OK) {
+    // ERROR: Bad MJPEG header
+    return LIBYUV_FALSE;
+  }
+  AllocOutputBuffers(GetNumComponents());
+  for (int i = 0; i < num_outbufs_; ++i) {
+    int scanlines_size = GetComponentScanlinesPerImcuRow(i);
+    if (scanlines_sizes_[i] != scanlines_size) {
+      if (scanlines_[i]) {
+        delete [] scanlines_[i];
+      }
+      scanlines_[i] = new uint8* [scanlines_size];
+      scanlines_sizes_[i] = scanlines_size;
+    }
+
+    // We allocate padding for the final scanline to pad it up to DCTSIZE bytes
+    // to avoid memory errors, since jpeglib only reads full MCU blocks. For
+    // the preceding scanlines, the padding is not needed/wanted because the
+    // following addresses will already be valid (they are the initial bytes of
+    // the next scanline) and will be overwritten when jpeglib writes out that
+    // next scanline.
+    int databuf_stride = GetComponentStride(i);
+    int databuf_size = scanlines_size * databuf_stride;
+    if (databuf_strides_[i] != databuf_stride) {
+      if (databuf_[i]) {
+        delete [] databuf_[i];
+      }
+      databuf_[i] = new uint8[databuf_size];
+      databuf_strides_[i] = databuf_stride;
+    }
+
+    if (GetComponentStride(i) != GetComponentWidth(i)) {
+      has_scanline_padding_ = LIBYUV_TRUE;
+    }
+  }
+  return LIBYUV_TRUE;
+}
+
+static int DivideAndRoundUp(int numerator, int denominator) {
+  return (numerator + denominator - 1) / denominator;
+}
+
+static int DivideAndRoundDown(int numerator, int denominator) {
+  return numerator / denominator;
+}
+
+// Returns width of the last loaded frame.
+int MJpegDecoder::GetWidth() {
+  return decompress_struct_->image_width;
+}
+
+// Returns height of the last loaded frame.
+int MJpegDecoder::GetHeight() {
+  return decompress_struct_->image_height;
+}
+
+// Returns format of the last loaded frame. The return value is one of the
+// kColorSpace* constants.
+int MJpegDecoder::GetColorSpace() {
+  return decompress_struct_->jpeg_color_space;
+}
+
+// Number of color components in the color space.
+int MJpegDecoder::GetNumComponents() {
+  return decompress_struct_->num_components;
+}
+
+// Sample factors of the n-th component.
+int MJpegDecoder::GetHorizSampFactor(int component) {
+  return decompress_struct_->comp_info[component].h_samp_factor;
+}
+
+int MJpegDecoder::GetVertSampFactor(int component) {
+  return decompress_struct_->comp_info[component].v_samp_factor;
+}
+
+int MJpegDecoder::GetHorizSubSampFactor(int component) {
+  return decompress_struct_->max_h_samp_factor /
+      GetHorizSampFactor(component);
+}
+
+int MJpegDecoder::GetVertSubSampFactor(int component) {
+  return decompress_struct_->max_v_samp_factor /
+      GetVertSampFactor(component);
+}
+
+int MJpegDecoder::GetImageScanlinesPerImcuRow() {
+  return decompress_struct_->max_v_samp_factor * DCTSIZE;
+}
+
+int MJpegDecoder::GetComponentScanlinesPerImcuRow(int component) {
+  int vs = GetVertSubSampFactor(component);
+  return DivideAndRoundUp(GetImageScanlinesPerImcuRow(), vs);
+}
+
+int MJpegDecoder::GetComponentWidth(int component) {
+  int hs = GetHorizSubSampFactor(component);
+  return DivideAndRoundUp(GetWidth(), hs);
+}
+
+int MJpegDecoder::GetComponentHeight(int component) {
+  int vs = GetVertSubSampFactor(component);
+  return DivideAndRoundUp(GetHeight(), vs);
+}
+
+// Get width in bytes padded out to a multiple of DCTSIZE
+int MJpegDecoder::GetComponentStride(int component) {
+  return (GetComponentWidth(component) + DCTSIZE - 1) & ~(DCTSIZE - 1);
+}
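
GetComponentStride() rounds the component width up to the next multiple of
DCTSIZE (8 in standard jpeglib builds), which is what makes the
scanline-padding bookkeeping above necessary. The arithmetic, worked once:

    // (width + DCTSIZE - 1) & ~(DCTSIZE - 1), with DCTSIZE == 8:
    //   width = 100  ->  (100 + 7) & ~7  ==  104
    // so a 100-pixel-wide component is stored with a 104-byte stride, and
    // LoadFrame() sets has_scanline_padding_ because stride != width.
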
+
+int MJpegDecoder::GetComponentSize(int component) {
+  return GetComponentWidth(component) * GetComponentHeight(component);
+}
+
+LIBYUV_BOOL MJpegDecoder::UnloadFrame() {
+#ifdef HAVE_SETJMP
+  if (setjmp(error_mgr_->setjmp_buffer)) {
+    // We called jpeg_abort_decompress, it experienced an error, and we called
+    // longjmp() and rewound the stack to here. Return error.
+    return LIBYUV_FALSE;
+  }
+#endif
+  jpeg_abort_decompress(decompress_struct_);
+  return LIBYUV_TRUE;
+}
+
+// TODO(fbarchard): Allow rectangle to be specified: x, y, width, height.
+LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(
+    uint8** planes, int dst_width, int dst_height) {
+  if (dst_width != GetWidth() ||
+      dst_height > GetHeight()) {
+    // ERROR: Bad dimensions
+    return LIBYUV_FALSE;
+  }
+#ifdef HAVE_SETJMP
+  if (setjmp(error_mgr_->setjmp_buffer)) {
+    // We called into jpeglib, it experienced an error sometime during this
+    // function call, and we called longjmp() and rewound the stack to here.
+    // Return error.
+    return LIBYUV_FALSE;
+  }
+#endif
+  if (!StartDecode()) {
+    return LIBYUV_FALSE;
+  }
+  SetScanlinePointers(databuf_);
+  int lines_left = dst_height;
+  // Compute the number of lines to skip to implement vertical crop.
+  // TODO(fbarchard): Ensure skip is a multiple of the maximum component
+  // subsample, i.e. 2.
+  int skip = (GetHeight() - dst_height) / 2;
+  if (skip > 0) {
+    // There is no API to skip lines in the output data, so we read them
+    // into the temp buffer.
+    while (skip >= GetImageScanlinesPerImcuRow()) {
+      if (!DecodeImcuRow()) {
+        FinishDecode();
+        return LIBYUV_FALSE;
+      }
+      skip -= GetImageScanlinesPerImcuRow();
+    }
+    if (skip > 0) {
+      // Have a partial iMCU row left over to skip. Must read it and then
+      // copy the parts we want into the destination.
+      if (!DecodeImcuRow()) {
+        FinishDecode();
+        return LIBYUV_FALSE;
+      }
+      for (int i = 0; i < num_outbufs_; ++i) {
+        // TODO(fbarchard): Compute skip to avoid this
+        assert(skip % GetVertSubSampFactor(i) == 0);
+        int rows_to_skip =
+            DivideAndRoundDown(skip, GetVertSubSampFactor(i));
+        int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i) -
+                                rows_to_skip;
+        int data_to_skip = rows_to_skip * GetComponentStride(i);
+        CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i),
+                  planes[i], GetComponentWidth(i),
+                  GetComponentWidth(i), scanlines_to_copy);
+        planes[i] += scanlines_to_copy * GetComponentWidth(i);
+      }
+      lines_left -= (GetImageScanlinesPerImcuRow() - skip);
+    }
+  }
+
+  // Read full MCUs but cropped horizontally
+  for (; lines_left > GetImageScanlinesPerImcuRow();
+         lines_left -= GetImageScanlinesPerImcuRow()) {
+    if (!DecodeImcuRow()) {
+      FinishDecode();
+      return LIBYUV_FALSE;
+    }
+    for (int i = 0; i < num_outbufs_; ++i) {
+      int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i);
+      CopyPlane(databuf_[i], GetComponentStride(i),
+                planes[i], GetComponentWidth(i),
+                GetComponentWidth(i), scanlines_to_copy);
+      planes[i] += scanlines_to_copy * GetComponentWidth(i);
+    }
+  }
+
+  if (lines_left > 0) {
+    // Have a partial iMCU row left over to decode.
+    if (!DecodeImcuRow()) {
+      FinishDecode();
+      return LIBYUV_FALSE;
+    }
+    for (int i = 0; i < num_outbufs_; ++i) {
+      int scanlines_to_copy =
+          DivideAndRoundUp(lines_left, GetVertSubSampFactor(i));
+      CopyPlane(databuf_[i], GetComponentStride(i),
+                planes[i], GetComponentWidth(i),
+                GetComponentWidth(i), scanlines_to_copy);
+      planes[i] += scanlines_to_copy * GetComponentWidth(i);
+    }
+  }
+  return FinishDecode();
+}
+
+LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque,
+    int dst_width, int dst_height) {
+  if (dst_width != GetWidth() ||
+      dst_height > GetHeight()) {
+    // ERROR: Bad dimensions
+    return LIBYUV_FALSE;
+  }
+#ifdef HAVE_SETJMP
+  if (setjmp(error_mgr_->setjmp_buffer)) {
+    // We called into jpeglib, it experienced an error sometime during this
+    // function call, and we called longjmp() and rewound the stack to here.
+    // Return error.
+    return LIBYUV_FALSE;
+  }
+#endif
+  if (!StartDecode()) {
+    return LIBYUV_FALSE;
+  }
+  SetScanlinePointers(databuf_);
+  int lines_left = dst_height;
+  // TODO(fbarchard): Compute the number of lines to skip to implement
+  // vertical crop.
+  int skip = (GetHeight() - dst_height) / 2;
+  if (skip > 0) {
+    while (skip >= GetImageScanlinesPerImcuRow()) {
+      if (!DecodeImcuRow()) {
+        FinishDecode();
+        return LIBYUV_FALSE;
+      }
+      skip -= GetImageScanlinesPerImcuRow();
+    }
+    if (skip > 0) {
+      // Have a partial iMCU row left over to skip.
+      if (!DecodeImcuRow()) {
+        FinishDecode();
+        return LIBYUV_FALSE;
+      }
+      for (int i = 0; i < num_outbufs_; ++i) {
+        // TODO(fbarchard): Compute skip to avoid this
+        assert(skip % GetVertSubSampFactor(i) == 0);
+        int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
+        int data_to_skip = rows_to_skip * GetComponentStride(i);
+        // Change our own data buffer pointers so we can pass them to the
+        // callback.
+        databuf_[i] += data_to_skip;
+      }
+      int scanlines_to_copy = GetImageScanlinesPerImcuRow() - skip;
+      (*fn)(opaque, databuf_, databuf_strides_, scanlines_to_copy);
+      // Now change them back.
+      for (int i = 0; i < num_outbufs_; ++i) {
+        int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
+        int data_to_skip = rows_to_skip * GetComponentStride(i);
+        databuf_[i] -= data_to_skip;
+      }
+      lines_left -= scanlines_to_copy;
+    }
+  }
+  // Read full MCUs until we get to the crop point.
+  for (; lines_left >= GetImageScanlinesPerImcuRow();
+         lines_left -= GetImageScanlinesPerImcuRow()) {
+    if (!DecodeImcuRow()) {
+      FinishDecode();
+      return LIBYUV_FALSE;
+    }
+    (*fn)(opaque, databuf_, databuf_strides_, GetImageScanlinesPerImcuRow());
+  }
+  if (lines_left > 0) {
+    // Have a partial iMCU row left over to decode.
+    if (!DecodeImcuRow()) {
+      FinishDecode();
+      return LIBYUV_FALSE;
+    }
+    (*fn)(opaque, databuf_, databuf_strides_, lines_left);
+  }
+  return FinishDecode();
+}
+
+void init_source(j_decompress_ptr cinfo) {
+  fill_input_buffer(cinfo);
+}
+
+boolean fill_input_buffer(j_decompress_ptr cinfo) {
+  BufferVector* buf_vec = (BufferVector*)(cinfo->client_data);
+  if (buf_vec->pos >= buf_vec->len) {
+    assert(0 && "No more data");
+    // ERROR: No more data
+    return FALSE;
+  }
+  cinfo->src->next_input_byte = buf_vec->buffers[buf_vec->pos].data;
+  cinfo->src->bytes_in_buffer = buf_vec->buffers[buf_vec->pos].len;
+  ++buf_vec->pos;
+  return TRUE;
+}
+
+void skip_input_data(j_decompress_ptr cinfo,
+                     long num_bytes) {  // NOLINT
+  cinfo->src->next_input_byte += num_bytes;
+}
+
+void term_source(j_decompress_ptr cinfo) {
+  // Nothing to do.
+}
+
+#ifdef HAVE_SETJMP
+void ErrorHandler(j_common_ptr cinfo) {
+  // This is called when a jpeglib command experiences an error. Unfortunately
+  // jpeglib's error handling model is not very flexible, because it expects the
+  // error handler to not return--i.e., it wants the program to terminate. To
+  // recover from errors we use setjmp() as shown in their example. setjmp()
+  // is C's rough analogue of the "call with current continuation" facility
+  // found in some functional programming languages.
+  // A formatted message can be output, but is unsafe for release.
+#ifdef DEBUG
+  char buf[JMSG_LENGTH_MAX];
+  (*cinfo->err->format_message)(cinfo, buf);
+  // ERROR: Error in jpeglib: buf
+#endif
+
+  SetJmpErrorMgr* mgr = (SetJmpErrorMgr*)(cinfo->err);
+  // This rewinds the call stack to the point of the corresponding setjmp()
+  // and causes it to return (for a second time) with value 1.
+  longjmp(mgr->setjmp_buffer, 1);
+}
+#endif
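
The recovery pattern used throughout this file, reduced to a skeleton
(hypothetical names; in the real code the jmp_buf lives in SetJmpErrorMgr and
the handler is installed as jpeglib's error_exit):

    #include <setjmp.h>

    static jmp_buf g_jmp;              // one per decoder in the real code

    static void OnFatalError(void) {   // stands in for ErrorHandler above
      longjmp(g_jmp, 1);               // rewind to the setjmp() below
    }

    static int DecodeGuarded(void) {
      if (setjmp(g_jmp)) {
        return 0;                      // longjmp() landed here: failure
      }
      // ... work that calls OnFatalError() on any fatal error ...
      return 1;                        // reached only if nothing longjmp'd
    }
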
+
+void MJpegDecoder::AllocOutputBuffers(int num_outbufs) {
+  if (num_outbufs != num_outbufs_) {
+    // We could perhaps optimize this case to resize the output buffers without
+    // necessarily having to delete and recreate each one, but it's not worth
+    // it.
+    DestroyOutputBuffers();
+
+    scanlines_ = new uint8** [num_outbufs];
+    scanlines_sizes_ = new int[num_outbufs];
+    databuf_ = new uint8* [num_outbufs];
+    databuf_strides_ = new int[num_outbufs];
+
+    for (int i = 0; i < num_outbufs; ++i) {
+      scanlines_[i] = NULL;
+      scanlines_sizes_[i] = 0;
+      databuf_[i] = NULL;
+      databuf_strides_[i] = 0;
+    }
+
+    num_outbufs_ = num_outbufs;
+  }
+}
+
+void MJpegDecoder::DestroyOutputBuffers() {
+  for (int i = 0; i < num_outbufs_; ++i) {
+    delete [] scanlines_[i];
+    delete [] databuf_[i];
+  }
+  delete [] scanlines_;
+  delete [] databuf_;
+  delete [] scanlines_sizes_;
+  delete [] databuf_strides_;
+  scanlines_ = NULL;
+  databuf_ = NULL;
+  scanlines_sizes_ = NULL;
+  databuf_strides_ = NULL;
+  num_outbufs_ = 0;
+}
+
+// JDCT_IFAST and disabling block smoothing improve performance substantially.
+LIBYUV_BOOL MJpegDecoder::StartDecode() {
+  decompress_struct_->raw_data_out = TRUE;
+  decompress_struct_->dct_method = JDCT_IFAST;  // JDCT_ISLOW is default
+  decompress_struct_->dither_mode = JDITHER_NONE;
+  // Not applicable to 'raw':
+  decompress_struct_->do_fancy_upsampling = (boolean)(LIBYUV_FALSE);
+  // Only for buffered mode:
+  decompress_struct_->enable_2pass_quant = (boolean)(LIBYUV_FALSE);
+  // Blocky but fast:
+  decompress_struct_->do_block_smoothing = (boolean)(LIBYUV_FALSE);
+
+  if (!jpeg_start_decompress(decompress_struct_)) {
+    // ERROR: Couldn't start JPEG decompressor";
+    return LIBYUV_FALSE;
+  }
+  return LIBYUV_TRUE;
+}
+
+LIBYUV_BOOL MJpegDecoder::FinishDecode() {
+  // jpeglib considers it an error if we finish without decoding the whole
+  // image, so we call "abort" rather than "finish".
+  jpeg_abort_decompress(decompress_struct_);
+  return LIBYUV_TRUE;
+}
+
+void MJpegDecoder::SetScanlinePointers(uint8** data) {
+  for (int i = 0; i < num_outbufs_; ++i) {
+    uint8* data_i = data[i];
+    for (int j = 0; j < scanlines_sizes_[i]; ++j) {
+      scanlines_[i][j] = data_i;
+      data_i += GetComponentStride(i);
+    }
+  }
+}
+
+inline LIBYUV_BOOL MJpegDecoder::DecodeImcuRow() {
+  return (unsigned int)(GetImageScanlinesPerImcuRow()) ==
+      jpeg_read_raw_data(decompress_struct_,
+                         scanlines_,
+                         GetImageScanlinesPerImcuRow());
+}
+
+// Helper function that recognizes the jpeg sub-sampling type.
+JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper(
+    int* subsample_x, int* subsample_y, int number_of_components) {
+  if (number_of_components == 3) {  // Color images.
+    if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
+        subsample_x[1] == 2 && subsample_y[1] == 2 &&
+        subsample_x[2] == 2 && subsample_y[2] == 2) {
+      return kJpegYuv420;
+    } else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
+        subsample_x[1] == 2 && subsample_y[1] == 1 &&
+        subsample_x[2] == 2 && subsample_y[2] == 1) {
+      return kJpegYuv422;
+    } else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
+        subsample_x[1] == 1 && subsample_y[1] == 1 &&
+        subsample_x[2] == 1 && subsample_y[2] == 1) {
+      return kJpegYuv444;
+    }
+  } else if (number_of_components == 1) {  // Grey-scale images.
+    if (subsample_x[0] == 1 && subsample_y[0] == 1) {
+      return kJpegYuv400;
+    }
+  }
+  return kJpegUnknown;
+}
+
+}  // namespace libyuv
+#endif  // HAVE_JPEG
+
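A usage sketch tying the class together (plane allocation is left to the
caller and is illustrative; note that DecodeToBuffers() advances the entries
of planes[] as it writes, so pass a scratch copy if the originals are still
needed):

    #include "libyuv/mjpeg_decoder.h"

    // Decode one MJPEG frame into caller-provided planes sized per
    // MJpegDecoder::GetComponentSize(), for the common 3-component case.
    LIBYUV_BOOL DecodeFrame(const uint8* jpg, size_t jpg_size,
                            uint8* planes[3]) {
      libyuv::MJpegDecoder decoder;
      if (!decoder.LoadFrame(jpg, jpg_size)) {
        return LIBYUV_FALSE;  // not a parseable jpeg
      }
      uint8* scratch[3] = { planes[0], planes[1], planes[2] };
      return decoder.DecodeToBuffers(scratch, decoder.GetWidth(),
                                     decoder.GetHeight());
    }
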
--- /dev/null
+++ b/third_party/libyuv/source/mjpeg_validate.cc
@@ -1,0 +1,47 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/mjpeg_decoder.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Helper function to validate that the jpeg appears intact.
+// TODO(fbarchard): Optimize case where SOI is found but EOI is not.
+LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) {
+  size_t i;
+  if (sample_size < 64) {
+    // ERROR: Invalid jpeg size: sample_size
+    return LIBYUV_FALSE;
+  }
+  if (sample[0] != 0xff || sample[1] != 0xd8) {  // Start Of Image
+    // ERROR: Invalid jpeg initial start code
+    return LIBYUV_FALSE;
+  }
+  for (i = sample_size - 2; i > 1;) {
+    if (sample[i] != 0xd9) {
+      if (sample[i] == 0xff && sample[i + 1] == 0xd9) {  // End Of Image
+        return LIBYUV_TRUE;  // Success: Valid jpeg.
+      }
+      --i;
+    }
+    --i;
+  }
+  // ERROR: Invalid jpeg end code not found. Size sample_size
+  return LIBYUV_FALSE;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
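ValidateJpeg() gives callers a cheap pre-flight check: it requires at least
64 bytes, the 0xFF 0xD8 Start-Of-Image marker at offset 0, and a backward
scan that finds a 0xFF 0xD9 End-Of-Image marker. Typical use before
committing to a full decode:

    if (!ValidateJpeg(sample, sample_size)) {
      return LIBYUV_FALSE;  // truncated or non-jpeg payload: skip decoding
    }
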
--- a/third_party/libyuv/source/planar_functions.cc
+++ b/third_party/libyuv/source/planar_functions.cc
@@ -8,15 +8,15 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "third_party/libyuv/include/libyuv/planar_functions.h"
+#include "libyuv/planar_functions.h"
 
 #include <string.h>  // for memset()
 
-#include "third_party/libyuv/include/libyuv/cpu_id.h"
+#include "libyuv/cpu_id.h"
 #ifdef HAVE_JPEG
-#include "third_party/libyuv/include/libyuv/mjpeg_decoder.h"
+#include "libyuv/mjpeg_decoder.h"
 #endif
-#include "third_party/libyuv/include/libyuv/row.h"
+#include "libyuv/row.h"
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -36,6 +36,10 @@
     width *= height;
     height = 1;
     src_stride_y = dst_stride_y = 0;
+  }
+  // Nothing to do.
+  if (src_y == dst_y && src_stride_y == dst_stride_y) {
+    return;
   }
 #if defined(HAS_COPYROW_X86)
   if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
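
The added early return makes an in-place copy a no-op: when the planes alias
and the strides match, CopyPlane() now returns before touching memory, so
callers no longer need their own src == dst guard:

    // Previously copied width * height bytes over themselves; now returns
    // immediately because src == dst and the strides are equal.
    CopyPlane(plane, stride, plane, stride, width, height);
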
--- /dev/null
+++ b/third_party/libyuv/source/rotate.cc
@@ -1,0 +1,1301 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate.h"
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/convert.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+#if defined(__APPLE__) && defined(__i386__)
+#define DECLARE_FUNCTION(name)                                                 \
+    ".text                                     \n"                             \
+    ".private_extern _" #name "                \n"                             \
+    ".align 4,0x90                             \n"                             \
+"_" #name ":                                   \n"
+#elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__)
+#define DECLARE_FUNCTION(name)                                                 \
+    ".text                                     \n"                             \
+    ".align 4,0x90                             \n"                             \
+"_" #name ":                                   \n"
+#else
+#define DECLARE_FUNCTION(name)                                                 \
+    ".text                                     \n"                             \
+    ".align 4,0x90                             \n"                             \
+#name ":                                       \n"
+#endif
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
+    (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+#define HAS_MIRRORROW_NEON
+void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
+#define HAS_MIRRORROW_UV_NEON
+void MirrorUVRow_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width);
+#define HAS_TRANSPOSE_WX8_NEON
+void TransposeWx8_NEON(const uint8* src, int src_stride,
+                       uint8* dst, int dst_stride, int width);
+#define HAS_TRANSPOSE_UVWX8_NEON
+void TransposeUVWx8_NEON(const uint8* src, int src_stride,
+                         uint8* dst_a, int dst_stride_a,
+                         uint8* dst_b, int dst_stride_b,
+                         int width);
+#endif  // defined(__ARM_NEON__)
+
+#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
+    defined(__mips__) && \
+    defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+#define HAS_TRANSPOSE_WX8_MIPS_DSPR2
+void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
+                             uint8* dst, int dst_stride, int width);
+
+void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride,
+                                  uint8* dst, int dst_stride, int width);
+#define HAS_TRANSPOSE_UVWx8_MIPS_DSPR2
+void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
+                               uint8* dst_a, int dst_stride_a,
+                               uint8* dst_b, int dst_stride_b,
+                               int width);
+#endif  // defined(__mips__)
+
+#if !defined(LIBYUV_DISABLE_X86) && \
+    defined(_M_IX86) && defined(_MSC_VER)
+#define HAS_TRANSPOSE_WX8_SSSE3
+__declspec(naked) __declspec(align(16))
+static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
+                               uint8* dst, int dst_stride, int width) {
+  __asm {
+    push      edi
+    push      esi
+    push      ebp
+    mov       eax, [esp + 12 + 4]   // src
+    mov       edi, [esp + 12 + 8]   // src_stride
+    mov       edx, [esp + 12 + 12]  // dst
+    mov       esi, [esp + 12 + 16]  // dst_stride
+    mov       ecx, [esp + 12 + 20]  // width
+
+    // Read in the data from the source pointer.
+    // First round of bit swap.
+    align      4
+ convertloop:
+    movq      xmm0, qword ptr [eax]
+    lea       ebp, [eax + 8]
+    movq      xmm1, qword ptr [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    punpcklbw xmm0, xmm1
+    movq      xmm2, qword ptr [eax]
+    movdqa    xmm1, xmm0
+    palignr   xmm1, xmm1, 8
+    movq      xmm3, qword ptr [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    punpcklbw xmm2, xmm3
+    movdqa    xmm3, xmm2
+    movq      xmm4, qword ptr [eax]
+    palignr   xmm3, xmm3, 8
+    movq      xmm5, qword ptr [eax + edi]
+    punpcklbw xmm4, xmm5
+    lea       eax, [eax + 2 * edi]
+    movdqa    xmm5, xmm4
+    movq      xmm6, qword ptr [eax]
+    palignr   xmm5, xmm5, 8
+    movq      xmm7, qword ptr [eax + edi]
+    punpcklbw xmm6, xmm7
+    mov       eax, ebp
+    movdqa    xmm7, xmm6
+    palignr   xmm7, xmm7, 8
+    // Second round of bit swap.
+    punpcklwd xmm0, xmm2
+    punpcklwd xmm1, xmm3
+    movdqa    xmm2, xmm0
+    movdqa    xmm3, xmm1
+    palignr   xmm2, xmm2, 8
+    palignr   xmm3, xmm3, 8
+    punpcklwd xmm4, xmm6
+    punpcklwd xmm5, xmm7
+    movdqa    xmm6, xmm4
+    movdqa    xmm7, xmm5
+    palignr   xmm6, xmm6, 8
+    palignr   xmm7, xmm7, 8
+    // Third round of bit swap.
+    // Write to the destination pointer.
+    punpckldq xmm0, xmm4
+    movq      qword ptr [edx], xmm0
+    movdqa    xmm4, xmm0
+    palignr   xmm4, xmm4, 8
+    movq      qword ptr [edx + esi], xmm4
+    lea       edx, [edx + 2 * esi]
+    punpckldq xmm2, xmm6
+    movdqa    xmm6, xmm2
+    palignr   xmm6, xmm6, 8
+    movq      qword ptr [edx], xmm2
+    punpckldq xmm1, xmm5
+    movq      qword ptr [edx + esi], xmm6
+    lea       edx, [edx + 2 * esi]
+    movdqa    xmm5, xmm1
+    movq      qword ptr [edx], xmm1
+    palignr   xmm5, xmm5, 8
+    punpckldq xmm3, xmm7
+    movq      qword ptr [edx + esi], xmm5
+    lea       edx, [edx + 2 * esi]
+    movq      qword ptr [edx], xmm3
+    movdqa    xmm7, xmm3
+    palignr   xmm7, xmm7, 8
+    sub       ecx, 8
+    movq      qword ptr [edx + esi], xmm7
+    lea       edx, [edx + 2 * esi]
+    jg        convertloop
+
+    pop       ebp
+    pop       esi
+    pop       edi
+    ret
+  }
+}
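+
+// Illustration of the three interleave rounds above: punpcklbw merges two
+// rows at byte granularity, punpcklwd at 16-bit granularity, punpckldq at
+// 32-bit granularity. For two 4-byte rows A = a0 a1 a2 a3 and
+// B = b0 b1 b2 b3:
+//   punpcklbw A, B  ->  a0 b0 a1 b1 a2 b2 a3 b3
+// Repeating the interleave at successively wider granularities across all
+// 8 rows leaves each qword holding one source column, which the movq
+// stores then write out as the rows of the 8x8 transpose.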
+
+#define HAS_TRANSPOSE_UVWX8_SSE2
+__declspec(naked) __declspec(align(16))
+static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
+                                uint8* dst_a, int dst_stride_a,
+                                uint8* dst_b, int dst_stride_b,
+                                int w) {
+  __asm {
+    push      ebx
+    push      esi
+    push      edi
+    push      ebp
+    mov       eax, [esp + 16 + 4]   // src
+    mov       edi, [esp + 16 + 8]   // src_stride
+    mov       edx, [esp + 16 + 12]  // dst_a
+    mov       esi, [esp + 16 + 16]  // dst_stride_a
+    mov       ebx, [esp + 16 + 20]  // dst_b
+    mov       ebp, [esp + 16 + 24]  // dst_stride_b
+    mov       ecx, esp
+    sub       esp, 4 + 16
+    and       esp, ~15
+    mov       [esp + 16], ecx
+    mov       ecx, [ecx + 16 + 28]  // w
+
+    align      4
+ convertloop:
+    // Read in the data from the source pointer.
+    // First round of bit swap.
+    movdqa    xmm0, [eax]
+    movdqa    xmm1, [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    movdqa    xmm7, xmm0  // use xmm7 as temp register.
+    punpcklbw xmm0, xmm1
+    punpckhbw xmm7, xmm1
+    movdqa    xmm1, xmm7
+    movdqa    xmm2, [eax]
+    movdqa    xmm3, [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    movdqa    xmm7, xmm2
+    punpcklbw xmm2, xmm3
+    punpckhbw xmm7, xmm3
+    movdqa    xmm3, xmm7
+    movdqa    xmm4, [eax]
+    movdqa    xmm5, [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    movdqa    xmm7, xmm4
+    punpcklbw xmm4, xmm5
+    punpckhbw xmm7, xmm5
+    movdqa    xmm5, xmm7
+    movdqa    xmm6, [eax]
+    movdqa    xmm7, [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    movdqa    [esp], xmm5  // backup xmm5
+    neg       edi
+    movdqa    xmm5, xmm6   // use xmm5 as temp register.
+    punpcklbw xmm6, xmm7
+    punpckhbw xmm5, xmm7
+    movdqa    xmm7, xmm5
+    lea       eax, [eax + 8 * edi + 16]
+    neg       edi
+    // Second round of bit swap.
+    movdqa    xmm5, xmm0
+    punpcklwd xmm0, xmm2
+    punpckhwd xmm5, xmm2
+    movdqa    xmm2, xmm5
+    movdqa    xmm5, xmm1
+    punpcklwd xmm1, xmm3
+    punpckhwd xmm5, xmm3
+    movdqa    xmm3, xmm5
+    movdqa    xmm5, xmm4
+    punpcklwd xmm4, xmm6
+    punpckhwd xmm5, xmm6
+    movdqa    xmm6, xmm5
+    movdqa    xmm5, [esp]  // restore xmm5
+    movdqa    [esp], xmm6  // backup xmm6
+    movdqa    xmm6, xmm5    // use xmm6 as temp register.
+    punpcklwd xmm5, xmm7
+    punpckhwd xmm6, xmm7
+    movdqa    xmm7, xmm6
+    // Third round of bit swap.
+    // Write to the destination pointer.
+    movdqa    xmm6, xmm0
+    punpckldq xmm0, xmm4
+    punpckhdq xmm6, xmm4
+    movdqa    xmm4, xmm6
+    movdqa    xmm6, [esp]  // restore xmm6
+    movlpd    qword ptr [edx], xmm0
+    movhpd    qword ptr [ebx], xmm0
+    movlpd    qword ptr [edx + esi], xmm4
+    lea       edx, [edx + 2 * esi]
+    movhpd    qword ptr [ebx + ebp], xmm4
+    lea       ebx, [ebx + 2 * ebp]
+    movdqa    xmm0, xmm2   // use xmm0 as the temp register.
+    punpckldq xmm2, xmm6
+    movlpd    qword ptr [edx], xmm2
+    movhpd    qword ptr [ebx], xmm2
+    punpckhdq xmm0, xmm6
+    movlpd    qword ptr [edx + esi], xmm0
+    lea       edx, [edx + 2 * esi]
+    movhpd    qword ptr [ebx + ebp], xmm0
+    lea       ebx, [ebx + 2 * ebp]
+    movdqa    xmm0, xmm1   // use xmm0 as the temp register.
+    punpckldq xmm1, xmm5
+    movlpd    qword ptr [edx], xmm1
+    movhpd    qword ptr [ebx], xmm1
+    punpckhdq xmm0, xmm5
+    movlpd    qword ptr [edx + esi], xmm0
+    lea       edx, [edx + 2 * esi]
+    movhpd    qword ptr [ebx + ebp], xmm0
+    lea       ebx, [ebx + 2 * ebp]
+    movdqa    xmm0, xmm3   // use xmm0 as the temp register.
+    punpckldq xmm3, xmm7
+    movlpd    qword ptr [edx], xmm3
+    movhpd    qword ptr [ebx], xmm3
+    punpckhdq xmm0, xmm7
+    sub       ecx, 8
+    movlpd    qword ptr [edx + esi], xmm0
+    lea       edx, [edx + 2 * esi]
+    movhpd    qword ptr [ebx + ebp], xmm0
+    lea       ebx, [ebx + 2 * ebp]
+    jg        convertloop
+
+    mov       esp, [esp + 16]
+    pop       ebp
+    pop       edi
+    pop       esi
+    pop       ebx
+    ret
+  }
+}
+#elif !defined(LIBYUV_DISABLE_X86) && \
+    (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
+#define HAS_TRANSPOSE_WX8_SSSE3
+static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
+                               uint8* dst, int dst_stride, int width) {
+  asm volatile (
+    // Read in the data from the source pointer.
+    // First round of bit swap.
+    ".p2align  2                                 \n"
+  "1:                                            \n"
+    "movq       (%0),%%xmm0                      \n"
+    "movq       (%0,%3),%%xmm1                   \n"
+    "lea        (%0,%3,2),%0                     \n"
+    "punpcklbw  %%xmm1,%%xmm0                    \n"
+    "movq       (%0),%%xmm2                      \n"
+    "movdqa     %%xmm0,%%xmm1                    \n"
+    "palignr    $0x8,%%xmm1,%%xmm1               \n"
+    "movq       (%0,%3),%%xmm3                   \n"
+    "lea        (%0,%3,2),%0                     \n"
+    "punpcklbw  %%xmm3,%%xmm2                    \n"
+    "movdqa     %%xmm2,%%xmm3                    \n"
+    "movq       (%0),%%xmm4                      \n"
+    "palignr    $0x8,%%xmm3,%%xmm3               \n"
+    "movq       (%0,%3),%%xmm5                   \n"
+    "lea        (%0,%3,2),%0                     \n"
+    "punpcklbw  %%xmm5,%%xmm4                    \n"
+    "movdqa     %%xmm4,%%xmm5                    \n"
+    "movq       (%0),%%xmm6                      \n"
+    "palignr    $0x8,%%xmm5,%%xmm5               \n"
+    "movq       (%0,%3),%%xmm7                   \n"
+    "lea        (%0,%3,2),%0                     \n"
+    "punpcklbw  %%xmm7,%%xmm6                    \n"
+    "neg        %3                               \n"
+    "movdqa     %%xmm6,%%xmm7                    \n"
+    "lea        0x8(%0,%3,8),%0                  \n"
+    "palignr    $0x8,%%xmm7,%%xmm7               \n"
+    "neg        %3                               \n"
+     // Second round of bit swap.
+    "punpcklwd  %%xmm2,%%xmm0                    \n"
+    "punpcklwd  %%xmm3,%%xmm1                    \n"
+    "movdqa     %%xmm0,%%xmm2                    \n"
+    "movdqa     %%xmm1,%%xmm3                    \n"
+    "palignr    $0x8,%%xmm2,%%xmm2               \n"
+    "palignr    $0x8,%%xmm3,%%xmm3               \n"
+    "punpcklwd  %%xmm6,%%xmm4                    \n"
+    "punpcklwd  %%xmm7,%%xmm5                    \n"
+    "movdqa     %%xmm4,%%xmm6                    \n"
+    "movdqa     %%xmm5,%%xmm7                    \n"
+    "palignr    $0x8,%%xmm6,%%xmm6               \n"
+    "palignr    $0x8,%%xmm7,%%xmm7               \n"
+    // Third round of bit swap.
+    // Write to the destination pointer.
+    "punpckldq  %%xmm4,%%xmm0                    \n"
+    "movq       %%xmm0,(%1)                      \n"
+    "movdqa     %%xmm0,%%xmm4                    \n"
+    "palignr    $0x8,%%xmm4,%%xmm4               \n"
+    "movq       %%xmm4,(%1,%4)                   \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "punpckldq  %%xmm6,%%xmm2                    \n"
+    "movdqa     %%xmm2,%%xmm6                    \n"
+    "movq       %%xmm2,(%1)                      \n"
+    "palignr    $0x8,%%xmm6,%%xmm6               \n"
+    "punpckldq  %%xmm5,%%xmm1                    \n"
+    "movq       %%xmm6,(%1,%4)                   \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "movdqa     %%xmm1,%%xmm5                    \n"
+    "movq       %%xmm1,(%1)                      \n"
+    "palignr    $0x8,%%xmm5,%%xmm5               \n"
+    "movq       %%xmm5,(%1,%4)                   \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "punpckldq  %%xmm7,%%xmm3                    \n"
+    "movq       %%xmm3,(%1)                      \n"
+    "movdqa     %%xmm3,%%xmm7                    \n"
+    "palignr    $0x8,%%xmm7,%%xmm7               \n"
+    "sub        $0x8,%2                          \n"
+    "movq       %%xmm7,(%1,%4)                   \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "jg         1b                               \n"
+    : "+r"(src),    // %0
+      "+r"(dst),    // %1
+      "+r"(width)   // %2
+    : "r"((intptr_t)(src_stride)),  // %3
+      "r"((intptr_t)(dst_stride))   // %4
+    : "memory", "cc"
+  #if defined(__SSE2__)
+      , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  #endif
+  );
+}
+
+#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__)
+#define HAS_TRANSPOSE_UVWX8_SSE2
+void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
+                         uint8* dst_a, int dst_stride_a,
+                         uint8* dst_b, int dst_stride_b,
+                         int w);
+  asm (
+    DECLARE_FUNCTION(TransposeUVWx8_SSE2)
+    "push   %ebx                               \n"
+    "push   %esi                               \n"
+    "push   %edi                               \n"
+    "push   %ebp                               \n"
+    "mov    0x14(%esp),%eax                    \n"
+    "mov    0x18(%esp),%edi                    \n"
+    "mov    0x1c(%esp),%edx                    \n"
+    "mov    0x20(%esp),%esi                    \n"
+    "mov    0x24(%esp),%ebx                    \n"
+    "mov    0x28(%esp),%ebp                    \n"
+    "mov    %esp,%ecx                          \n"
+    "sub    $0x14,%esp                         \n"
+    "and    $0xfffffff0,%esp                   \n"
+    "mov    %ecx,0x10(%esp)                    \n"
+    "mov    0x2c(%ecx),%ecx                    \n"
+
+"1:                                            \n"
+    "movdqa (%eax),%xmm0                       \n"
+    "movdqa (%eax,%edi,1),%xmm1                \n"
+    "lea    (%eax,%edi,2),%eax                 \n"
+    "movdqa %xmm0,%xmm7                        \n"
+    "punpcklbw %xmm1,%xmm0                     \n"
+    "punpckhbw %xmm1,%xmm7                     \n"
+    "movdqa %xmm7,%xmm1                        \n"
+    "movdqa (%eax),%xmm2                       \n"
+    "movdqa (%eax,%edi,1),%xmm3                \n"
+    "lea    (%eax,%edi,2),%eax                 \n"
+    "movdqa %xmm2,%xmm7                        \n"
+    "punpcklbw %xmm3,%xmm2                     \n"
+    "punpckhbw %xmm3,%xmm7                     \n"
+    "movdqa %xmm7,%xmm3                        \n"
+    "movdqa (%eax),%xmm4                       \n"
+    "movdqa (%eax,%edi,1),%xmm5                \n"
+    "lea    (%eax,%edi,2),%eax                 \n"
+    "movdqa %xmm4,%xmm7                        \n"
+    "punpcklbw %xmm5,%xmm4                     \n"
+    "punpckhbw %xmm5,%xmm7                     \n"
+    "movdqa %xmm7,%xmm5                        \n"
+    "movdqa (%eax),%xmm6                       \n"
+    "movdqa (%eax,%edi,1),%xmm7                \n"
+    "lea    (%eax,%edi,2),%eax                 \n"
+    "movdqa %xmm5,(%esp)                       \n"
+    "neg    %edi                               \n"
+    "movdqa %xmm6,%xmm5                        \n"
+    "punpcklbw %xmm7,%xmm6                     \n"
+    "punpckhbw %xmm7,%xmm5                     \n"
+    "movdqa %xmm5,%xmm7                        \n"
+    "lea    0x10(%eax,%edi,8),%eax             \n"
+    "neg    %edi                               \n"
+    "movdqa %xmm0,%xmm5                        \n"
+    "punpcklwd %xmm2,%xmm0                     \n"
+    "punpckhwd %xmm2,%xmm5                     \n"
+    "movdqa %xmm5,%xmm2                        \n"
+    "movdqa %xmm1,%xmm5                        \n"
+    "punpcklwd %xmm3,%xmm1                     \n"
+    "punpckhwd %xmm3,%xmm5                     \n"
+    "movdqa %xmm5,%xmm3                        \n"
+    "movdqa %xmm4,%xmm5                        \n"
+    "punpcklwd %xmm6,%xmm4                     \n"
+    "punpckhwd %xmm6,%xmm5                     \n"
+    "movdqa %xmm5,%xmm6                        \n"
+    "movdqa (%esp),%xmm5                       \n"
+    "movdqa %xmm6,(%esp)                       \n"
+    "movdqa %xmm5,%xmm6                        \n"
+    "punpcklwd %xmm7,%xmm5                     \n"
+    "punpckhwd %xmm7,%xmm6                     \n"
+    "movdqa %xmm6,%xmm7                        \n"
+    "movdqa %xmm0,%xmm6                        \n"
+    "punpckldq %xmm4,%xmm0                     \n"
+    "punpckhdq %xmm4,%xmm6                     \n"
+    "movdqa %xmm6,%xmm4                        \n"
+    "movdqa (%esp),%xmm6                       \n"
+    "movlpd %xmm0,(%edx)                       \n"
+    "movhpd %xmm0,(%ebx)                       \n"
+    "movlpd %xmm4,(%edx,%esi,1)                \n"
+    "lea    (%edx,%esi,2),%edx                 \n"
+    "movhpd %xmm4,(%ebx,%ebp,1)                \n"
+    "lea    (%ebx,%ebp,2),%ebx                 \n"
+    "movdqa %xmm2,%xmm0                        \n"
+    "punpckldq %xmm6,%xmm2                     \n"
+    "movlpd %xmm2,(%edx)                       \n"
+    "movhpd %xmm2,(%ebx)                       \n"
+    "punpckhdq %xmm6,%xmm0                     \n"
+    "movlpd %xmm0,(%edx,%esi,1)                \n"
+    "lea    (%edx,%esi,2),%edx                 \n"
+    "movhpd %xmm0,(%ebx,%ebp,1)                \n"
+    "lea    (%ebx,%ebp,2),%ebx                 \n"
+    "movdqa %xmm1,%xmm0                        \n"
+    "punpckldq %xmm5,%xmm1                     \n"
+    "movlpd %xmm1,(%edx)                       \n"
+    "movhpd %xmm1,(%ebx)                       \n"
+    "punpckhdq %xmm5,%xmm0                     \n"
+    "movlpd %xmm0,(%edx,%esi,1)                \n"
+    "lea    (%edx,%esi,2),%edx                 \n"
+    "movhpd %xmm0,(%ebx,%ebp,1)                \n"
+    "lea    (%ebx,%ebp,2),%ebx                 \n"
+    "movdqa %xmm3,%xmm0                        \n"
+    "punpckldq %xmm7,%xmm3                     \n"
+    "movlpd %xmm3,(%edx)                       \n"
+    "movhpd %xmm3,(%ebx)                       \n"
+    "punpckhdq %xmm7,%xmm0                     \n"
+    "sub    $0x8,%ecx                          \n"
+    "movlpd %xmm0,(%edx,%esi,1)                \n"
+    "lea    (%edx,%esi,2),%edx                 \n"
+    "movhpd %xmm0,(%ebx,%ebp,1)                \n"
+    "lea    (%ebx,%ebp,2),%ebx                 \n"
+    "jg     1b                                 \n"
+    "mov    0x10(%esp),%esp                    \n"
+    "pop    %ebp                               \n"
+    "pop    %edi                               \n"
+    "pop    %esi                               \n"
+    "pop    %ebx                               \n"
+#if defined(__native_client__)
+    "pop    %ecx                               \n"
+    "and    $0xffffffe0,%ecx                   \n"
+    "jmp    *%ecx                              \n"
+#else
+    "ret                                       \n"
+#endif
+);
+#elif !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
+    defined(__x86_64__)
+// The 64-bit version has enough registers to do 16x8 to 8x16 at a time.
+#define HAS_TRANSPOSE_WX8_FAST_SSSE3
+static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
+                                    uint8* dst, int dst_stride, int width) {
+  asm volatile (
+  // Read in the data from the source pointer.
+  // First round of bit swap.
+  ".p2align  2                                 \n"
+"1:                                            \n"
+  "movdqa     (%0),%%xmm0                      \n"
+  "movdqa     (%0,%3),%%xmm1                   \n"
+  "lea        (%0,%3,2),%0                     \n"
+  "movdqa     %%xmm0,%%xmm8                    \n"
+  "punpcklbw  %%xmm1,%%xmm0                    \n"
+  "punpckhbw  %%xmm1,%%xmm8                    \n"
+  "movdqa     (%0),%%xmm2                      \n"
+  "movdqa     %%xmm0,%%xmm1                    \n"
+  "movdqa     %%xmm8,%%xmm9                    \n"
+  "palignr    $0x8,%%xmm1,%%xmm1               \n"
+  "palignr    $0x8,%%xmm9,%%xmm9               \n"
+  "movdqa     (%0,%3),%%xmm3                   \n"
+  "lea        (%0,%3,2),%0                     \n"
+  "movdqa     %%xmm2,%%xmm10                   \n"
+  "punpcklbw  %%xmm3,%%xmm2                    \n"
+  "punpckhbw  %%xmm3,%%xmm10                   \n"
+  "movdqa     %%xmm2,%%xmm3                    \n"
+  "movdqa     %%xmm10,%%xmm11                  \n"
+  "movdqa     (%0),%%xmm4                      \n"
+  "palignr    $0x8,%%xmm3,%%xmm3               \n"
+  "palignr    $0x8,%%xmm11,%%xmm11             \n"
+  "movdqa     (%0,%3),%%xmm5                   \n"
+  "lea        (%0,%3,2),%0                     \n"
+  "movdqa     %%xmm4,%%xmm12                   \n"
+  "punpcklbw  %%xmm5,%%xmm4                    \n"
+  "punpckhbw  %%xmm5,%%xmm12                   \n"
+  "movdqa     %%xmm4,%%xmm5                    \n"
+  "movdqa     %%xmm12,%%xmm13                  \n"
+  "movdqa     (%0),%%xmm6                      \n"
+  "palignr    $0x8,%%xmm5,%%xmm5               \n"
+  "palignr    $0x8,%%xmm13,%%xmm13             \n"
+  "movdqa     (%0,%3),%%xmm7                   \n"
+  "lea        (%0,%3,2),%0                     \n"
+  "movdqa     %%xmm6,%%xmm14                   \n"
+  "punpcklbw  %%xmm7,%%xmm6                    \n"
+  "punpckhbw  %%xmm7,%%xmm14                   \n"
+  "neg        %3                               \n"
+  "movdqa     %%xmm6,%%xmm7                    \n"
+  "movdqa     %%xmm14,%%xmm15                  \n"
+  "lea        0x10(%0,%3,8),%0                 \n"
+  "palignr    $0x8,%%xmm7,%%xmm7               \n"
+  "palignr    $0x8,%%xmm15,%%xmm15             \n"
+  "neg        %3                               \n"
+   // Second round of bit swap.
+  "punpcklwd  %%xmm2,%%xmm0                    \n"
+  "punpcklwd  %%xmm3,%%xmm1                    \n"
+  "movdqa     %%xmm0,%%xmm2                    \n"
+  "movdqa     %%xmm1,%%xmm3                    \n"
+  "palignr    $0x8,%%xmm2,%%xmm2               \n"
+  "palignr    $0x8,%%xmm3,%%xmm3               \n"
+  "punpcklwd  %%xmm6,%%xmm4                    \n"
+  "punpcklwd  %%xmm7,%%xmm5                    \n"
+  "movdqa     %%xmm4,%%xmm6                    \n"
+  "movdqa     %%xmm5,%%xmm7                    \n"
+  "palignr    $0x8,%%xmm6,%%xmm6               \n"
+  "palignr    $0x8,%%xmm7,%%xmm7               \n"
+  "punpcklwd  %%xmm10,%%xmm8                   \n"
+  "punpcklwd  %%xmm11,%%xmm9                   \n"
+  "movdqa     %%xmm8,%%xmm10                   \n"
+  "movdqa     %%xmm9,%%xmm11                   \n"
+  "palignr    $0x8,%%xmm10,%%xmm10             \n"
+  "palignr    $0x8,%%xmm11,%%xmm11             \n"
+  "punpcklwd  %%xmm14,%%xmm12                  \n"
+  "punpcklwd  %%xmm15,%%xmm13                  \n"
+  "movdqa     %%xmm12,%%xmm14                  \n"
+  "movdqa     %%xmm13,%%xmm15                  \n"
+  "palignr    $0x8,%%xmm14,%%xmm14             \n"
+  "palignr    $0x8,%%xmm15,%%xmm15             \n"
+  // Third round of bit swap.
+  // Write to the destination pointer.
+  "punpckldq  %%xmm4,%%xmm0                    \n"
+  "movq       %%xmm0,(%1)                      \n"
+  "movdqa     %%xmm0,%%xmm4                    \n"
+  "palignr    $0x8,%%xmm4,%%xmm4               \n"
+  "movq       %%xmm4,(%1,%4)                   \n"
+  "lea        (%1,%4,2),%1                     \n"
+  "punpckldq  %%xmm6,%%xmm2                    \n"
+  "movdqa     %%xmm2,%%xmm6                    \n"
+  "movq       %%xmm2,(%1)                      \n"
+  "palignr    $0x8,%%xmm6,%%xmm6               \n"
+  "punpckldq  %%xmm5,%%xmm1                    \n"
+  "movq       %%xmm6,(%1,%4)                   \n"
+  "lea        (%1,%4,2),%1                     \n"
+  "movdqa     %%xmm1,%%xmm5                    \n"
+  "movq       %%xmm1,(%1)                      \n"
+  "palignr    $0x8,%%xmm5,%%xmm5               \n"
+  "movq       %%xmm5,(%1,%4)                   \n"
+  "lea        (%1,%4,2),%1                     \n"
+  "punpckldq  %%xmm7,%%xmm3                    \n"
+  "movq       %%xmm3,(%1)                      \n"
+  "movdqa     %%xmm3,%%xmm7                    \n"
+  "palignr    $0x8,%%xmm7,%%xmm7               \n"
+  "movq       %%xmm7,(%1,%4)                   \n"
+  "lea        (%1,%4,2),%1                     \n"
+  "punpckldq  %%xmm12,%%xmm8                   \n"
+  "movq       %%xmm8,(%1)                      \n"
+  "movdqa     %%xmm8,%%xmm12                   \n"
+  "palignr    $0x8,%%xmm12,%%xmm12             \n"
+  "movq       %%xmm12,(%1,%4)                  \n"
+  "lea        (%1,%4,2),%1                     \n"
+  "punpckldq  %%xmm14,%%xmm10                  \n"
+  "movdqa     %%xmm10,%%xmm14                  \n"
+  "movq       %%xmm10,(%1)                     \n"
+  "palignr    $0x8,%%xmm14,%%xmm14             \n"
+  "punpckldq  %%xmm13,%%xmm9                   \n"
+  "movq       %%xmm14,(%1,%4)                  \n"
+  "lea        (%1,%4,2),%1                     \n"
+  "movdqa     %%xmm9,%%xmm13                   \n"
+  "movq       %%xmm9,(%1)                      \n"
+  "palignr    $0x8,%%xmm13,%%xmm13             \n"
+  "movq       %%xmm13,(%1,%4)                  \n"
+  "lea        (%1,%4,2),%1                     \n"
+  "punpckldq  %%xmm15,%%xmm11                  \n"
+  "movq       %%xmm11,(%1)                     \n"
+  "movdqa     %%xmm11,%%xmm15                  \n"
+  "palignr    $0x8,%%xmm15,%%xmm15             \n"
+  "sub        $0x10,%2                         \n"
+  "movq       %%xmm15,(%1,%4)                  \n"
+  "lea        (%1,%4,2),%1                     \n"
+  "jg         1b                               \n"
+  : "+r"(src),    // %0
+    "+r"(dst),    // %1
+    "+r"(width)   // %2
+  : "r"((intptr_t)(src_stride)),  // %3
+    "r"((intptr_t)(dst_stride))   // %4
+  : "memory", "cc",
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
+    "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",  "xmm14",  "xmm15"
+);
+}
+
+#define HAS_TRANSPOSE_UVWX8_SSE2
+static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
+                                uint8* dst_a, int dst_stride_a,
+                                uint8* dst_b, int dst_stride_b,
+                                int w) {
+  asm volatile (
+  // Read in the data from the source pointer.
+  // First round of bit swap.
+  ".p2align  2                                 \n"
+"1:                                            \n"
+  "movdqa     (%0),%%xmm0                      \n"
+  "movdqa     (%0,%4),%%xmm1                   \n"
+  "lea        (%0,%4,2),%0                     \n"
+  "movdqa     %%xmm0,%%xmm8                    \n"
+  "punpcklbw  %%xmm1,%%xmm0                    \n"
+  "punpckhbw  %%xmm1,%%xmm8                    \n"
+  "movdqa     %%xmm8,%%xmm1                    \n"
+  "movdqa     (%0),%%xmm2                      \n"
+  "movdqa     (%0,%4),%%xmm3                   \n"
+  "lea        (%0,%4,2),%0                     \n"
+  "movdqa     %%xmm2,%%xmm8                    \n"
+  "punpcklbw  %%xmm3,%%xmm2                    \n"
+  "punpckhbw  %%xmm3,%%xmm8                    \n"
+  "movdqa     %%xmm8,%%xmm3                    \n"
+  "movdqa     (%0),%%xmm4                      \n"
+  "movdqa     (%0,%4),%%xmm5                   \n"
+  "lea        (%0,%4,2),%0                     \n"
+  "movdqa     %%xmm4,%%xmm8                    \n"
+  "punpcklbw  %%xmm5,%%xmm4                    \n"
+  "punpckhbw  %%xmm5,%%xmm8                    \n"
+  "movdqa     %%xmm8,%%xmm5                    \n"
+  "movdqa     (%0),%%xmm6                      \n"
+  "movdqa     (%0,%4),%%xmm7                   \n"
+  "lea        (%0,%4,2),%0                     \n"
+  "movdqa     %%xmm6,%%xmm8                    \n"
+  "punpcklbw  %%xmm7,%%xmm6                    \n"
+  "neg        %4                               \n"
+  "lea        0x10(%0,%4,8),%0                 \n"
+  "punpckhbw  %%xmm7,%%xmm8                    \n"
+  "movdqa     %%xmm8,%%xmm7                    \n"
+  "neg        %4                               \n"
+   // Second round of bit swap.
+  "movdqa     %%xmm0,%%xmm8                    \n"
+  "movdqa     %%xmm1,%%xmm9                    \n"
+  "punpckhwd  %%xmm2,%%xmm8                    \n"
+  "punpckhwd  %%xmm3,%%xmm9                    \n"
+  "punpcklwd  %%xmm2,%%xmm0                    \n"
+  "punpcklwd  %%xmm3,%%xmm1                    \n"
+  "movdqa     %%xmm8,%%xmm2                    \n"
+  "movdqa     %%xmm9,%%xmm3                    \n"
+  "movdqa     %%xmm4,%%xmm8                    \n"
+  "movdqa     %%xmm5,%%xmm9                    \n"
+  "punpckhwd  %%xmm6,%%xmm8                    \n"
+  "punpckhwd  %%xmm7,%%xmm9                    \n"
+  "punpcklwd  %%xmm6,%%xmm4                    \n"
+  "punpcklwd  %%xmm7,%%xmm5                    \n"
+  "movdqa     %%xmm8,%%xmm6                    \n"
+  "movdqa     %%xmm9,%%xmm7                    \n"
+  // Third round of bit swap.
+  // Write to the destination pointer.
+  "movdqa     %%xmm0,%%xmm8                    \n"
+  "punpckldq  %%xmm4,%%xmm0                    \n"
+  "movlpd     %%xmm0,(%1)                      \n"  // Write back U channel
+  "movhpd     %%xmm0,(%2)                      \n"  // Write back V channel
+  "punpckhdq  %%xmm4,%%xmm8                    \n"
+  "movlpd     %%xmm8,(%1,%5)                   \n"
+  "lea        (%1,%5,2),%1                     \n"
+  "movhpd     %%xmm8,(%2,%6)                   \n"
+  "lea        (%2,%6,2),%2                     \n"
+  "movdqa     %%xmm2,%%xmm8                    \n"
+  "punpckldq  %%xmm6,%%xmm2                    \n"
+  "movlpd     %%xmm2,(%1)                      \n"
+  "movhpd     %%xmm2,(%2)                      \n"
+  "punpckhdq  %%xmm6,%%xmm8                    \n"
+  "movlpd     %%xmm8,(%1,%5)                   \n"
+  "lea        (%1,%5,2),%1                     \n"
+  "movhpd     %%xmm8,(%2,%6)                   \n"
+  "lea        (%2,%6,2),%2                     \n"
+  "movdqa     %%xmm1,%%xmm8                    \n"
+  "punpckldq  %%xmm5,%%xmm1                    \n"
+  "movlpd     %%xmm1,(%1)                      \n"
+  "movhpd     %%xmm1,(%2)                      \n"
+  "punpckhdq  %%xmm5,%%xmm8                    \n"
+  "movlpd     %%xmm8,(%1,%5)                   \n"
+  "lea        (%1,%5,2),%1                     \n"
+  "movhpd     %%xmm8,(%2,%6)                   \n"
+  "lea        (%2,%6,2),%2                     \n"
+  "movdqa     %%xmm3,%%xmm8                    \n"
+  "punpckldq  %%xmm7,%%xmm3                    \n"
+  "movlpd     %%xmm3,(%1)                      \n"
+  "movhpd     %%xmm3,(%2)                      \n"
+  "punpckhdq  %%xmm7,%%xmm8                    \n"
+  "sub        $0x8,%3                          \n"
+  "movlpd     %%xmm8,(%1,%5)                   \n"
+  "lea        (%1,%5,2),%1                     \n"
+  "movhpd     %%xmm8,(%2,%6)                   \n"
+  "lea        (%2,%6,2),%2                     \n"
+  "jg         1b                               \n"
+  : "+r"(src),    // %0
+    "+r"(dst_a),  // %1
+    "+r"(dst_b),  // %2
+    "+r"(w)   // %3
+  : "r"((intptr_t)(src_stride)),    // %4
+    "r"((intptr_t)(dst_stride_a)),  // %5
+    "r"((intptr_t)(dst_stride_b))   // %6
+  : "memory", "cc",
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
+    "xmm8", "xmm9"
+);
+}
+#endif  // defined(__i386__)
+#endif  // !defined(LIBYUV_DISABLE_X86)
+
+static void TransposeWx8_C(const uint8* src, int src_stride,
+                           uint8* dst, int dst_stride,
+                           int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    dst[0] = src[0 * src_stride];
+    dst[1] = src[1 * src_stride];
+    dst[2] = src[2 * src_stride];
+    dst[3] = src[3 * src_stride];
+    dst[4] = src[4 * src_stride];
+    dst[5] = src[5 * src_stride];
+    dst[6] = src[6 * src_stride];
+    dst[7] = src[7 * src_stride];
+    ++src;
+    dst += dst_stride;
+  }
+}
+
+static void TransposeWxH_C(const uint8* src, int src_stride,
+                           uint8* dst, int dst_stride,
+                           int width, int height) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    int j;
+    for (j = 0; j < height; ++j) {
+      dst[i * dst_stride + j] = src[j * src_stride + i];
+    }
+  }
+}
+
+LIBYUV_API
+void TransposePlane(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride,
+                    int width, int height) {
+  int i = height;
+  void (*TransposeWx8)(const uint8* src, int src_stride,
+                       uint8* dst, int dst_stride,
+                       int width) = TransposeWx8_C;
+#if defined(HAS_TRANSPOSE_WX8_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    TransposeWx8 = TransposeWx8_NEON;
+  }
+#endif
+#if defined(HAS_TRANSPOSE_WX8_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
+    TransposeWx8 = TransposeWx8_SSSE3;
+  }
+#endif
+#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) &&
+      IS_ALIGNED(width, 16) &&
+      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
+    TransposeWx8 = TransposeWx8_FAST_SSSE3;
+  }
+#endif
+#if defined(HAS_TRANSPOSE_WX8_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
+    if (IS_ALIGNED(width, 4) &&
+        IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
+      TransposeWx8 = TransposeWx8_FAST_MIPS_DSPR2;
+    } else {
+      TransposeWx8 = TransposeWx8_MIPS_DSPR2;
+    }
+  }
+#endif
+
+  // Work across the source in 8x8 tiles.
+  while (i >= 8) {
+    TransposeWx8(src, src_stride, dst, dst_stride, width);
+    src += 8 * src_stride;    // Go down 8 rows.
+    dst += 8;                 // Move over 8 columns.
+    i -= 8;
+  }
+
+  // Transpose any remaining rows (fewer than 8).
+  TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
+}
+
+LIBYUV_API
+void RotatePlane90(const uint8* src, int src_stride,
+                   uint8* dst, int dst_stride,
+                   int width, int height) {
+  // Rotate by 90 is a transpose with the source read
+  // from bottom to top. So set the source pointer to the end
+  // of the buffer and flip the sign of the source stride.
+  src += src_stride * (height - 1);
+  src_stride = -src_stride;
+  TransposePlane(src, src_stride, dst, dst_stride, width, height);
+}
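+
+// Worked example on a 2x2 plane:
+//   src = | a b |   ->   dst = | c a |
+//         | c d |              | d b |
+// Reading the source rows bottom-to-top and transposing rotates the plane
+// 90 degrees clockwise.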
+
+LIBYUV_API
+void RotatePlane270(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride,
+                    int width, int height) {
+  // Rotate by 270 is a transpose with the destination written
+  // from bottom to top. So set the destination pointer to the end
+  // of the buffer and flip the sign of the destination stride.
+  dst += dst_stride * (width - 1);
+  dst_stride = -dst_stride;
+  TransposePlane(src, src_stride, dst, dst_stride, width, height);
+}
+
+LIBYUV_API
+void RotatePlane180(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride,
+                    int width, int height) {
+  // Swap first and last rows and mirror the content. Uses a temporary row.
+  align_buffer_64(row, width);
+  const uint8* src_bot = src + src_stride * (height - 1);
+  uint8* dst_bot = dst + dst_stride * (height - 1);
+  int half_height = (height + 1) >> 1;
+  int y;
+  void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
+  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+#if defined(HAS_MIRRORROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
+    MirrorRow = MirrorRow_NEON;
+  }
+#endif
+#if defined(HAS_MIRRORROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
+      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
+      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+    MirrorRow = MirrorRow_SSE2;
+  }
+#endif
+#if defined(HAS_MIRRORROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
+      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
+      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+    MirrorRow = MirrorRow_SSSE3;
+  }
+#endif
+#if defined(HAS_MIRRORROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
+    MirrorRow = MirrorRow_AVX2;
+  }
+#endif
+#if defined(HAS_MIRRORROW_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
+      IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&
+      IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) {
+    MirrorRow = MirrorRow_MIPS_DSPR2;
+  }
+#endif
+#if defined(HAS_COPYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
+    CopyRow = CopyRow_NEON;
+  }
+#endif
+#if defined(HAS_COPYROW_X86)
+  if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
+    CopyRow = CopyRow_X86;
+  }
+#endif
+#if defined(HAS_COPYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
+      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
+      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+    CopyRow = CopyRow_SSE2;
+  }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+  if (TestCpuFlag(kCpuHasERMS)) {
+    CopyRow = CopyRow_ERMS;
+  }
+#endif
+#if defined(HAS_COPYROW_MIPS)
+  if (TestCpuFlag(kCpuHasMIPS)) {
+    CopyRow = CopyRow_MIPS;
+  }
+#endif
+
+  // Odd height will harmlessly mirror the middle row twice.
+  for (y = 0; y < half_height; ++y) {
+    MirrorRow(src, row, width);  // Mirror first row into a buffer
+    src += src_stride;
+    MirrorRow(src_bot, dst, width);  // Mirror last row into first row
+    dst += dst_stride;
+    CopyRow(row, dst_bot, width);  // Copy first mirrored row into last
+    src_bot -= src_stride;
+    dst_bot -= dst_stride;
+  }
+  free_aligned_buffer_64(row);
+}
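+
+// For reference, a minimal scalar mirror equivalent to the MirrorRow_C
+// fallback selected above. This is an illustrative sketch only (the name
+// MirrorRowSketch_C is hypothetical, not an upstream symbol), but it is
+// exactly the loop the SIMD MirrorRow variants vectorize.
+static void MirrorRowSketch_C(const uint8* src, uint8* dst, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst[x] = src[width - 1 - x];  // Reverse the byte order of the row.
+  }
+}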
+
+static void TransposeUVWx8_C(const uint8* src, int src_stride,
+                             uint8* dst_a, int dst_stride_a,
+                             uint8* dst_b, int dst_stride_b,
+                             int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    dst_a[0] = src[0 * src_stride + 0];
+    dst_b[0] = src[0 * src_stride + 1];
+    dst_a[1] = src[1 * src_stride + 0];
+    dst_b[1] = src[1 * src_stride + 1];
+    dst_a[2] = src[2 * src_stride + 0];
+    dst_b[2] = src[2 * src_stride + 1];
+    dst_a[3] = src[3 * src_stride + 0];
+    dst_b[3] = src[3 * src_stride + 1];
+    dst_a[4] = src[4 * src_stride + 0];
+    dst_b[4] = src[4 * src_stride + 1];
+    dst_a[5] = src[5 * src_stride + 0];
+    dst_b[5] = src[5 * src_stride + 1];
+    dst_a[6] = src[6 * src_stride + 0];
+    dst_b[6] = src[6 * src_stride + 1];
+    dst_a[7] = src[7 * src_stride + 0];
+    dst_b[7] = src[7 * src_stride + 1];
+    src += 2;
+    dst_a += dst_stride_a;
+    dst_b += dst_stride_b;
+  }
+}
+
+static void TransposeUVWxH_C(const uint8* src, int src_stride,
+                             uint8* dst_a, int dst_stride_a,
+                             uint8* dst_b, int dst_stride_b,
+                             int width, int height) {
+  int i;
+  for (i = 0; i < width * 2; i += 2) {
+    int j;
+    for (j = 0; j < height; ++j) {
+      dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
+      dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
+    }
+  }
+}
+
+LIBYUV_API
+void TransposeUV(const uint8* src, int src_stride,
+                 uint8* dst_a, int dst_stride_a,
+                 uint8* dst_b, int dst_stride_b,
+                 int width, int height) {
+  int i = height;
+  void (*TransposeUVWx8)(const uint8* src, int src_stride,
+                         uint8* dst_a, int dst_stride_a,
+                         uint8* dst_b, int dst_stride_b,
+                         int width) = TransposeUVWx8_C;
+#if defined(HAS_TRANSPOSE_UVWX8_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    TransposeUVWx8 = TransposeUVWx8_NEON;
+  }
+#elif defined(HAS_TRANSPOSE_UVWX8_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) &&
+      IS_ALIGNED(width, 8) &&
+      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
+    TransposeUVWx8 = TransposeUVWx8_SSE2;
+  }
+#elif defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) &&
+      IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
+    TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2;
+  }
+#endif
+
+  // Work through the source in 8x8 tiles.
+  while (i >= 8) {
+    TransposeUVWx8(src, src_stride,
+                   dst_a, dst_stride_a,
+                   dst_b, dst_stride_b,
+                   width);
+    src += 8 * src_stride;    // Go down 8 rows.
+    dst_a += 8;               // Move over 8 columns.
+    dst_b += 8;               // Move over 8 columns.
+    i -= 8;
+  }
+
+  TransposeUVWxH_C(src, src_stride,
+                   dst_a, dst_stride_a,
+                   dst_b, dst_stride_b,
+                   width, i);
+}
+
+LIBYUV_API
+void RotateUV90(const uint8* src, int src_stride,
+                uint8* dst_a, int dst_stride_a,
+                uint8* dst_b, int dst_stride_b,
+                int width, int height) {
+  src += src_stride * (height - 1);
+  src_stride = -src_stride;
+
+  TransposeUV(src, src_stride,
+              dst_a, dst_stride_a,
+              dst_b, dst_stride_b,
+              width, height);
+}
+
+LIBYUV_API
+void RotateUV270(const uint8* src, int src_stride,
+                 uint8* dst_a, int dst_stride_a,
+                 uint8* dst_b, int dst_stride_b,
+                 int width, int height) {
+  dst_a += dst_stride_a * (width - 1);
+  dst_b += dst_stride_b * (width - 1);
+  dst_stride_a = -dst_stride_a;
+  dst_stride_b = -dst_stride_b;
+
+  TransposeUV(src, src_stride,
+              dst_a, dst_stride_a,
+              dst_b, dst_stride_b,
+              width, height);
+}
+
+// Rotate 180 is a horizontal and vertical flip.
+LIBYUV_API
+void RotateUV180(const uint8* src, int src_stride,
+                 uint8* dst_a, int dst_stride_a,
+                 uint8* dst_b, int dst_stride_b,
+                 int width, int height) {
+  int i;
+  void (*MirrorRowUV)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
+      MirrorUVRow_C;
+#if defined(HAS_MIRRORUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    MirrorRowUV = MirrorUVRow_NEON;
+  }
+#elif defined(HAS_MIRRORROW_UV_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
+      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
+    MirrorRowUV = MirrorUVRow_SSSE3;
+  }
+#elif defined(HAS_MIRRORUVROW_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
+      IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
+    MirrorRowUV = MirrorUVRow_MIPS_DSPR2;
+  }
+#endif
+
+  dst_a += dst_stride_a * (height - 1);
+  dst_b += dst_stride_b * (height - 1);
+
+  for (i = 0; i < height; ++i) {
+    MirrorRowUV(src, dst_a, dst_b, width);
+    src += src_stride;
+    dst_a -= dst_stride_a;
+    dst_b -= dst_stride_b;
+  }
+}
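+
+// Worked example of the mirror above: MirrorRowUV both mirrors and
+// de-interleaves, so for a source row u0 v0 u1 v1 u2 v2 (width 3) it
+// writes dst_a = u2 u1 u0 and dst_b = v2 v1 v0.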
+
+LIBYUV_API
+int RotatePlane(const uint8* src, int src_stride,
+                uint8* dst, int dst_stride,
+                int width, int height,
+                enum RotationMode mode) {
+  if (!src || width <= 0 || height == 0 || !dst) {
+    return -1;
+  }
+
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src = src + (height - 1) * src_stride;
+    src_stride = -src_stride;
+  }
+
+  switch (mode) {
+    case kRotate0:
+      // copy frame
+      CopyPlane(src, src_stride,
+                dst, dst_stride,
+                width, height);
+      return 0;
+    case kRotate90:
+      RotatePlane90(src, src_stride,
+                    dst, dst_stride,
+                    width, height);
+      return 0;
+    case kRotate270:
+      RotatePlane270(src, src_stride,
+                     dst, dst_stride,
+                     width, height);
+      return 0;
+    case kRotate180:
+      RotatePlane180(src, src_stride,
+                     dst, dst_stride,
+                     width, height);
+      return 0;
+    default:
+      break;
+  }
+  return -1;
+}
+
+LIBYUV_API
+int I420Rotate(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height,
+               enum RotationMode mode) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
+      !dst_y || !dst_u || !dst_v) {
+    return -1;
+  }
+
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+
+  switch (mode) {
+    case kRotate0:
+      // copy frame
+      return I420Copy(src_y, src_stride_y,
+                      src_u, src_stride_u,
+                      src_v, src_stride_v,
+                      dst_y, dst_stride_y,
+                      dst_u, dst_stride_u,
+                      dst_v, dst_stride_v,
+                      width, height);
+    case kRotate90:
+      RotatePlane90(src_y, src_stride_y,
+                    dst_y, dst_stride_y,
+                    width, height);
+      RotatePlane90(src_u, src_stride_u,
+                    dst_u, dst_stride_u,
+                    halfwidth, halfheight);
+      RotatePlane90(src_v, src_stride_v,
+                    dst_v, dst_stride_v,
+                    halfwidth, halfheight);
+      return 0;
+    case kRotate270:
+      RotatePlane270(src_y, src_stride_y,
+                     dst_y, dst_stride_y,
+                     width, height);
+      RotatePlane270(src_u, src_stride_u,
+                     dst_u, dst_stride_u,
+                     halfwidth, halfheight);
+      RotatePlane270(src_v, src_stride_v,
+                     dst_v, dst_stride_v,
+                     halfwidth, halfheight);
+      return 0;
+    case kRotate180:
+      RotatePlane180(src_y, src_stride_y,
+                     dst_y, dst_stride_y,
+                     width, height);
+      RotatePlane180(src_u, src_stride_u,
+                     dst_u, dst_stride_u,
+                     halfwidth, halfheight);
+      RotatePlane180(src_v, src_stride_v,
+                     dst_v, dst_stride_v,
+                     halfwidth, halfheight);
+      return 0;
+    default:
+      break;
+  }
+  return -1;
+}
+
+LIBYUV_API
+int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
+                     const uint8* src_uv, int src_stride_uv,
+                     uint8* dst_y, int dst_stride_y,
+                     uint8* dst_u, int dst_stride_u,
+                     uint8* dst_v, int dst_stride_v,
+                     int width, int height,
+                     enum RotationMode mode) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if (!src_y || !src_uv || width <= 0 || height == 0 ||
+      !dst_y || !dst_u || !dst_v) {
+    return -1;
+  }
+
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_uv = src_uv + (halfheight - 1) * src_stride_uv;
+    src_stride_y = -src_stride_y;
+    src_stride_uv = -src_stride_uv;
+  }
+
+  switch (mode) {
+    case kRotate0:
+      // copy frame
+      return NV12ToI420(src_y, src_stride_y,
+                        src_uv, src_stride_uv,
+                        dst_y, dst_stride_y,
+                        dst_u, dst_stride_u,
+                        dst_v, dst_stride_v,
+                        width, height);
+    case kRotate90:
+      RotatePlane90(src_y, src_stride_y,
+                    dst_y, dst_stride_y,
+                    width, height);
+      RotateUV90(src_uv, src_stride_uv,
+                 dst_u, dst_stride_u,
+                 dst_v, dst_stride_v,
+                 halfwidth, halfheight);
+      return 0;
+    case kRotate270:
+      RotatePlane270(src_y, src_stride_y,
+                     dst_y, dst_stride_y,
+                     width, height);
+      RotateUV270(src_uv, src_stride_uv,
+                  dst_u, dst_stride_u,
+                  dst_v, dst_stride_v,
+                  halfwidth, halfheight);
+      return 0;
+    case kRotate180:
+      RotatePlane180(src_y, src_stride_y,
+                     dst_y, dst_stride_y,
+                     width, height);
+      RotateUV180(src_uv, src_stride_uv,
+                  dst_u, dst_stride_u,
+                  dst_v, dst_stride_v,
+                  halfwidth, halfheight);
+      return 0;
+    default:
+      break;
+  }
+  return -1;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
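+
+// Caller-side sketch (hypothetical buffers, minimal packed strides): a 90-
+// or 270-degree rotation of a width x height I420 frame yields a
+// height x width frame, so the destination strides follow the new width:
+//
+//   I420Rotate(src_y, width, src_u, (width + 1) / 2, src_v, (width + 1) / 2,
+//              dst_y, height, dst_u, (height + 1) / 2,
+//              dst_v, (height + 1) / 2, width, height, kRotate90);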
--- /dev/null
+++ b/third_party/libyuv/source/rotate_argb.cc
@@ -1,0 +1,209 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate.h"
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/convert.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// ARGBScale provides a function that copies pixels to a row, stepping
+// through the source by a constant number of pixels.
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(_M_IX86) || \
+    (defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__))
+#define HAS_SCALEARGBROWDOWNEVEN_SSE2
+void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride,
+                               int src_stepx,
+                               uint8* dst_ptr, int dst_width);
+#endif
+#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
+    (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+#define HAS_SCALEARGBROWDOWNEVEN_NEON
+void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride,
+                               int src_stepx,
+                               uint8* dst_ptr, int dst_width);
+#endif
+
+void ScaleARGBRowDownEven_C(const uint8* src_ptr, int /* src_stride */,
+                            int src_stepx,
+                            uint8* dst_ptr, int dst_width);
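+
+// Behavioral sketch of ScaleARGBRowDownEven (illustrative, modeled on the
+// C fallback): every src_stepx-th ARGB pixel of the source row is copied
+// to a packed destination row, roughly
+//   for (x = 0; x < dst_width; ++x)
+//     memcpy(dst_ptr + x * 4, src_ptr + x * src_stepx * 4, 4);  // 1 pixel
+// ARGBTranspose below reuses this with a step of one whole row of pixels,
+// which turns a column gather into a row copy.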
+
+static void ARGBTranspose(const uint8* src, int src_stride,
+                          uint8* dst, int dst_stride,
+                          int width, int height) {
+  int i;
+  int src_pixel_step = src_stride >> 2;  // Byte stride to whole-pixel step.
+  void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride,
+      int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C;
+#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4) &&  // Width of dest.
+      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+    ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
+  }
+#elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4) &&  // Width of dest.
+      IS_ALIGNED(src, 4)) {
+    ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON;
+  }
+#endif
+
+  for (i = 0; i < width; ++i) {  // column of source to row of dest.
+    ScaleARGBRowDownEven(src, 0, src_pixel_step, dst, height);
+    dst += dst_stride;
+    src += 4;
+  }
+}
+
+void ARGBRotate90(const uint8* src, int src_stride,
+                  uint8* dst, int dst_stride,
+                  int width, int height) {
+  // Rotate by 90 is an ARGBTranspose with the source read
+  // from bottom to top. So set the source pointer to the end
+  // of the buffer and flip the sign of the source stride.
+  src += src_stride * (height - 1);
+  src_stride = -src_stride;
+  ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
+}
+
+void ARGBRotate270(const uint8* src, int src_stride,
+                   uint8* dst, int dst_stride,
+                   int width, int height) {
+  // Rotate by 270 is an ARGBTranspose with the destination written
+  // from bottom to top. So set the destination pointer to the end
+  // of the buffer and flip the sign of the destination stride.
+  dst += dst_stride * (width - 1);
+  dst_stride = -dst_stride;
+  ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
+}
+
+void ARGBRotate180(const uint8* src, int src_stride,
+                   uint8* dst, int dst_stride,
+                   int width, int height) {
+  // Swap first and last rows and mirror the content. Uses a temporary row.
+  align_buffer_64(row, width * 4);
+  const uint8* src_bot = src + src_stride * (height - 1);
+  uint8* dst_bot = dst + dst_stride * (height - 1);
+  int half_height = (height + 1) >> 1;
+  int y;
+  void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
+      ARGBMirrorRow_C;
+  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+#if defined(HAS_ARGBMIRRORROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
+      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
+      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+    ARGBMirrorRow = ARGBMirrorRow_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBMIRRORROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) {
+    ARGBMirrorRow = ARGBMirrorRow_AVX2;
+  }
+#endif
+#if defined(HAS_ARGBMIRRORROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) {
+    ARGBMirrorRow = ARGBMirrorRow_NEON;
+  }
+#endif
+#if defined(HAS_COPYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width * 4, 32)) {
+    CopyRow = CopyRow_NEON;
+  }
+#endif
+#if defined(HAS_COPYROW_X86)
+  if (TestCpuFlag(kCpuHasX86)) {
+    CopyRow = CopyRow_X86;
+  }
+#endif
+#if defined(HAS_COPYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width * 4, 32) &&
+      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
+      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+    CopyRow = CopyRow_SSE2;
+  }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+  if (TestCpuFlag(kCpuHasERMS)) {
+    CopyRow = CopyRow_ERMS;
+  }
+#endif
+#if defined(HAS_COPYROW_MIPS)
+  if (TestCpuFlag(kCpuHasMIPS)) {
+    CopyRow = CopyRow_MIPS;
+  }
+#endif
+
+  // Odd height will harmlessly mirror the middle row twice.
+  for (y = 0; y < half_height; ++y) {
+    ARGBMirrorRow(src, row, width);  // Mirror first row into a buffer
+    ARGBMirrorRow(src_bot, dst, width);  // Mirror last row into first row
+    CopyRow(row, dst_bot, width * 4);  // Copy first mirrored row into last
+    src += src_stride;
+    dst += dst_stride;
+    src_bot -= src_stride;
+    dst_bot -= dst_stride;
+  }
+  free_aligned_buffer_64(row);
+}
+
+LIBYUV_API
+int ARGBRotate(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height,
+               enum RotationMode mode) {
+  if (!src_argb || width <= 0 || height == 0 || !dst_argb) {
+    return -1;
+  }
+
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+
+  switch (mode) {
+    case kRotate0:
+      // copy frame
+      return ARGBCopy(src_argb, src_stride_argb,
+                      dst_argb, dst_stride_argb,
+                      width, height);
+    case kRotate90:
+      ARGBRotate90(src_argb, src_stride_argb,
+                   dst_argb, dst_stride_argb,
+                   width, height);
+      return 0;
+    case kRotate270:
+      ARGBRotate270(src_argb, src_stride_argb,
+                    dst_argb, dst_stride_argb,
+                    width, height);
+      return 0;
+    case kRotate180:
+      ARGBRotate180(src_argb, src_stride_argb,
+                    dst_argb, dst_stride_argb,
+                    width, height);
+      return 0;
+    default:
+      break;
+  }
+  return -1;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
--- /dev/null
+++ b/third_party/libyuv/source/rotate_mips.cc
@@ -1,0 +1,485 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_MIPS) && \
+    defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
+    (_MIPS_SIM == _MIPS_SIM_ABI32)
+
+void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
+                             uint8* dst, int dst_stride,
+                             int width) {
+   __asm__ __volatile__ (
+      ".set push                                         \n"
+      ".set noreorder                                    \n"
+      "sll              $t2, %[src_stride], 0x1          \n" // src_stride x 2
+      "sll              $t4, %[src_stride], 0x2          \n" // src_stride x 4
+      "sll              $t9, %[src_stride], 0x3          \n" // src_stride x 8
+      "addu             $t3, $t2, %[src_stride]          \n"
+      "addu             $t5, $t4, %[src_stride]          \n"
+      "addu             $t6, $t2, $t4                    \n"
+      "andi             $t0, %[dst], 0x3                 \n"
+      "andi             $t1, %[dst_stride], 0x3          \n"
+      "or               $t0, $t0, $t1                    \n"
+      "bnez             $t0, 11f                         \n"
+      " subu            $t7, $t9, %[src_stride]          \n"
+// dst + dst_stride word aligned
+    "1:                                                  \n"
+      "lbu              $t0, 0(%[src])                   \n"
+      "lbux             $t1, %[src_stride](%[src])       \n"
+      "lbux             $t8, $t2(%[src])                 \n"
+      "lbux             $t9, $t3(%[src])                 \n"
+      "sll              $t1, $t1, 16                     \n"
+      "sll              $t9, $t9, 16                     \n"
+      "or               $t0, $t0, $t1                    \n"
+      "or               $t8, $t8, $t9                    \n"
+      "precr.qb.ph      $s0, $t8, $t0                    \n"
+      "lbux             $t0, $t4(%[src])                 \n"
+      "lbux             $t1, $t5(%[src])                 \n"
+      "lbux             $t8, $t6(%[src])                 \n"
+      "lbux             $t9, $t7(%[src])                 \n"
+      "sll              $t1, $t1, 16                     \n"
+      "sll              $t9, $t9, 16                     \n"
+      "or               $t0, $t0, $t1                    \n"
+      "or               $t8, $t8, $t9                    \n"
+      "precr.qb.ph      $s1, $t8, $t0                    \n"
+      "sw               $s0, 0(%[dst])                   \n"
+      "addiu            %[width], -1                     \n"
+      "addiu            %[src], 1                        \n"
+      "sw               $s1, 4(%[dst])                   \n"
+      "bnez             %[width], 1b                     \n"
+      " addu            %[dst], %[dst], %[dst_stride]    \n"
+      "b                2f                               \n"
+// dst + dst_stride unaligned
+   "11:                                                  \n"
+      "lbu              $t0, 0(%[src])                   \n"
+      "lbux             $t1, %[src_stride](%[src])       \n"
+      "lbux             $t8, $t2(%[src])                 \n"
+      "lbux             $t9, $t3(%[src])                 \n"
+      "sll              $t1, $t1, 16                     \n"
+      "sll              $t9, $t9, 16                     \n"
+      "or               $t0, $t0, $t1                    \n"
+      "or               $t8, $t8, $t9                    \n"
+      "precr.qb.ph      $s0, $t8, $t0                    \n"
+      "lbux             $t0, $t4(%[src])                 \n"
+      "lbux             $t1, $t5(%[src])                 \n"
+      "lbux             $t8, $t6(%[src])                 \n"
+      "lbux             $t9, $t7(%[src])                 \n"
+      "sll              $t1, $t1, 16                     \n"
+      "sll              $t9, $t9, 16                     \n"
+      "or               $t0, $t0, $t1                    \n"
+      "or               $t8, $t8, $t9                    \n"
+      "precr.qb.ph      $s1, $t8, $t0                    \n"
+      "swr              $s0, 0(%[dst])                   \n"
+      "swl              $s0, 3(%[dst])                   \n"
+      "addiu            %[width], -1                     \n"
+      "addiu            %[src], 1                        \n"
+      "swr              $s1, 4(%[dst])                   \n"
+      "swl              $s1, 7(%[dst])                   \n"
+      "bnez             %[width], 11b                    \n"
+       "addu             %[dst], %[dst], %[dst_stride]   \n"
+    "2:                                                  \n"
+      ".set pop                                          \n"
+      :[src] "+r" (src),
+       [dst] "+r" (dst),
+       [width] "+r" (width)
+      :[src_stride] "r" (src_stride),
+       [dst_stride] "r" (dst_stride)
+      : "t0", "t1",  "t2", "t3", "t4", "t5",
+        "t6", "t7", "t8", "t9",
+        "s0", "s1"
+  );
+}
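
As a reading aid, here is the scalar contract the kernel above implements, sketched in C (the helper name and code are ours, not libyuv's): every 8-byte source column becomes a destination row.

#include <stdint.h>

/* Illustrative scalar equivalent of TransposeWx8 (hypothetical helper, not
 * part of libyuv). It reads a width x 8 strip and writes its transpose; the
 * DSPR2 kernel above computes the same result, packing each 8-byte column
 * into two words with precr.qb.ph before storing them. */
static void TransposeWx8_Reference(const uint8_t* src, int src_stride,
                                   uint8_t* dst, int dst_stride, int width) {
  for (int i = 0; i < width; ++i) {   /* one source column per iteration */
    for (int j = 0; j < 8; ++j) {
      dst[i * dst_stride + j] = src[j * src_stride + i];
    }
  }
}
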
+
+void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride,
+                                  uint8* dst, int dst_stride,
+                                  int width) {
+  __asm__ __volatile__ (
+      ".set noat                                         \n"
+      ".set push                                         \n"
+      ".set noreorder                                    \n"
+      "beqz             %[width], 2f                     \n"
+      " sll             $t2, %[src_stride], 0x1          \n"  // src_stride x 2
+      "sll              $t4, %[src_stride], 0x2          \n"  // src_stride x 4
+      "sll              $t9, %[src_stride], 0x3          \n"  // src_stride x 8
+      "addu             $t3, $t2, %[src_stride]          \n"
+      "addu             $t5, $t4, %[src_stride]          \n"
+      "addu             $t6, $t2, $t4                    \n"
+
+      "srl              $AT, %[width], 0x2               \n"
+      "andi             $t0, %[dst], 0x3                 \n"
+      "andi             $t1, %[dst_stride], 0x3          \n"
+      "or               $t0, $t0, $t1                    \n"
+      "bnez             $t0, 11f                         \n"
+      " subu            $t7, $t9, %[src_stride]          \n"
+// dst + dst_stride word aligned
+      "1:                                                \n"
+      "lw               $t0, 0(%[src])                   \n"
+      "lwx              $t1, %[src_stride](%[src])       \n"
+      "lwx              $t8, $t2(%[src])                 \n"
+      "lwx              $t9, $t3(%[src])                 \n"
+
+// t0 = | 30 | 20 | 10 | 00 |
+// t1 = | 31 | 21 | 11 | 01 |
+// t8 = | 32 | 22 | 12 | 02 |
+// t9 = | 33 | 23 | 13 | 03 |
+
+      "precr.qb.ph     $s0, $t1, $t0                     \n"
+      "precr.qb.ph     $s1, $t9, $t8                     \n"
+      "precrq.qb.ph    $s2, $t1, $t0                     \n"
+      "precrq.qb.ph    $s3, $t9, $t8                     \n"
+
+  // s0 = | 21 | 01 | 20 | 00 |
+  // s1 = | 23 | 03 | 22 | 02 |
+  // s2 = | 31 | 11 | 30 | 10 |
+  // s3 = | 33 | 13 | 32 | 12 |
+
+      "precr.qb.ph     $s4, $s1, $s0                     \n"
+      "precrq.qb.ph    $s5, $s1, $s0                     \n"
+      "precr.qb.ph     $s6, $s3, $s2                     \n"
+      "precrq.qb.ph    $s7, $s3, $s2                     \n"
+
+  // s4 = | 03 | 02 | 01 | 00 |
+  // s5 = | 23 | 22 | 21 | 20 |
+  // s6 = | 13 | 12 | 11 | 10 |
+  // s7 = | 33 | 32 | 31 | 30 |
+
+      "lwx              $t0, $t4(%[src])                 \n"
+      "lwx              $t1, $t5(%[src])                 \n"
+      "lwx              $t8, $t6(%[src])                 \n"
+      "lwx              $t9, $t7(%[src])                 \n"
+
+// t0 = | 34 | 24 | 14 | 04 |
+// t1 = | 35 | 25 | 15 | 05 |
+// t8 = | 36 | 26 | 16 | 06 |
+// t9 = | 37 | 27 | 17 | 07 |
+
+      "precr.qb.ph     $s0, $t1, $t0                     \n"
+      "precr.qb.ph     $s1, $t9, $t8                     \n"
+      "precrq.qb.ph    $s2, $t1, $t0                     \n"
+      "precrq.qb.ph    $s3, $t9, $t8                     \n"
+
+  // s0 = | 25 | 05 | 24 | 04 |
+  // s1 = | 27 | 07 | 26 | 06 |
+  // s2 = | 35 | 15 | 34 | 14 |
+  // s3 = | 37 | 17 | 36 | 16 |
+
+      "precr.qb.ph     $t0, $s1, $s0                     \n"
+      "precrq.qb.ph    $t1, $s1, $s0                     \n"
+      "precr.qb.ph     $t8, $s3, $s2                     \n"
+      "precrq.qb.ph    $t9, $s3, $s2                     \n"
+
+  // t0 = | 07 | 06 | 05 | 04 |
+  // t1 = | 27 | 26 | 25 | 24 |
+  // t8 = | 17 | 16 | 15 | 14 |
+  // t9 = | 37 | 36 | 35 | 34 |
+
+      "addu            $s0, %[dst], %[dst_stride]        \n"
+      "addu            $s1, $s0, %[dst_stride]           \n"
+      "addu            $s2, $s1, %[dst_stride]           \n"
+
+      "sw              $s4, 0(%[dst])                    \n"
+      "sw              $t0, 4(%[dst])                    \n"
+      "sw              $s6, 0($s0)                       \n"
+      "sw              $t8, 4($s0)                       \n"
+      "sw              $s5, 0($s1)                       \n"
+      "sw              $t1, 4($s1)                       \n"
+      "sw              $s7, 0($s2)                       \n"
+      "sw              $t9, 4($s2)                       \n"
+
+      "addiu            $AT, -1                          \n"
+      "addiu            %[src], 4                        \n"
+
+      "bnez             $AT, 1b                          \n"
+      " addu            %[dst], $s2, %[dst_stride]       \n"
+      "b                2f                               \n"
+// dst + dst_stride unaligned
+      "11:                                               \n"
+      "lw               $t0, 0(%[src])                   \n"
+      "lwx              $t1, %[src_stride](%[src])       \n"
+      "lwx              $t8, $t2(%[src])                 \n"
+      "lwx              $t9, $t3(%[src])                 \n"
+
+// t0 = | 30 | 20 | 10 | 00 |
+// t1 = | 31 | 21 | 11 | 01 |
+// t8 = | 32 | 22 | 12 | 02 |
+// t9 = | 33 | 23 | 13 | 03 |
+
+      "precr.qb.ph     $s0, $t1, $t0                     \n"
+      "precr.qb.ph     $s1, $t9, $t8                     \n"
+      "precrq.qb.ph    $s2, $t1, $t0                     \n"
+      "precrq.qb.ph    $s3, $t9, $t8                     \n"
+
+  // s0 = | 21 | 01 | 20 | 00 |
+  // s1 = | 23 | 03 | 22 | 02 |
+  // s2 = | 31 | 11 | 30 | 10 |
+  // s3 = | 33 | 13 | 32 | 12 |
+
+      "precr.qb.ph     $s4, $s1, $s0                     \n"
+      "precrq.qb.ph    $s5, $s1, $s0                     \n"
+      "precr.qb.ph     $s6, $s3, $s2                     \n"
+      "precrq.qb.ph    $s7, $s3, $s2                     \n"
+
+  // s4 = | 03 | 02 | 01 | 00 |
+  // s5 = | 23 | 22 | 21 | 20 |
+  // s6 = | 13 | 12 | 11 | 10 |
+  // s7 = | 33 | 32 | 31 | 30 |
+
+      "lwx              $t0, $t4(%[src])                 \n"
+      "lwx              $t1, $t5(%[src])                 \n"
+      "lwx              $t8, $t6(%[src])                 \n"
+      "lwx              $t9, $t7(%[src])                 \n"
+
+// t0 = | 34 | 24 | 14 | 04 |
+// t1 = | 35 | 25 | 15 | 05 |
+// t8 = | 36 | 26 | 16 | 06 |
+// t9 = | 37 | 27 | 17 | 07 |
+
+      "precr.qb.ph     $s0, $t1, $t0                     \n"
+      "precr.qb.ph     $s1, $t9, $t8                     \n"
+      "precrq.qb.ph    $s2, $t1, $t0                     \n"
+      "precrq.qb.ph    $s3, $t9, $t8                     \n"
+
+  // s0 = | 25 | 05 | 24 | 04 |
+  // s1 = | 27 | 07 | 26 | 06 |
+  // s2 = | 35 | 15 | 34 | 14 |
+  // s3 = | 37 | 17 | 36 | 16 |
+
+      "precr.qb.ph     $t0, $s1, $s0                     \n"
+      "precrq.qb.ph    $t1, $s1, $s0                     \n"
+      "precr.qb.ph     $t8, $s3, $s2                     \n"
+      "precrq.qb.ph    $t9, $s3, $s2                     \n"
+
+  // t0 = | 07 | 06 | 05 | 04 |
+  // t1 = | 27 | 26 | 25 | 24 |
+  // t8 = | 17 | 16 | 15 | 14 |
+  // t9 = | 37 | 36 | 35 | 34 |
+
+      "addu            $s0, %[dst], %[dst_stride]        \n"
+      "addu            $s1, $s0, %[dst_stride]           \n"
+      "addu            $s2, $s1, %[dst_stride]           \n"
+
+      "swr              $s4, 0(%[dst])                   \n"
+      "swl              $s4, 3(%[dst])                   \n"
+      "swr              $t0, 4(%[dst])                   \n"
+      "swl              $t0, 7(%[dst])                   \n"
+      "swr              $s6, 0($s0)                      \n"
+      "swl              $s6, 3($s0)                      \n"
+      "swr              $t8, 4($s0)                      \n"
+      "swl              $t8, 7($s0)                      \n"
+      "swr              $s5, 0($s1)                      \n"
+      "swl              $s5, 3($s1)                      \n"
+      "swr              $t1, 4($s1)                      \n"
+      "swl              $t1, 7($s1)                      \n"
+      "swr              $s7, 0($s2)                      \n"
+      "swl              $s7, 3($s2)                      \n"
+      "swr              $t9, 4($s2)                      \n"
+      "swl              $t9, 7($s2)                      \n"
+
+      "addiu            $AT, -1                          \n"
+      "addiu            %[src], 4                        \n"
+
+      "bnez             $AT, 11b                         \n"
+      " addu            %[dst], $s2, %[dst_stride]       \n"
+      "2:                                                \n"
+      ".set pop                                          \n"
+      ".set at                                           \n"
+      :[src] "+r" (src),
+       [dst] "+r" (dst),
+       [width] "+r" (width)
+      :[src_stride] "r" (src_stride),
+       [dst_stride] "r" (dst_stride)
+      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9",
+        "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7"
+  );
+}
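
The byte-lane comments in the loop above are the heart of this kernel: two rounds of precr.qb.ph/precrq.qb.ph convert four row words into four column words, i.e. a 4x4 byte transpose, and the second group of loads repeats the trick for rows 4..7. A small C model of the two pack instructions (function names and composition notes are ours, for illustration only):

#include <stdint.h>

/* C models of the two MIPS DSP pack instructions (illustrative, not libyuv
 * code). precr.qb.ph keeps the even byte of each halfword of its operands,
 * precrq.qb.ph keeps the odd byte. */
static uint32_t precr_qb_ph(uint32_t rs, uint32_t rt) {
  return (((rs >> 16) & 0xff) << 24) | ((rs & 0xff) << 16) |
         (((rt >> 16) & 0xff) << 8) | (rt & 0xff);
}
static uint32_t precrq_qb_ph(uint32_t rs, uint32_t rt) {
  return (rs & 0xff000000u) | (((rs >> 8) & 0xff) << 16) |
         (((rt >> 24) & 0xff) << 8) | ((rt >> 8) & 0xff);
}

/* With row words r0..r3, where rk = |3k|2k|1k|0k| as in the comments above:
 *   s0 = precr_qb_ph(r1, r0);   s1 = precr_qb_ph(r3, r2);
 *   s2 = precrq_qb_ph(r1, r0);  s3 = precrq_qb_ph(r3, r2);
 *   out0 = precr_qb_ph(s1, s0);   // |03|02|01|00|
 *   out1 = precr_qb_ph(s3, s2);   // |13|12|11|10|
 *   out2 = precrq_qb_ph(s1, s0);  // |23|22|21|20|
 *   out3 = precrq_qb_ph(s3, s2);  // |33|32|31|30|
 */
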
+
+void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
+                               uint8* dst_a, int dst_stride_a,
+                               uint8* dst_b, int dst_stride_b,
+                               int width) {
+  __asm__ __volatile__ (
+      ".set push                                         \n"
+      ".set noreorder                                    \n"
+      "beqz            %[width], 2f                      \n"
+      " sll            $t2, %[src_stride], 0x1           \n" // src_stride x 2
+      "sll             $t4, %[src_stride], 0x2           \n" // src_stride x 4
+      "sll             $t9, %[src_stride], 0x3           \n" // src_stride x 8
+      "addu            $t3, $t2, %[src_stride]           \n"
+      "addu            $t5, $t4, %[src_stride]           \n"
+      "addu            $t6, $t2, $t4                     \n"
+      "subu            $t7, $t9, %[src_stride]           \n"
+      "srl             $t1, %[width], 1                  \n"
+
+// check word alignment for dst_a, dst_b, dst_stride_a and dst_stride_b
+      "andi            $t0, %[dst_a], 0x3                \n"
+      "andi            $t8, %[dst_b], 0x3                \n"
+      "or              $t0, $t0, $t8                     \n"
+      "andi            $t8, %[dst_stride_a], 0x3         \n"
+      "andi            $s5, %[dst_stride_b], 0x3         \n"
+      "or              $t8, $t8, $s5                     \n"
+      "or              $t0, $t0, $t8                     \n"
+      "bnez            $t0, 11f                          \n"
+      " nop                                              \n"
+// dst + dst_stride word aligned (both the a and b dst addresses)
+    "1:                                                  \n"
+      "lw              $t0, 0(%[src])                    \n" // |B0|A0|b0|a0|
+      "lwx             $t8, %[src_stride](%[src])        \n" // |B1|A1|b1|a1|
+      "addu            $s5, %[dst_a], %[dst_stride_a]    \n"
+      "lwx             $t9, $t2(%[src])                  \n" // |B2|A2|b2|a2|
+      "lwx             $s0, $t3(%[src])                  \n" // |B3|A3|b3|a3|
+      "addu            $s6, %[dst_b], %[dst_stride_b]    \n"
+
+      "precrq.ph.w     $s1, $t8, $t0                     \n" // |B1|A1|B0|A0|
+      "precrq.ph.w     $s2, $s0, $t9                     \n" // |B3|A3|B2|A2|
+      "precr.qb.ph     $s3, $s2, $s1                     \n" // |A3|A2|A1|A0|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B3|B2|B1|B0|
+
+      "sll             $t0, $t0, 16                      \n"
+      "packrl.ph       $s1, $t8, $t0                     \n" // |b1|a1|b0|a0|
+      "sll             $t9, $t9, 16                      \n"
+      "packrl.ph       $s2, $s0, $t9                     \n" // |b3|a3|b2|a2|
+
+      "sw              $s3, 0($s5)                       \n"
+      "sw              $s4, 0($s6)                       \n"
+
+      "precr.qb.ph     $s3, $s2, $s1                     \n" // |a3|a2|a1|a0|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b3|b2|b1|b0|
+
+      "lwx             $t0, $t4(%[src])                  \n" // |B4|A4|b4|a4|
+      "lwx             $t8, $t5(%[src])                  \n" // |B5|A5|b5|a5|
+      "lwx             $t9, $t6(%[src])                  \n" // |B6|A6|b6|a6|
+      "lwx             $s0, $t7(%[src])                  \n" // |B7|A7|b7|a7|
+      "sw              $s3, 0(%[dst_a])                  \n"
+      "sw              $s4, 0(%[dst_b])                  \n"
+
+      "precrq.ph.w     $s1, $t8, $t0                     \n" // |B5|A5|B4|A4|
+      "precrq.ph.w     $s2, $s0, $t9                     \n" // |B6|A6|B7|A7|
+      "precr.qb.ph     $s3, $s2, $s1                     \n" // |A7|A6|A5|A4|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B7|B6|B5|B4|
+
+      "sll             $t0, $t0, 16                      \n"
+      "packrl.ph       $s1, $t8, $t0                     \n" // |b5|a5|b4|a4|
+      "sll             $t9, $t9, 16                      \n"
+      "packrl.ph       $s2, $s0, $t9                     \n" // |b7|a7|b6|a6|
+      "sw              $s3, 4($s5)                       \n"
+      "sw              $s4, 4($s6)                       \n"
+
+      "precr.qb.ph     $s3, $s2, $s1                     \n" // |a7|a6|a5|a4|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b7|b6|b5|b4|
+
+      "addiu           %[src], 4                         \n"
+      "addiu           $t1, -1                           \n"
+      "sll             $t0, %[dst_stride_a], 1           \n"
+      "sll             $t8, %[dst_stride_b], 1           \n"
+      "sw              $s3, 4(%[dst_a])                  \n"
+      "sw              $s4, 4(%[dst_b])                  \n"
+      "addu            %[dst_a], %[dst_a], $t0           \n"
+      "bnez            $t1, 1b                           \n"
+      " addu           %[dst_b], %[dst_b], $t8           \n"
+      "b               2f                                \n"
+      " nop                                              \n"
+
+// dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned
+   "11:                                                  \n"
+      "lw              $t0, 0(%[src])                    \n" // |B0|A0|b0|a0|
+      "lwx             $t8, %[src_stride](%[src])        \n" // |B1|A1|b1|a1|
+      "addu            $s5, %[dst_a], %[dst_stride_a]    \n"
+      "lwx             $t9, $t2(%[src])                  \n" // |B2|A2|b2|a2|
+      "lwx             $s0, $t3(%[src])                  \n" // |B3|A3|b3|a3|
+      "addu            $s6, %[dst_b], %[dst_stride_b]    \n"
+
+      "precrq.ph.w     $s1, $t8, $t0                     \n" // |B1|A1|B0|A0|
+      "precrq.ph.w     $s2, $s0, $t9                     \n" // |B3|A3|B2|A2|
+      "precr.qb.ph     $s3, $s2, $s1                     \n" // |A3|A2|A1|A0|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B3|B2|B1|B0|
+
+      "sll             $t0, $t0, 16                      \n"
+      "packrl.ph       $s1, $t8, $t0                     \n" // |b1|a1|b0|a0|
+      "sll             $t9, $t9, 16                      \n"
+      "packrl.ph       $s2, $s0, $t9                     \n" // |b3|a3|b2|a2|
+
+      "swr             $s3, 0($s5)                       \n"
+      "swl             $s3, 3($s5)                       \n"
+      "swr             $s4, 0($s6)                       \n"
+      "swl             $s4, 3($s6)                       \n"
+
+      "precr.qb.ph     $s3, $s2, $s1                     \n" // |a3|a2|a1|a0|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b3|b2|b1|b0|
+
+      "lwx             $t0, $t4(%[src])                  \n" // |B4|A4|b4|a4|
+      "lwx             $t8, $t5(%[src])                  \n" // |B5|A5|b5|a5|
+      "lwx             $t9, $t6(%[src])                  \n" // |B6|A6|b6|a6|
+      "lwx             $s0, $t7(%[src])                  \n" // |B7|A7|b7|a7|
+      "swr             $s3, 0(%[dst_a])                  \n"
+      "swl             $s3, 3(%[dst_a])                  \n"
+      "swr             $s4, 0(%[dst_b])                  \n"
+      "swl             $s4, 3(%[dst_b])                  \n"
+
+      "precrq.ph.w     $s1, $t8, $t0                     \n" // |B5|A5|B4|A4|
+      "precrq.ph.w     $s2, $s0, $t9                     \n" // |B6|A6|B7|A7|
+      "precr.qb.ph     $s3, $s2, $s1                     \n" // |A7|A6|A5|A4|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B7|B6|B5|B4|
+
+      "sll             $t0, $t0, 16                      \n"
+      "packrl.ph       $s1, $t8, $t0                     \n" // |b5|a5|b4|a4|
+      "sll             $t9, $t9, 16                      \n"
+      "packrl.ph       $s2, $s0, $t9                     \n" // |b7|a7|b6|a6|
+
+      "swr             $s3, 4($s5)                       \n"
+      "swl             $s3, 7($s5)                       \n"
+      "swr             $s4, 4($s6)                       \n"
+      "swl             $s4, 7($s6)                       \n"
+
+      "precr.qb.ph     $s3, $s2, $s1                     \n" // |a7|a6|a5|a4|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b7|b6|b5|b4|
+
+      "addiu           %[src], 4                         \n"
+      "addiu           $t1, -1                           \n"
+      "sll             $t0, %[dst_stride_a], 1           \n"
+      "sll             $t8, %[dst_stride_b], 1           \n"
+      "swr             $s3, 4(%[dst_a])                  \n"
+      "swl             $s3, 7(%[dst_a])                  \n"
+      "swr             $s4, 4(%[dst_b])                  \n"
+      "swl             $s4, 7(%[dst_b])                  \n"
+      "addu            %[dst_a], %[dst_a], $t0           \n"
+      "bnez            $t1, 11b                          \n"
+      " addu           %[dst_b], %[dst_b], $t8           \n"
+
+      "2:                                                \n"
+      ".set pop                                          \n"
+      : [src] "+r" (src),
+        [dst_a] "+r" (dst_a),
+        [dst_b] "+r" (dst_b),
+        [width] "+r" (width),
+        [src_stride] "+r" (src_stride)
+      : [dst_stride_a] "r" (dst_stride_a),
+        [dst_stride_b] "r" (dst_stride_b)
+      : "t0", "t1",  "t2", "t3",  "t4", "t5",
+        "t6", "t7", "t8", "t9",
+        "s0", "s1", "s2", "s3",
+        "s4", "s5", "s6"
+  );
+}
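
The same reading aid for the UV variant, in scalar C (sketch ours, not libyuv's): transposing the interleaved strip de-interleaves it at the same time, one byte plane per destination.

#include <stdint.h>

/* Illustrative scalar equivalent of TransposeUVWx8 (hypothetical helper,
 * not part of libyuv). width counts byte pairs: column i of the interleaved
 * source becomes row i of dst_a (the a bytes) and row i of dst_b (the b
 * bytes). */
static void TransposeUVWx8_Reference(const uint8_t* src, int src_stride,
                                     uint8_t* dst_a, int dst_stride_a,
                                     uint8_t* dst_b, int dst_stride_b,
                                     int width) {
  for (int i = 0; i < width; ++i) {
    for (int j = 0; j < 8; ++j) {
      dst_a[i * dst_stride_a + j] = src[j * src_stride + 2 * i + 0];
      dst_b[i * dst_stride_b + j] = src[j * src_stride + 2 * i + 1];
    }
  }
}
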
+
+#endif  // defined(__mips_dsp) && (__mips_dsp_rev >= 2) && (_MIPS_SIM == _MIPS_SIM_ABI32)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
--- /dev/null
+++ b/third_party/libyuv/source/rotate_neon.cc
@@ -1,0 +1,533 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
+
+static uvec8 kVTbl4x4Transpose =
+  { 0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15 };
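
This table drives the vtbl lookups in the 4x8 residual path below: output byte k selects input byte kVTbl4x4Transpose[k], which reads a row-major 4x4 block column-major, i.e. transposes it in a single shuffle. A quick scalar check of that claim (illustrative only, not libyuv code):

#include <stdint.h>
#include <stdio.h>

static const uint8_t kTbl[16] =  /* same values as kVTbl4x4Transpose */
  { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };

int main(void) {
  uint8_t in[16], out[16];
  for (int i = 0; i < 16; ++i) in[i] = (uint8_t)i;    /* in[r*4+c] */
  for (int k = 0; k < 16; ++k) out[k] = in[kTbl[k]];  /* what vtbl does */
  for (int r = 0; r < 4; ++r)
    for (int c = 0; c < 4; ++c)
      if (out[r * 4 + c] != in[c * 4 + r]) return 1;
  puts("kVTbl4x4Transpose transposes a 4x4 byte block");
  return 0;
}
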
+
+void TransposeWx8_NEON(const uint8* src, int src_stride,
+                       uint8* dst, int dst_stride,
+                       int width) {
+  const uint8* src_temp = NULL;
+  asm volatile (
+    // loops are on blocks of 8. loop will stop when
+    // counter goes below 0. starting the counter
+    // at w-8 allows for this
+    "sub         %5, #8                        \n"
+
+    // handle 8x8 blocks. this should be the majority of the plane
+    ".p2align  2                               \n"
+    "1:                                        \n"
+      "mov         %0, %1                      \n"
+
+      MEMACCESS(0)
+      "vld1.8      {d0}, [%0], %2              \n"
+      MEMACCESS(0)
+      "vld1.8      {d1}, [%0], %2              \n"
+      MEMACCESS(0)
+      "vld1.8      {d2}, [%0], %2              \n"
+      MEMACCESS(0)
+      "vld1.8      {d3}, [%0], %2              \n"
+      MEMACCESS(0)
+      "vld1.8      {d4}, [%0], %2              \n"
+      MEMACCESS(0)
+      "vld1.8      {d5}, [%0], %2              \n"
+      MEMACCESS(0)
+      "vld1.8      {d6}, [%0], %2              \n"
+      MEMACCESS(0)
+      "vld1.8      {d7}, [%0]                  \n"
+
+      "vtrn.8      d1, d0                      \n"
+      "vtrn.8      d3, d2                      \n"
+      "vtrn.8      d5, d4                      \n"
+      "vtrn.8      d7, d6                      \n"
+
+      "vtrn.16     d1, d3                      \n"
+      "vtrn.16     d0, d2                      \n"
+      "vtrn.16     d5, d7                      \n"
+      "vtrn.16     d4, d6                      \n"
+
+      "vtrn.32     d1, d5                      \n"
+      "vtrn.32     d0, d4                      \n"
+      "vtrn.32     d3, d7                      \n"
+      "vtrn.32     d2, d6                      \n"
+
+      "vrev16.8    q0, q0                      \n"
+      "vrev16.8    q1, q1                      \n"
+      "vrev16.8    q2, q2                      \n"
+      "vrev16.8    q3, q3                      \n"
+
+      "mov         %0, %3                      \n"
+
+      MEMACCESS(0)
+      "vst1.8      {d1}, [%0], %4              \n"
+      MEMACCESS(0)
+      "vst1.8      {d0}, [%0], %4              \n"
+      MEMACCESS(0)
+      "vst1.8      {d3}, [%0], %4              \n"
+      MEMACCESS(0)
+      "vst1.8      {d2}, [%0], %4              \n"
+      MEMACCESS(0)
+      "vst1.8      {d5}, [%0], %4              \n"
+      MEMACCESS(0)
+      "vst1.8      {d4}, [%0], %4              \n"
+      MEMACCESS(0)
+      "vst1.8      {d7}, [%0], %4              \n"
+      MEMACCESS(0)
+      "vst1.8      {d6}, [%0]                  \n"
+
+      "add         %1, #8                      \n"  // src += 8
+      "add         %3, %3, %4, lsl #3          \n"  // dst += 8 * dst_stride
+      "subs        %5,  #8                     \n"  // w   -= 8
+      "bge         1b                          \n"
+
+    // add 8 back to counter. if the result is 0 there are
+    // no residuals.
+    "adds        %5, #8                        \n"
+    "beq         4f                            \n"
+
+    // some residual, so between 1 and 7 lines left to transpose
+    "cmp         %5, #2                        \n"
+    "blt         3f                            \n"
+
+    "cmp         %5, #4                        \n"
+    "blt         2f                            \n"
+
+    // 4x8 block
+    "mov         %0, %1                        \n"
+    MEMACCESS(0)
+    "vld1.32     {d0[0]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.32     {d0[1]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.32     {d1[0]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.32     {d1[1]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.32     {d2[0]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.32     {d2[1]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.32     {d3[0]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.32     {d3[1]}, [%0]                 \n"
+
+    "mov         %0, %3                        \n"
+
+    MEMACCESS(6)
+    "vld1.8      {q3}, [%6]                    \n"
+
+    "vtbl.8      d4, {d0, d1}, d6              \n"
+    "vtbl.8      d5, {d0, d1}, d7              \n"
+    "vtbl.8      d0, {d2, d3}, d6              \n"
+    "vtbl.8      d1, {d2, d3}, d7              \n"
+
+    // TODO(frkoenig): Rework shuffle above to
+    // write out with 4 instead of 8 writes.
+    MEMACCESS(0)
+    "vst1.32     {d4[0]}, [%0], %4             \n"
+    MEMACCESS(0)
+    "vst1.32     {d4[1]}, [%0], %4             \n"
+    MEMACCESS(0)
+    "vst1.32     {d5[0]}, [%0], %4             \n"
+    MEMACCESS(0)
+    "vst1.32     {d5[1]}, [%0]                 \n"
+
+    "add         %0, %3, #4                    \n"
+    MEMACCESS(0)
+    "vst1.32     {d0[0]}, [%0], %4             \n"
+    MEMACCESS(0)
+    "vst1.32     {d0[1]}, [%0], %4             \n"
+    MEMACCESS(0)
+    "vst1.32     {d1[0]}, [%0], %4             \n"
+    MEMACCESS(0)
+    "vst1.32     {d1[1]}, [%0]                 \n"
+
+    "add         %1, #4                        \n"  // src += 4
+    "add         %3, %3, %4, lsl #2            \n"  // dst += 4 * dst_stride
+    "subs        %5,  #4                       \n"  // w   -= 4
+    "beq         4f                            \n"
+
+    // some residual, check to see if it includes a 2x8 block,
+    // or less
+    "cmp         %5, #2                        \n"
+    "blt         3f                            \n"
+
+    // 2x8 block
+    "2:                                        \n"
+    "mov         %0, %1                        \n"
+    MEMACCESS(0)
+    "vld1.16     {d0[0]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.16     {d1[0]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.16     {d0[1]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.16     {d1[1]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.16     {d0[2]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.16     {d1[2]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.16     {d0[3]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.16     {d1[3]}, [%0]                 \n"
+
+    "vtrn.8      d0, d1                        \n"
+
+    "mov         %0, %3                        \n"
+
+    MEMACCESS(0)
+    "vst1.64     {d0}, [%0], %4                \n"
+    MEMACCESS(0)
+    "vst1.64     {d1}, [%0]                    \n"
+
+    "add         %1, #2                        \n"  // src += 2
+    "add         %3, %3, %4, lsl #1            \n"  // dst += 2 * dst_stride
+    "subs        %5,  #2                       \n"  // w   -= 2
+    "beq         4f                            \n"
+
+    // 1x8 block
+    "3:                                        \n"
+    MEMACCESS(1)
+    "vld1.8      {d0[0]}, [%1], %2             \n"
+    MEMACCESS(1)
+    "vld1.8      {d0[1]}, [%1], %2             \n"
+    MEMACCESS(1)
+    "vld1.8      {d0[2]}, [%1], %2             \n"
+    MEMACCESS(1)
+    "vld1.8      {d0[3]}, [%1], %2             \n"
+    MEMACCESS(1)
+    "vld1.8      {d0[4]}, [%1], %2             \n"
+    MEMACCESS(1)
+    "vld1.8      {d0[5]}, [%1], %2             \n"
+    MEMACCESS(1)
+    "vld1.8      {d0[6]}, [%1], %2             \n"
+    MEMACCESS(1)
+    "vld1.8      {d0[7]}, [%1]                 \n"
+
+    MEMACCESS(3)
+    "vst1.64     {d0}, [%3]                    \n"
+
+    "4:                                        \n"
+
+    : "+r"(src_temp),          // %0
+      "+r"(src),               // %1
+      "+r"(src_stride),        // %2
+      "+r"(dst),               // %3
+      "+r"(dst_stride),        // %4
+      "+r"(width)              // %5
+    : "r"(&kVTbl4x4Transpose)  // %6
+    : "memory", "cc", "q0", "q1", "q2", "q3"
+  );
+}
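
The vtrn.8 / vtrn.16 / vtrn.32 ladder in the 8x8 loop above is the classic recursive transpose: each stage swaps off-diagonal sub-blocks of size 1, then 2, then 4, and the three stages compose to a full 8x8 byte transpose (the vrev16.8 passes compensate for the reversed operand order used in the vtrn.8 steps). A C model of one stage (ours, for illustration):

#include <stdint.h>

/* Illustrative model (not libyuv code) of one vtrn stage: inside every
 * 2d x 2d tile of the matrix, swap the two off-diagonal d x d sub-blocks.
 * Running d = 1, 2, 4 in any order transposes the 8x8 byte matrix, which
 * is what the vtrn.8, vtrn.16 and vtrn.32 stages do across lanes. */
static void SwapStage(uint8_t m[8][8], int d) {
  for (int r = 0; r < 8; r += 2 * d)
    for (int c = 0; c < 8; c += 2 * d)
      for (int i = 0; i < d; ++i)
        for (int j = 0; j < d; ++j) {
          uint8_t t = m[r + i][c + d + j];
          m[r + i][c + d + j] = m[r + d + i][c + j];
          m[r + d + i][c + j] = t;
        }
}

/* Usage: SwapStage(m, 1); SwapStage(m, 2); SwapStage(m, 4);  -> m is m^T */
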
+
+static uvec8 kVTbl4x4TransposeDi =
+  { 0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15 };
+
+void TransposeUVWx8_NEON(const uint8* src, int src_stride,
+                         uint8* dst_a, int dst_stride_a,
+                         uint8* dst_b, int dst_stride_b,
+                         int width) {
+  const uint8* src_temp = NULL;
+  asm volatile (
+    // loops are on blocks of 8. loop will stop when
+    // counter goes below 0. starting the counter
+    // at w-8 allows for this
+    "sub         %7, #8                        \n"
+
+    // handle 8x8 blocks. this should be the majority of the plane
+    ".p2align  2                               \n"
+    "1:                                        \n"
+      "mov         %0, %1                      \n"
+
+      MEMACCESS(0)
+      "vld2.8      {d0,  d1},  [%0], %2        \n"
+      MEMACCESS(0)
+      "vld2.8      {d2,  d3},  [%0], %2        \n"
+      MEMACCESS(0)
+      "vld2.8      {d4,  d5},  [%0], %2        \n"
+      MEMACCESS(0)
+      "vld2.8      {d6,  d7},  [%0], %2        \n"
+      MEMACCESS(0)
+      "vld2.8      {d16, d17}, [%0], %2        \n"
+      MEMACCESS(0)
+      "vld2.8      {d18, d19}, [%0], %2        \n"
+      MEMACCESS(0)
+      "vld2.8      {d20, d21}, [%0], %2        \n"
+      MEMACCESS(0)
+      "vld2.8      {d22, d23}, [%0]            \n"
+
+      "vtrn.8      q1, q0                      \n"
+      "vtrn.8      q3, q2                      \n"
+      "vtrn.8      q9, q8                      \n"
+      "vtrn.8      q11, q10                    \n"
+
+      "vtrn.16     q1, q3                      \n"
+      "vtrn.16     q0, q2                      \n"
+      "vtrn.16     q9, q11                     \n"
+      "vtrn.16     q8, q10                     \n"
+
+      "vtrn.32     q1, q9                      \n"
+      "vtrn.32     q0, q8                      \n"
+      "vtrn.32     q3, q11                     \n"
+      "vtrn.32     q2, q10                     \n"
+
+      "vrev16.8    q0, q0                      \n"
+      "vrev16.8    q1, q1                      \n"
+      "vrev16.8    q2, q2                      \n"
+      "vrev16.8    q3, q3                      \n"
+      "vrev16.8    q8, q8                      \n"
+      "vrev16.8    q9, q9                      \n"
+      "vrev16.8    q10, q10                    \n"
+      "vrev16.8    q11, q11                    \n"
+
+      "mov         %0, %3                      \n"
+
+      MEMACCESS(0)
+      "vst1.8      {d2},  [%0], %4             \n"
+      MEMACCESS(0)
+      "vst1.8      {d0},  [%0], %4             \n"
+      MEMACCESS(0)
+      "vst1.8      {d6},  [%0], %4             \n"
+      MEMACCESS(0)
+      "vst1.8      {d4},  [%0], %4             \n"
+      MEMACCESS(0)
+      "vst1.8      {d18}, [%0], %4             \n"
+      MEMACCESS(0)
+      "vst1.8      {d16}, [%0], %4             \n"
+      MEMACCESS(0)
+      "vst1.8      {d22}, [%0], %4             \n"
+      MEMACCESS(0)
+      "vst1.8      {d20}, [%0]                 \n"
+
+      "mov         %0, %5                      \n"
+
+      MEMACCESS(0)
+      "vst1.8      {d3},  [%0], %6             \n"
+      MEMACCESS(0)
+      "vst1.8      {d1},  [%0], %6             \n"
+      MEMACCESS(0)
+      "vst1.8      {d7},  [%0], %6             \n"
+      MEMACCESS(0)
+      "vst1.8      {d5},  [%0], %6             \n"
+      MEMACCESS(0)
+      "vst1.8      {d19}, [%0], %6             \n"
+      MEMACCESS(0)
+      "vst1.8      {d17}, [%0], %6             \n"
+      MEMACCESS(0)
+      "vst1.8      {d23}, [%0], %6             \n"
+      MEMACCESS(0)
+      "vst1.8      {d21}, [%0]                 \n"
+
+      "add         %1, #8*2                    \n"  // src   += 8*2
+      "add         %3, %3, %4, lsl #3          \n"  // dst_a += 8 * dst_stride_a
+      "add         %5, %5, %6, lsl #3          \n"  // dst_b += 8 * dst_stride_b
+      "subs        %7,  #8                     \n"  // w     -= 8
+      "bge         1b                          \n"
+
+    // add 8 back to counter. if the result is 0 there are
+    // no residuals.
+    "adds        %7, #8                        \n"
+    "beq         4f                            \n"
+
+    // some residual, so between 1 and 7 lines left to transpose
+    "cmp         %7, #2                        \n"
+    "blt         3f                            \n"
+
+    "cmp         %7, #4                        \n"
+    "blt         2f                            \n"
+
+    // TODO(frkoenig): Clean this up
+    // 4x8 block
+    "mov         %0, %1                        \n"
+    MEMACCESS(0)
+    "vld1.64     {d0}, [%0], %2                \n"
+    MEMACCESS(0)
+    "vld1.64     {d1}, [%0], %2                \n"
+    MEMACCESS(0)
+    "vld1.64     {d2}, [%0], %2                \n"
+    MEMACCESS(0)
+    "vld1.64     {d3}, [%0], %2                \n"
+    MEMACCESS(0)
+    "vld1.64     {d4}, [%0], %2                \n"
+    MEMACCESS(0)
+    "vld1.64     {d5}, [%0], %2                \n"
+    MEMACCESS(0)
+    "vld1.64     {d6}, [%0], %2                \n"
+    MEMACCESS(0)
+    "vld1.64     {d7}, [%0]                    \n"
+
+    MEMACCESS(8)
+    "vld1.8      {q15}, [%8]                   \n"
+
+    "vtrn.8      q0, q1                        \n"
+    "vtrn.8      q2, q3                        \n"
+
+    "vtbl.8      d16, {d0, d1}, d30            \n"
+    "vtbl.8      d17, {d0, d1}, d31            \n"
+    "vtbl.8      d18, {d2, d3}, d30            \n"
+    "vtbl.8      d19, {d2, d3}, d31            \n"
+    "vtbl.8      d20, {d4, d5}, d30            \n"
+    "vtbl.8      d21, {d4, d5}, d31            \n"
+    "vtbl.8      d22, {d6, d7}, d30            \n"
+    "vtbl.8      d23, {d6, d7}, d31            \n"
+
+    "mov         %0, %3                        \n"
+
+    MEMACCESS(0)
+    "vst1.32     {d16[0]},  [%0], %4           \n"
+    MEMACCESS(0)
+    "vst1.32     {d16[1]},  [%0], %4           \n"
+    MEMACCESS(0)
+    "vst1.32     {d17[0]},  [%0], %4           \n"
+    MEMACCESS(0)
+    "vst1.32     {d17[1]},  [%0], %4           \n"
+
+    "add         %0, %3, #4                    \n"
+    MEMACCESS(0)
+    "vst1.32     {d20[0]}, [%0], %4            \n"
+    MEMACCESS(0)
+    "vst1.32     {d20[1]}, [%0], %4            \n"
+    MEMACCESS(0)
+    "vst1.32     {d21[0]}, [%0], %4            \n"
+    MEMACCESS(0)
+    "vst1.32     {d21[1]}, [%0]                \n"
+
+    "mov         %0, %5                        \n"
+
+    MEMACCESS(0)
+    "vst1.32     {d18[0]}, [%0], %6            \n"
+    MEMACCESS(0)
+    "vst1.32     {d18[1]}, [%0], %6            \n"
+    MEMACCESS(0)
+    "vst1.32     {d19[0]}, [%0], %6            \n"
+    MEMACCESS(0)
+    "vst1.32     {d19[1]}, [%0], %6            \n"
+
+    "add         %0, %5, #4                    \n"
+    MEMACCESS(0)
+    "vst1.32     {d22[0]},  [%0], %6           \n"
+    MEMACCESS(0)
+    "vst1.32     {d22[1]},  [%0], %6           \n"
+    MEMACCESS(0)
+    "vst1.32     {d23[0]},  [%0], %6           \n"
+    MEMACCESS(0)
+    "vst1.32     {d23[1]},  [%0]               \n"
+
+    "add         %1, #4*2                      \n"  // src   += 4 * 2
+    "add         %3, %3, %4, lsl #2            \n"  // dst_a += 4 * dst_stride_a
+    "add         %5, %5, %6, lsl #2            \n"  // dst_b += 4 * dst_stride_b
+    "subs        %7,  #4                       \n"  // w     -= 4
+    "beq         4f                            \n"
+
+    // some residual, check to see if it includes a 2x8 block,
+    // or less
+    "cmp         %7, #2                        \n"
+    "blt         3f                            \n"
+
+    // 2x8 block
+    "2:                                        \n"
+    "mov         %0, %1                        \n"
+    MEMACCESS(0)
+    "vld2.16     {d0[0], d2[0]}, [%0], %2      \n"
+    MEMACCESS(0)
+    "vld2.16     {d1[0], d3[0]}, [%0], %2      \n"
+    MEMACCESS(0)
+    "vld2.16     {d0[1], d2[1]}, [%0], %2      \n"
+    MEMACCESS(0)
+    "vld2.16     {d1[1], d3[1]}, [%0], %2      \n"
+    MEMACCESS(0)
+    "vld2.16     {d0[2], d2[2]}, [%0], %2      \n"
+    MEMACCESS(0)
+    "vld2.16     {d1[2], d3[2]}, [%0], %2      \n"
+    MEMACCESS(0)
+    "vld2.16     {d0[3], d2[3]}, [%0], %2      \n"
+    MEMACCESS(0)
+    "vld2.16     {d1[3], d3[3]}, [%0]          \n"
+
+    "vtrn.8      d0, d1                        \n"
+    "vtrn.8      d2, d3                        \n"
+
+    "mov         %0, %3                        \n"
+
+    MEMACCESS(0)
+    "vst1.64     {d0}, [%0], %4                \n"
+    MEMACCESS(0)
+    "vst1.64     {d2}, [%0]                    \n"
+
+    "mov         %0, %5                        \n"
+
+    MEMACCESS(0)
+    "vst1.64     {d1}, [%0], %6                \n"
+    MEMACCESS(0)
+    "vst1.64     {d3}, [%0]                    \n"
+
+    "add         %1, #2*2                      \n"  // src   += 2 * 2
+    "add         %3, %3, %4, lsl #1            \n"  // dst_a += 2 * dst_stride_a
+    "add         %5, %5, %6, lsl #1            \n"  // dst_b += 2 * dst_stride_b
+    "subs        %7,  #2                       \n"  // w     -= 2
+    "beq         4f                            \n"
+
+    // 1x8 block
+    "3:                                        \n"
+    MEMACCESS(1)
+    "vld2.8      {d0[0], d1[0]}, [%1], %2      \n"
+    MEMACCESS(1)
+    "vld2.8      {d0[1], d1[1]}, [%1], %2      \n"
+    MEMACCESS(1)
+    "vld2.8      {d0[2], d1[2]}, [%1], %2      \n"
+    MEMACCESS(1)
+    "vld2.8      {d0[3], d1[3]}, [%1], %2      \n"
+    MEMACCESS(1)
+    "vld2.8      {d0[4], d1[4]}, [%1], %2      \n"
+    MEMACCESS(1)
+    "vld2.8      {d0[5], d1[5]}, [%1], %2      \n"
+    MEMACCESS(1)
+    "vld2.8      {d0[6], d1[6]}, [%1], %2      \n"
+    MEMACCESS(1)
+    "vld2.8      {d0[7], d1[7]}, [%1]          \n"
+
+    MEMACCESS(3)
+    "vst1.64     {d0}, [%3]                    \n"
+    MEMACCESS(5)
+    "vst1.64     {d1}, [%5]                    \n"
+
+    "4:                                        \n"
+
+    : "+r"(src_temp),            // %0
+      "+r"(src),                 // %1
+      "+r"(src_stride),          // %2
+      "+r"(dst_a),               // %3
+      "+r"(dst_stride_a),        // %4
+      "+r"(dst_b),               // %5
+      "+r"(dst_stride_b),        // %6
+      "+r"(width)                // %7
+    : "r"(&kVTbl4x4TransposeDi)  // %8
+    : "memory", "cc",
+      "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
+  );
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
--- a/third_party/libyuv/source/row_any.cc
+++ b/third_party/libyuv/source/row_any.cc
@@ -8,9 +8,9 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "third_party/libyuv/include/libyuv/row.h"
+#include "libyuv/row.h"
 
-#include "third_party/libyuv/include/libyuv/basic_types.h"
+#include "libyuv/basic_types.h"
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -35,10 +35,12 @@
     }
 
 #ifdef HAS_I422TOARGBROW_SSSE3
-YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C,
-     0, 4, 7)
 YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C,
      1, 4, 7)
+#endif  // HAS_I422TOARGBROW_SSSE3
+#ifdef HAS_I444TOARGBROW_SSSE3
+YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C,
+     0, 4, 7)
 YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C,
      2, 4, 7)
 YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C,
@@ -59,7 +61,7 @@
 YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, I422ToRAWRow_C, 1, 3, 7)
 YANY(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, I422ToYUY2Row_C, 1, 2, 15)
 YANY(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, I422ToUYVYRow_C, 1, 2, 15)
-#endif  // HAS_I422TOARGBROW_SSSE3
+#endif  // HAS_I444TOARGBROW_SSSE3
 #ifdef HAS_I422TOARGBROW_AVX2
 YANY(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, I422ToARGBRow_C, 1, 4, 15)
 #endif  // HAS_I422TOARGBROW_AVX2
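
For context on the reshuffle above: every YANY expansion wraps a fast row function so that it accepts any width, running the SIMD body on the largest multiple of its block size and finishing the row with the C fallback. A hedged, single-plane sketch of the pattern (the real macro also handles multiple source pointers and a UV subsampling shift; the names here are ours):

#include <stdint.h>

typedef void (*RowFunc)(const uint8_t* src, uint8_t* dst, int width);

/* Hedged sketch of the "any width" wrapper idea behind YANY (illustrative;
 * see the YANY macro in row_any.cc for the real definition). */
static void RowAny(RowFunc simd, RowFunc c_row, int bpp, int mask,
                   const uint8_t* src, uint8_t* dst, int width) {
  int n = width & ~mask;                       /* multiple of mask + 1 */
  if (n > 0) {
    simd(src, dst, n);                         /* fast path */
  }
  c_row(src + n, dst + n * bpp, width - n);    /* C tail for the rest */
}
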
--- a/third_party/libyuv/source/row_common.cc
+++ b/third_party/libyuv/source/row_common.cc
@@ -8,11 +8,11 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "third_party/libyuv/include/libyuv/row.h"
+#include "libyuv/row.h"
 
 #include <string.h>  // For memcpy and memset.
 
-#include "third_party/libyuv/include/libyuv/basic_types.h"
+#include "libyuv/basic_types.h"
 
 #ifdef __cplusplus
 namespace libyuv {
--- a/third_party/libyuv/source/row_mips.cc
+++ b/third_party/libyuv/source/row_mips.cc
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "third_party/libyuv/include/libyuv/row.h"
+#include "libyuv/row.h"
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -16,7 +16,8 @@
 #endif
 
 // The following are available on Mips platforms:
-#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__)
+#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \
+    (_MIPS_SIM == _MIPS_SIM_ABI32)
 
 #ifdef HAS_COPYROW_MIPS
 void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
@@ -376,7 +377,9 @@
 
 // MIPS DSPR2 functions
 #if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \
-    (__mips_dsp_rev >= 2)
+    (__mips_dsp_rev >= 2) && \
+    (_MIPS_SIM == _MIPS_SIM_ABI32)
+
 void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                            int width) {
   __asm__ __volatile__ (
--- a/third_party/libyuv/source/row_neon.cc
+++ b/third_party/libyuv/source/row_neon.cc
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "third_party/libyuv/include/libyuv/row.h"
+#include "libyuv/row.h"
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -20,14 +20,20 @@
 
 // Read 8 Y, 4 U and 4 V from 422
 #define READYUV422                                                             \
+    MEMACCESS(0)                                                               \
     "vld1.8     {d0}, [%0]!                    \n"                             \
+    MEMACCESS(1)                                                               \
     "vld1.32    {d2[0]}, [%1]!                 \n"                             \
+    MEMACCESS(2)                                                               \
     "vld1.32    {d2[1]}, [%2]!                 \n"
 
 // Read 8 Y, 2 U and 2 V from 422
 #define READYUV411                                                             \
+    MEMACCESS(0)                                                               \
     "vld1.8     {d0}, [%0]!                    \n"                             \
+    MEMACCESS(1)                                                               \
     "vld1.16    {d2[0]}, [%1]!                 \n"                             \
+    MEMACCESS(2)                                                               \
     "vld1.16    {d2[1]}, [%2]!                 \n"                             \
     "vmov.u8    d3, d2                         \n"                             \
     "vzip.u8    d2, d3                         \n"
@@ -34,8 +40,11 @@
 
 // Read 8 Y, 8 U and 8 V from 444
 #define READYUV444                                                             \
+    MEMACCESS(0)                                                               \
     "vld1.8     {d0}, [%0]!                    \n"                             \
+    MEMACCESS(1)                                                               \
     "vld1.8     {d2}, [%1]!                    \n"                             \
+    MEMACCESS(2)                                                               \
     "vld1.8     {d3}, [%2]!                    \n"                             \
     "vpaddl.u8  q1, q1                         \n"                             \
     "vrshrn.u16 d2, q1, #1                     \n"
@@ -42,12 +51,15 @@
 
 // Read 8 Y, and set 4 U and 4 V to 128
 #define READYUV400                                                             \
+    MEMACCESS(0)                                                               \
     "vld1.8     {d0}, [%0]!                    \n"                             \
     "vmov.u8    d2, #128                       \n"
 
 // Read 8 Y and 4 UV from NV12
 #define READNV12                                                               \
+    MEMACCESS(0)                                                               \
     "vld1.8     {d0}, [%0]!                    \n"                             \
+    MEMACCESS(1)                                                               \
     "vld1.8     {d2}, [%1]!                    \n"                             \
     "vmov.u8    d3, d2                         \n"/* split odd/even uv apart */\
     "vuzp.u8    d2, d3                         \n"                             \
@@ -55,7 +67,9 @@
 
 // Read 8 Y and 4 VU from NV21
 #define READNV21                                                               \
+    MEMACCESS(0)                                                               \
     "vld1.8     {d0}, [%0]!                    \n"                             \
+    MEMACCESS(1)                                                               \
     "vld1.8     {d2}, [%1]!                    \n"                             \
     "vmov.u8    d3, d2                         \n"/* split odd/even uv apart */\
     "vuzp.u8    d3, d2                         \n"                             \
@@ -63,6 +77,7 @@
 
 // Read 8 YUY2
 #define READYUY2                                                               \
+    MEMACCESS(0)                                                               \
     "vld2.8     {d0, d2}, [%0]!                \n"                             \
     "vmov.u8    d3, d2                         \n"                             \
     "vuzp.u8    d2, d3                         \n"                             \
@@ -70,6 +85,7 @@
 
 // Read 8 UYVY
 #define READUYVY                                                               \
+    MEMACCESS(0)                                                               \
     "vld2.8     {d2, d3}, [%0]!                \n"                             \
     "vmov.u8    d0, d3                         \n"                             \
     "vmov.u8    d3, d2                         \n"                             \
@@ -113,7 +129,9 @@
                         uint8* dst_argb,
                         int width) {
   asm volatile (
+    MEMACCESS(5)
     "vld1.8     {d24}, [%5]                    \n"
+    MEMACCESS(6)
     "vld1.8     {d25}, [%6]                    \n"
     "vmov.u8    d26, #128                      \n"
     "vmov.u16   q14, #74                       \n"
@@ -124,6 +142,7 @@
     YUV422TORGB
     "subs       %4, %4, #8                     \n"
     "vmov.u8    d23, #255                      \n"
+    MEMACCESS(3)
     "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
     "bgt        1b                             \n"
     : "+r"(src_y),     // %0
@@ -144,7 +163,9 @@
                         uint8* dst_argb,
                         int width) {
   asm volatile (
+    MEMACCESS(5)
     "vld1.8     {d24}, [%5]                    \n"
+    MEMACCESS(6)
     "vld1.8     {d25}, [%6]                    \n"
     "vmov.u8    d26, #128                      \n"
     "vmov.u16   q14, #74                       \n"
@@ -155,6 +176,7 @@
     YUV422TORGB
     "subs       %4, %4, #8                     \n"
     "vmov.u8    d23, #255                      \n"
+    MEMACCESS(3)
     "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
     "bgt        1b                             \n"
     : "+r"(src_y),     // %0
@@ -175,7 +197,9 @@
                         uint8* dst_argb,
                         int width) {
   asm volatile (
+    MEMACCESS(5)
     "vld1.8     {d24}, [%5]                    \n"
+    MEMACCESS(6)
     "vld1.8     {d25}, [%6]                    \n"
     "vmov.u8    d26, #128                      \n"
     "vmov.u16   q14, #74                       \n"
@@ -186,6 +210,7 @@
     YUV422TORGB
     "subs       %4, %4, #8                     \n"
     "vmov.u8    d23, #255                      \n"
+    MEMACCESS(3)
     "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
     "bgt        1b                             \n"
     : "+r"(src_y),     // %0
@@ -206,7 +231,9 @@
                         uint8* dst_bgra,
                         int width) {
   asm volatile (
+    MEMACCESS(5)
     "vld1.8     {d24}, [%5]                    \n"
+    MEMACCESS(6)
     "vld1.8     {d25}, [%6]                    \n"
     "vmov.u8    d26, #128                      \n"
     "vmov.u16   q14, #74                       \n"
@@ -218,6 +245,7 @@
     "subs       %4, %4, #8                     \n"
     "vswp.u8    d20, d22                       \n"
     "vmov.u8    d19, #255                      \n"
+    MEMACCESS(3)
     "vst4.8     {d19, d20, d21, d22}, [%3]!    \n"
     "bgt        1b                             \n"
     : "+r"(src_y),     // %0
@@ -238,7 +266,9 @@
                         uint8* dst_abgr,
                         int width) {
   asm volatile (
+    MEMACCESS(5)
     "vld1.8     {d24}, [%5]                    \n"
+    MEMACCESS(6)
     "vld1.8     {d25}, [%6]                    \n"
     "vmov.u8    d26, #128                      \n"
     "vmov.u16   q14, #74                       \n"
@@ -250,6 +280,7 @@
     "subs       %4, %4, #8                     \n"
     "vswp.u8    d20, d22                       \n"
     "vmov.u8    d23, #255                      \n"
+    MEMACCESS(3)
     "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
     "bgt        1b                             \n"
     : "+r"(src_y),     // %0
@@ -270,7 +301,9 @@
                         uint8* dst_rgba,
                         int width) {
   asm volatile (
+    MEMACCESS(5)
     "vld1.8     {d24}, [%5]                    \n"
+    MEMACCESS(6)
     "vld1.8     {d25}, [%6]                    \n"
     "vmov.u8    d26, #128                      \n"
     "vmov.u16   q14, #74                       \n"
@@ -281,6 +314,7 @@
     YUV422TORGB
     "subs       %4, %4, #8                     \n"
     "vmov.u8    d19, #255                      \n"
+    MEMACCESS(3)
     "vst4.8     {d19, d20, d21, d22}, [%3]!    \n"
     "bgt        1b                             \n"
     : "+r"(src_y),     // %0
@@ -301,7 +335,9 @@
                          uint8* dst_rgb24,
                          int width) {
   asm volatile (
+    MEMACCESS(5)
     "vld1.8     {d24}, [%5]                    \n"
+    MEMACCESS(6)
     "vld1.8     {d25}, [%6]                    \n"
     "vmov.u8    d26, #128                      \n"
     "vmov.u16   q14, #74                       \n"
@@ -311,6 +347,7 @@
     READYUV422
     YUV422TORGB
     "subs       %4, %4, #8                     \n"
+    MEMACCESS(3)
     "vst3.8     {d20, d21, d22}, [%3]!         \n"
     "bgt        1b                             \n"
     : "+r"(src_y),      // %0
@@ -331,7 +368,9 @@
                        uint8* dst_raw,
                        int width) {
   asm volatile (
+    MEMACCESS(5)
     "vld1.8     {d24}, [%5]                    \n"
+    MEMACCESS(6)
     "vld1.8     {d25}, [%6]                    \n"
     "vmov.u8    d26, #128                      \n"
     "vmov.u16   q14, #74                       \n"
@@ -342,6 +381,7 @@
     YUV422TORGB
     "subs       %4, %4, #8                     \n"
     "vswp.u8    d20, d22                       \n"
+    MEMACCESS(3)
     "vst3.8     {d20, d21, d22}, [%3]!         \n"
     "bgt        1b                             \n"
     : "+r"(src_y),    // %0
@@ -374,7 +414,9 @@
                           uint8* dst_rgb565,
                           int width) {
   asm volatile (
+    MEMACCESS(5)
     "vld1.8     {d24}, [%5]                    \n"
+    MEMACCESS(6)
     "vld1.8     {d25}, [%6]                    \n"
     "vmov.u8    d26, #128                      \n"
     "vmov.u16   q14, #74                       \n"
@@ -385,6 +427,7 @@
     YUV422TORGB
     "subs       %4, %4, #8                     \n"
     ARGBTORGB565
+    MEMACCESS(3)
     "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels RGB565.
     "bgt        1b                             \n"
     : "+r"(src_y),    // %0
@@ -420,7 +463,9 @@
                             uint8* dst_argb1555,
                             int width) {
   asm volatile (
+    MEMACCESS(5)
     "vld1.8     {d24}, [%5]                    \n"
+    MEMACCESS(6)
     "vld1.8     {d25}, [%6]                    \n"
     "vmov.u8    d26, #128                      \n"
     "vmov.u16   q14, #74                       \n"
@@ -432,6 +477,7 @@
     "subs       %4, %4, #8                     \n"
     "vmov.u8    d23, #255                      \n"
     ARGBTOARGB1555
+    MEMACCESS(3)
     "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels ARGB1555.
     "bgt        1b                             \n"
     : "+r"(src_y),    // %0
@@ -461,7 +507,9 @@
                             uint8* dst_argb4444,
                             int width) {
   asm volatile (
+    MEMACCESS(5)
     "vld1.8     {d24}, [%5]                    \n"
+    MEMACCESS(6)
     "vld1.8     {d25}, [%6]                    \n"
     "vmov.u8    d26, #128                      \n"
     "vmov.u16   q14, #74                       \n"
@@ -474,6 +522,7 @@
     "subs       %4, %4, #8                     \n"
     "vmov.u8    d23, #255                      \n"
     ARGBTOARGB4444
+    MEMACCESS(3)
     "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels ARGB4444.
     "bgt        1b                             \n"
     : "+r"(src_y),    // %0
@@ -492,7 +541,9 @@
                      uint8* dst_argb,
                      int width) {
   asm volatile (
+    MEMACCESS(3)
     "vld1.8     {d24}, [%3]                    \n"
+    MEMACCESS(4)
     "vld1.8     {d25}, [%4]                    \n"
     "vmov.u8    d26, #128                      \n"
     "vmov.u16   q14, #74                       \n"
@@ -503,6 +554,7 @@
     YUV422TORGB
     "subs       %2, %2, #8                     \n"
     "vmov.u8    d23, #255                      \n"
+    MEMACCESS(1)
     "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
     "bgt        1b                             \n"
     : "+r"(src_y),     // %0
@@ -522,10 +574,12 @@
     ".p2align   2                              \n"
     "vmov.u8    d23, #255                      \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {d20}, [%0]!                   \n"
     "vmov       d21, d20                       \n"
     "vmov       d22, d20                       \n"
     "subs       %2, %2, #8                     \n"
+    MEMACCESS(1)
     "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
     "bgt        1b                             \n"
     : "+r"(src_y),     // %0
@@ -541,7 +595,9 @@
                         uint8* dst_argb,
                         int width) {
   asm volatile (
+    MEMACCESS(4)
     "vld1.8     {d24}, [%4]                    \n"
+    MEMACCESS(5)
     "vld1.8     {d25}, [%5]                    \n"
     "vmov.u8    d26, #128                      \n"
     "vmov.u16   q14, #74                       \n"
@@ -552,6 +608,7 @@
     YUV422TORGB
     "subs       %3, %3, #8                     \n"
     "vmov.u8    d23, #255                      \n"
+    MEMACCESS(2)
     "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"
     "bgt        1b                             \n"
     : "+r"(src_y),     // %0
@@ -570,7 +627,9 @@
                         uint8* dst_argb,
                         int width) {
   asm volatile (
+    MEMACCESS(4)
     "vld1.8     {d24}, [%4]                    \n"
+    MEMACCESS(5)
     "vld1.8     {d25}, [%5]                    \n"
     "vmov.u8    d26, #128                      \n"
     "vmov.u16   q14, #74                       \n"
@@ -581,6 +640,7 @@
     YUV422TORGB
     "subs       %3, %3, #8                     \n"
     "vmov.u8    d23, #255                      \n"
+    MEMACCESS(2)
     "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"
     "bgt        1b                             \n"
     : "+r"(src_y),     // %0
@@ -599,7 +659,9 @@
                           uint8* dst_rgb565,
                           int width) {
   asm volatile (
+    MEMACCESS(4)
     "vld1.8     {d24}, [%4]                    \n"
+    MEMACCESS(5)
     "vld1.8     {d25}, [%5]                    \n"
     "vmov.u8    d26, #128                      \n"
     "vmov.u16   q14, #74                       \n"
@@ -610,6 +672,7 @@
     YUV422TORGB
     "subs       %3, %3, #8                     \n"
     ARGBTORGB565
+    MEMACCESS(2)
     "vst1.8     {q0}, [%2]!                    \n"  // store 8 pixels RGB565.
     "bgt        1b                             \n"
     : "+r"(src_y),     // %0
@@ -628,7 +691,9 @@
                           uint8* dst_rgb565,
                           int width) {
   asm volatile (
+    MEMACCESS(4)
     "vld1.8     {d24}, [%4]                    \n"
+    MEMACCESS(5)
     "vld1.8     {d25}, [%5]                    \n"
     "vmov.u8    d26, #128                      \n"
     "vmov.u16   q14, #74                       \n"
@@ -639,6 +704,7 @@
     YUV422TORGB
     "subs       %3, %3, #8                     \n"
     ARGBTORGB565
+    MEMACCESS(2)
     "vst1.8     {q0}, [%2]!                    \n"  // store 8 pixels RGB565.
     "bgt        1b                             \n"
     : "+r"(src_y),     // %0
@@ -656,7 +722,9 @@
                         uint8* dst_argb,
                         int width) {
   asm volatile (
+    MEMACCESS(3)
     "vld1.8     {d24}, [%3]                    \n"
+    MEMACCESS(4)
     "vld1.8     {d25}, [%4]                    \n"
     "vmov.u8    d26, #128                      \n"
     "vmov.u16   q14, #74                       \n"
@@ -667,6 +735,7 @@
     YUV422TORGB
     "subs       %2, %2, #8                     \n"
     "vmov.u8    d23, #255                      \n"
+    MEMACCESS(1)
     "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
     "bgt        1b                             \n"
     : "+r"(src_yuy2),  // %0
@@ -683,7 +752,9 @@
                         uint8* dst_argb,
                         int width) {
   asm volatile (
+    MEMACCESS(3)
     "vld1.8     {d24}, [%3]                    \n"
+    MEMACCESS(4)
     "vld1.8     {d25}, [%4]                    \n"
     "vmov.u8    d26, #128                      \n"
     "vmov.u16   q14, #74                       \n"
@@ -694,6 +765,7 @@
     YUV422TORGB
     "subs       %2, %2, #8                     \n"
     "vmov.u8    d23, #255                      \n"
+    MEMACCESS(1)
     "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
     "bgt        1b                             \n"
     : "+r"(src_uyvy),  // %0
@@ -712,9 +784,12 @@
   asm volatile (
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pairs of UV
     "subs       %3, %3, #16                    \n"  // 16 processed per loop
+    MEMACCESS(1)
     "vst1.8     {q0}, [%1]!                    \n"  // store U
+    MEMACCESS(2)
     "vst1.8     {q1}, [%2]!                    \n"  // store V
     "bgt        1b                             \n"
     : "+r"(src_uv),  // %0
@@ -732,9 +807,12 @@
   asm volatile (
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {q0}, [%0]!                    \n"  // load U
+    MEMACCESS(1)
     "vld1.8     {q1}, [%1]!                    \n"  // load V
     "subs       %3, %3, #16                    \n"  // 16 processed per loop
+    MEMACCESS(2)
     "vst2.u8    {q0, q1}, [%2]!                \n"  // store 16 pairs of UV
     "bgt        1b                             \n"
     :
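
The two hunks above touch SplitUVRow_NEON and MergeUVRow_NEON, where vld2.8/vst2.8 perform the (de)interleaving of the packed UV plane. A scalar model of what each loop computes, as a reference sketch rather than libyuv code:

    #include <stdint.h>

    /* Scalar equivalent of SplitUVRow_NEON (sketch): even bytes are U,
     * odd bytes are V. */
    static void SplitUV_C(const uint8_t* src_uv, uint8_t* dst_u,
                          uint8_t* dst_v, int width) {
      for (int i = 0; i < width; ++i) {
        dst_u[i] = src_uv[2 * i + 0];
        dst_v[i] = src_uv[2 * i + 1];
      }
    }

    /* Scalar equivalent of MergeUVRow_NEON (sketch): re-interleave. */
    static void MergeUV_C(const uint8_t* src_u, const uint8_t* src_v,
                          uint8_t* dst_uv, int width) {
      for (int i = 0; i < width; ++i) {
        dst_uv[2 * i + 0] = src_u[i];
        dst_uv[2 * i + 1] = src_v[i];
      }
    }
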
@@ -752,8 +830,10 @@
   asm volatile (
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 32
     "subs       %2, %2, #32                    \n"  // 32 processed per loop
+    MEMACCESS(1)
     "vst1.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 32
     "bgt        1b                             \n"
   : "+r"(src),   // %0
@@ -770,6 +850,7 @@
     "vdup.u32  q0, %2                          \n"  // duplicate 4 ints
     "1:                                        \n"
     "subs      %1, %1, #16                     \n"  // 16 bytes per loop
+    MEMACCESS(0)
     "vst1.8    {q0}, [%0]!                     \n"  // store
     "bgt       1b                              \n"
   : "+r"(dst),   // %0
@@ -798,10 +879,13 @@
 
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
     "subs       %2, #16                        \n"  // 16 pixels per loop.
     "vrev64.8   q0, q0                         \n"
+    MEMACCESS(1)
     "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
+    MEMACCESS(1)
     "vst1.8     {d0}, [%1]!                    \n"
     "bgt        1b                             \n"
   : "+r"(src),   // %0
@@ -822,10 +906,13 @@
 
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld2.8     {d0, d1}, [%0], r12            \n"  // src -= 16
     "subs       %3, #8                         \n"  // 8 pixels per loop.
     "vrev64.8   q0, q0                         \n"
+    MEMACCESS(1)
     "vst1.8     {d0}, [%1]!                    \n"  // dst += 8
+    MEMACCESS(2)
     "vst1.8     {d1}, [%2]!                    \n"
     "bgt        1b                             \n"
   : "+r"(src_uv),  // %0
@@ -846,10 +933,13 @@
 
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
     "subs       %2, #4                         \n"  // 4 pixels per loop.
     "vrev64.32  q0, q0                         \n"
+    MEMACCESS(1)
     "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
+    MEMACCESS(1)
     "vst1.8     {d0}, [%1]!                    \n"
     "bgt        1b                             \n"
   : "+r"(src),   // %0
@@ -865,8 +955,10 @@
     "vmov.u8    d4, #255                       \n"  // Alpha
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RGB24.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    MEMACCESS(1)
     "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
     "bgt        1b                             \n"
   : "+r"(src_rgb24),  // %0
@@ -882,9 +974,11 @@
     "vmov.u8    d4, #255                       \n"  // Alpha
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RAW.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     "vswp.u8    d1, d3                         \n"  // swap R, B
+    MEMACCESS(1)
     "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
     "bgt        1b                             \n"
   : "+r"(src_raw),   // %0
@@ -912,9 +1006,11 @@
     "vmov.u8    d3, #255                       \n"  // Alpha
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     RGB565TOARGB
+    MEMACCESS(1)
     "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
     "bgt        1b                             \n"
   : "+r"(src_rgb565),  // %0
@@ -958,9 +1054,11 @@
     "vmov.u8    d3, #255                       \n"  // Alpha
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     ARGB1555TOARGB
+    MEMACCESS(1)
     "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
     "bgt        1b                             \n"
   : "+r"(src_argb1555),  // %0
@@ -987,9 +1085,11 @@
     "vmov.u8    d3, #255                       \n"  // Alpha
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     ARGB4444TOARGB
+    MEMACCESS(1)
     "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
     "bgt        1b                             \n"
   : "+r"(src_argb4444),  // %0
@@ -1004,8 +1104,10 @@
   asm volatile (
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    MEMACCESS(1)
     "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RGB24.
     "bgt        1b                             \n"
   : "+r"(src_argb),   // %0
@@ -1020,9 +1122,11 @@
   asm volatile (
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     "vswp.u8    d1, d3                         \n"  // swap R, B
+    MEMACCESS(1)
     "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RAW.
     "bgt        1b                             \n"
   : "+r"(src_argb),  // %0
@@ -1037,8 +1141,10 @@
   asm volatile (
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of YUY2.
     "subs       %2, %2, #16                    \n"  // 16 processed per loop.
+    MEMACCESS(1)
     "vst1.8     {q0}, [%1]!                    \n"  // store 16 pixels of Y.
     "bgt        1b                             \n"
   : "+r"(src_yuy2),  // %0
@@ -1053,8 +1159,10 @@
   asm volatile (
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of UYVY.
     "subs       %2, %2, #16                    \n"  // 16 processed per loop.
+    MEMACCESS(1)
     "vst1.8     {q1}, [%1]!                    \n"  // store 16 pixels of Y.
     "bgt        1b                             \n"
   : "+r"(src_uyvy),  // %0
@@ -1070,9 +1178,12 @@
   asm volatile (
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
     "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
+    MEMACCESS(1)
     "vst1.8     {d1}, [%1]!                    \n"  // store 8 U.
+    MEMACCESS(2)
     "vst1.8     {d3}, [%2]!                    \n"  // store 8 V.
     "bgt        1b                             \n"
   : "+r"(src_yuy2),  // %0
@@ -1089,9 +1200,12 @@
   asm volatile (
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
     "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
+    MEMACCESS(1)
     "vst1.8     {d0}, [%1]!                    \n"  // store 8 U.
+    MEMACCESS(2)
     "vst1.8     {d2}, [%2]!                    \n"  // store 8 V.
     "bgt        1b                             \n"
   : "+r"(src_uyvy),  // %0
@@ -1109,12 +1223,16 @@
     "add        %1, %0, %1                     \n"  // stride + src_yuy2
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
     "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
+    MEMACCESS(1)
     "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row YUY2.
     "vrhadd.u8  d1, d1, d5                     \n"  // average rows of U
     "vrhadd.u8  d3, d3, d7                     \n"  // average rows of V
+    MEMACCESS(2)
     "vst1.8     {d1}, [%2]!                    \n"  // store 8 U.
+    MEMACCESS(3)
     "vst1.8     {d3}, [%3]!                    \n"  // store 8 V.
     "bgt        1b                             \n"
   : "+r"(src_yuy2),     // %0
@@ -1133,12 +1251,16 @@
     "add        %1, %0, %1                     \n"  // stride + src_uyvy
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
     "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
+    MEMACCESS(1)
     "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row UYVY.
     "vrhadd.u8  d0, d0, d4                     \n"  // average rows of U
     "vrhadd.u8  d2, d2, d6                     \n"  // average rows of V
+    MEMACCESS(2)
     "vst1.8     {d0}, [%2]!                    \n"  // store 8 U.
+    MEMACCESS(3)
     "vst1.8     {d2}, [%3]!                    \n"  // store 8 V.
     "bgt        1b                             \n"
   : "+r"(src_uyvy),     // %0
@@ -1157,10 +1279,13 @@
     // change the stride to row 2 pointer
     "add        %1, %0                         \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {q0}, [%0]!                    \n"  // load row 1 16 pixels.
     "subs       %3, %3, #16                    \n"  // 16 processed per loop
+    MEMACCESS(1)
     "vld1.8     {q1}, [%1]!                    \n"  // load row 2 16 pixels.
     "vrhadd.u8  q0, q1                         \n"  // average row 1 and 2
+    MEMACCESS(2)
     "vst1.8     {q0}, [%2]!                    \n"
     "bgt        1b                             \n"
   : "+r"(src_uv),         // %0
@@ -1178,11 +1303,13 @@
   asm volatile (
     "vmov.u32   d6[0], %3                      \n"  // selector
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {q0, q1}, [%0]!                \n"  // load row 8 pixels.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop
     "vtbl.8     d4, {d0, d1}, d6               \n"  // look up 4 pixels
     "vtbl.8     d5, {d2, d3}, d6               \n"  // look up 4 pixels
     "vtrn.u32   d4, d5                         \n"  // combine 8 pixels
+    MEMACCESS(1)
     "vst1.8     {d4}, [%1]!                    \n"  // store 8.
     "bgt        1b                             \n"
   : "+r"(src_argb),   // %0
@@ -1198,8 +1325,10 @@
                            uint32 /*selector*/, int pix) {
   asm volatile (
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load row 8 pixels.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop
+    MEMACCESS(1)
     "vst1.8     {d1}, [%1]!                    \n"  // store 8 G's.
     "bgt        1b                             \n"
   : "+r"(src_argb),   // %0
@@ -1214,12 +1343,15 @@
 void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
                          const uint8* shuffler, int pix) {
   asm volatile (
+    MEMACCESS(3)
     "vld1.8     {q2}, [%3]                     \n"  // shuffler
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {q0}, [%0]!                    \n"  // load 4 pixels.
     "subs       %2, %2, #4                     \n"  // 4 processed per loop
     "vtbl.8     d2, {d0, d1}, d4               \n"  // look up 2 first pixels
     "vtbl.8     d3, {d0, d1}, d5               \n"  // look up 2 next pixels
+    MEMACCESS(1)
     "vst1.8     {q1}, [%1]!                    \n"  // store 4.
     "bgt        1b                             \n"
   : "+r"(src_argb),  // %0
@@ -1237,10 +1369,14 @@
   asm volatile (
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld2.8     {d0, d2}, [%0]!                \n"  // load 16 Ys
+    MEMACCESS(1)
     "vld1.8     {d1}, [%1]!                    \n"  // load 8 Us
+    MEMACCESS(2)
     "vld1.8     {d3}, [%2]!                    \n"  // load 8 Vs
     "subs       %4, %4, #16                    \n"  // 16 pixels
+    MEMACCESS(3)
     "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 YUY2/16 pixels.
     "bgt        1b                             \n"
   : "+r"(src_y),     // %0
@@ -1260,10 +1396,14 @@
   asm volatile (
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld2.8     {d1, d3}, [%0]!                \n"  // load 16 Ys
+    MEMACCESS(1)
     "vld1.8     {d0}, [%1]!                    \n"  // load 8 Us
+    MEMACCESS(2)
     "vld1.8     {d2}, [%2]!                    \n"  // load 8 Vs
     "subs       %4, %4, #16                    \n"  // 16 pixels
+    MEMACCESS(3)
     "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 UYVY/16 pixels.
     "bgt        1b                             \n"
   : "+r"(src_y),     // %0
@@ -1280,9 +1420,11 @@
   asm volatile (
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     ARGBTORGB565
+    MEMACCESS(1)
     "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels RGB565.
     "bgt        1b                             \n"
   : "+r"(src_argb),  // %0
@@ -1298,9 +1440,11 @@
   asm volatile (
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     ARGBTOARGB1555
+    MEMACCESS(1)
     "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels ARGB1555.
     "bgt        1b                             \n"
   : "+r"(src_argb),  // %0
@@ -1317,9 +1461,11 @@
     "vmov.u8    d4, #0x0f                      \n"  // bits to clear with vbic.
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     ARGBTOARGB4444
+    MEMACCESS(1)
     "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels ARGB4444.
     "bgt        1b                             \n"
   : "+r"(src_argb),      // %0
@@ -1338,6 +1484,7 @@
     "vmov.u8    d27, #16                       \n"  // Add 16 constant
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     "vmull.u8   q2, d0, d24                    \n"  // B
@@ -1345,6 +1492,7 @@
     "vmlal.u8   q2, d2, d26                    \n"  // R
     "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
     "vqadd.u8   d0, d27                        \n"
+    MEMACCESS(1)
     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
     "bgt        1b                             \n"
   : "+r"(src_argb),  // %0
@@ -1362,6 +1510,7 @@
     "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     "vmull.u8   q2, d0, d24                    \n"  // B
@@ -1368,6 +1517,7 @@
     "vmlal.u8   q2, d1, d25                    \n"  // G
     "vmlal.u8   q2, d2, d26                    \n"  // R
     "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit Y
+    MEMACCESS(1)
     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
     "bgt        1b                             \n"
   : "+r"(src_argb),  // %0
@@ -1390,6 +1540,7 @@
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
     "subs       %3, %3, #8                     \n"  // 8 processed per loop.
     "vmull.u8   q2, d0, d24                    \n"  // B
@@ -1405,7 +1556,9 @@
     "vqshrn.u16  d0, q2, #8                    \n"  // 16 bit to 8 bit U
     "vqshrn.u16  d1, q3, #8                    \n"  // 16 bit to 8 bit V
 
+    MEMACCESS(1)
     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
+    MEMACCESS(2)
     "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
     "bgt        1b                             \n"
   : "+r"(src_argb),  // %0
@@ -1429,7 +1582,9 @@
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(0)
     "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
 
     "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
@@ -1450,7 +1605,9 @@
     "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
     "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
 
+    MEMACCESS(1)
     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
+    MEMACCESS(2)
     "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
     "bgt        1b                             \n"
   : "+r"(src_argb),  // %0
@@ -1475,12 +1632,16 @@
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(0)
     "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
     "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
     "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
     "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(0)
     "vld4.8     {d8, d10, d12, d14}, [%0]!     \n"  // load 8 more ARGB pixels.
+    MEMACCESS(0)
     "vld4.8     {d9, d11, d13, d15}, [%0]!     \n"  // load last 8 ARGB pixels.
     "vpaddl.u8  q4, q4                         \n"  // B 16 bytes -> 8 shorts.
     "vpaddl.u8  q5, q5                         \n"  // G 16 bytes -> 8 shorts.
@@ -1508,7 +1669,9 @@
     "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
     "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
     "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+    MEMACCESS(1)
     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
+    MEMACCESS(2)
     "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
     "bgt        1b                             \n"
   : "+r"(src_argb),  // %0
@@ -1547,12 +1710,16 @@
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(0)
     "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
     "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
     "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
     "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
     "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ARGB pixels.
+    MEMACCESS(1)
     "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ARGB pixels.
     "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
     "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
@@ -1564,7 +1731,9 @@
 
     "subs       %4, %4, #16                    \n"  // 32 processed per loop.
     RGBTOUV(q0, q1, q2)
+    MEMACCESS(2)
     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
     "bgt        1b                             \n"
   : "+r"(src_argb),  // %0
@@ -1591,12 +1760,16 @@
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(0)
     "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
     "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
     "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
     "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
     "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ARGB pixels.
+    MEMACCESS(1)
     "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ARGB pixels.
     "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
     "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
@@ -1608,7 +1781,9 @@
 
     "subs       %4, %4, #16                    \n"  // 32 processed per loop.
     RGBTOUV(q0, q1, q2)
+    MEMACCESS(2)
     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
     "bgt        1b                             \n"
   : "+r"(src_argb),  // %0
@@ -1634,12 +1809,16 @@
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 BGRA pixels.
+    MEMACCESS(0)
     "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 BGRA pixels.
     "vpaddl.u8  q3, q3                         \n"  // B 16 bytes -> 8 shorts.
     "vpaddl.u8  q2, q2                         \n"  // G 16 bytes -> 8 shorts.
     "vpaddl.u8  q1, q1                         \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
     "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more BGRA pixels.
+    MEMACCESS(1)
     "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 BGRA pixels.
     "vpadal.u8  q3, q7                         \n"  // B 16 bytes -> 8 shorts.
     "vpadal.u8  q2, q6                         \n"  // G 16 bytes -> 8 shorts.
@@ -1651,7 +1830,9 @@
 
     "subs       %4, %4, #16                    \n"  // 32 processed per loop.
     RGBTOUV(q3, q2, q1)
+    MEMACCESS(2)
     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
     "bgt        1b                             \n"
   : "+r"(src_bgra),  // %0
@@ -1677,12 +1858,16 @@
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ABGR pixels.
+    MEMACCESS(0)
     "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ABGR pixels.
     "vpaddl.u8  q2, q2                         \n"  // B 16 bytes -> 8 shorts.
     "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
     "vpaddl.u8  q0, q0                         \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
     "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ABGR pixels.
+    MEMACCESS(1)
     "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ABGR pixels.
     "vpadal.u8  q2, q6                         \n"  // B 16 bytes -> 8 shorts.
     "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
@@ -1694,7 +1879,9 @@
 
     "subs       %4, %4, #16                    \n"  // 32 processed per loop.
     RGBTOUV(q2, q1, q0)
+    MEMACCESS(2)
     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
     "bgt        1b                             \n"
   : "+r"(src_abgr),  // %0
@@ -1720,12 +1907,16 @@
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 RGBA pixels.
+    MEMACCESS(0)
     "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 RGBA pixels.
     "vpaddl.u8  q0, q1                         \n"  // B 16 bytes -> 8 shorts.
     "vpaddl.u8  q1, q2                         \n"  // G 16 bytes -> 8 shorts.
     "vpaddl.u8  q2, q3                         \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
     "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more RGBA pixels.
+    MEMACCESS(1)
     "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 RGBA pixels.
     "vpadal.u8  q0, q5                         \n"  // B 16 bytes -> 8 shorts.
     "vpadal.u8  q1, q6                         \n"  // G 16 bytes -> 8 shorts.
@@ -1737,7 +1928,9 @@
 
     "subs       %4, %4, #16                    \n"  // 32 processed per loop.
     RGBTOUV(q0, q1, q2)
+    MEMACCESS(2)
     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
     "bgt        1b                             \n"
   : "+r"(src_rgba),  // %0
@@ -1763,12 +1956,16 @@
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RGB24 pixels.
+    MEMACCESS(0)
     "vld3.8     {d1, d3, d5}, [%0]!            \n"  // load next 8 RGB24 pixels.
     "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
     "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
     "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
     "vld3.8     {d8, d10, d12}, [%1]!          \n"  // load 8 more RGB24 pixels.
+    MEMACCESS(1)
     "vld3.8     {d9, d11, d13}, [%1]!          \n"  // load last 8 RGB24 pixels.
     "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
     "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
@@ -1780,7 +1977,9 @@
 
     "subs       %4, %4, #16                    \n"  // 32 processed per loop.
     RGBTOUV(q0, q1, q2)
+    MEMACCESS(2)
     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
     "bgt        1b                             \n"
   : "+r"(src_rgb24),  // %0
@@ -1806,12 +2005,16 @@
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RAW pixels.
+    MEMACCESS(0)
     "vld3.8     {d1, d3, d5}, [%0]!            \n"  // load next 8 RAW pixels.
     "vpaddl.u8  q2, q2                         \n"  // B 16 bytes -> 8 shorts.
     "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
     "vpaddl.u8  q0, q0                         \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
     "vld3.8     {d8, d10, d12}, [%1]!          \n"  // load 8 more RAW pixels.
+    MEMACCESS(1)
     "vld3.8     {d9, d11, d13}, [%1]!          \n"  // load last 8 RAW pixels.
     "vpadal.u8  q2, q6                         \n"  // B 16 bytes -> 8 shorts.
     "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
@@ -1823,7 +2026,9 @@
 
     "subs       %4, %4, #16                    \n"  // 32 processed per loop.
     RGBTOUV(q2, q1, q0)
+    MEMACCESS(2)
     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
     "bgt        1b                             \n"
   : "+r"(src_raw),  // %0
@@ -1850,11 +2055,13 @@
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
     RGB565TOARGB
     "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
     "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
     "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    MEMACCESS(0)
     "vld1.8     {q0}, [%0]!                    \n"  // next 8 RGB565 pixels.
     RGB565TOARGB
     "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
@@ -1861,11 +2068,13 @@
     "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
     "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
 
+    MEMACCESS(1)
     "vld1.8     {q0}, [%1]!                    \n"  // load 8 RGB565 pixels.
     RGB565TOARGB
     "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
     "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
     "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    MEMACCESS(1)
     "vld1.8     {q0}, [%1]!                    \n"  // next 8 RGB565 pixels.
     RGB565TOARGB
     "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
@@ -1887,7 +2096,9 @@
     "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
     "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
     "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+    MEMACCESS(2)
     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
     "bgt        1b                             \n"
   : "+r"(src_rgb565),  // %0
@@ -1914,11 +2125,13 @@
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
     RGB555TOARGB
     "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
     "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
     "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    MEMACCESS(0)
     "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB1555 pixels.
     RGB555TOARGB
     "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
@@ -1925,11 +2138,13 @@
     "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
     "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
 
+    MEMACCESS(1)
     "vld1.8     {q0}, [%1]!                    \n"  // load 8 ARGB1555 pixels.
     RGB555TOARGB
     "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
     "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
     "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    MEMACCESS(1)
     "vld1.8     {q0}, [%1]!                    \n"  // next 8 ARGB1555 pixels.
     RGB555TOARGB
     "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
@@ -1951,7 +2166,9 @@
     "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
     "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
     "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+    MEMACCESS(2)
     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
     "bgt        1b                             \n"
   : "+r"(src_argb1555),  // %0
@@ -1978,11 +2195,13 @@
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
     ARGB4444TOARGB
     "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
     "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
     "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    MEMACCESS(0)
     "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB4444 pixels.
     ARGB4444TOARGB
     "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
@@ -1989,11 +2208,13 @@
     "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
     "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
 
+    MEMACCESS(1)
     "vld1.8     {q0}, [%1]!                    \n"  // load 8 ARGB4444 pixels.
     ARGB4444TOARGB
     "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
     "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
     "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    MEMACCESS(1)
     "vld1.8     {q0}, [%1]!                    \n"  // next 8 ARGB4444 pixels.
     ARGB4444TOARGB
     "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
@@ -2015,7 +2236,9 @@
     "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
     "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
     "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+    MEMACCESS(2)
     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
     "bgt        1b                             \n"
   : "+r"(src_argb4444),  // %0
@@ -2037,6 +2260,7 @@
     "vmov.u8    d27, #16                       \n"  // Add 16 constant
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     RGB565TOARGB
@@ -2045,6 +2269,7 @@
     "vmlal.u8   q2, d2, d26                    \n"  // R
     "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
     "vqadd.u8   d0, d27                        \n"
+    MEMACCESS(1)
     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
     "bgt        1b                             \n"
   : "+r"(src_rgb565),  // %0
@@ -2063,6 +2288,7 @@
     "vmov.u8    d27, #16                       \n"  // Add 16 constant
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     ARGB1555TOARGB
@@ -2071,6 +2297,7 @@
     "vmlal.u8   q2, d2, d26                    \n"  // R
     "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
     "vqadd.u8   d0, d27                        \n"
+    MEMACCESS(1)
     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
     "bgt        1b                             \n"
   : "+r"(src_argb1555),  // %0
@@ -2089,6 +2316,7 @@
     "vmov.u8    d27, #16                       \n"  // Add 16 constant
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     ARGB4444TOARGB
@@ -2097,6 +2325,7 @@
     "vmlal.u8   q2, d2, d26                    \n"  // R
     "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
     "vqadd.u8   d0, d27                        \n"
+    MEMACCESS(1)
     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
     "bgt        1b                             \n"
   : "+r"(src_argb4444),  // %0
@@ -2115,6 +2344,7 @@
     "vmov.u8    d7, #16                        \n"  // Add 16 constant
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of BGRA.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     "vmull.u8   q8, d1, d4                     \n"  // R
@@ -2122,6 +2352,7 @@
     "vmlal.u8   q8, d3, d6                     \n"  // B
     "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
     "vqadd.u8   d0, d7                         \n"
+    MEMACCESS(1)
     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
     "bgt        1b                             \n"
   : "+r"(src_bgra),  // %0
@@ -2140,6 +2371,7 @@
     "vmov.u8    d7, #16                        \n"  // Add 16 constant
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ABGR.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     "vmull.u8   q8, d0, d4                     \n"  // R
@@ -2147,6 +2379,7 @@
     "vmlal.u8   q8, d2, d6                     \n"  // B
     "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
     "vqadd.u8   d0, d7                         \n"
+    MEMACCESS(1)
     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
     "bgt        1b                             \n"
   : "+r"(src_abgr),  // %0
@@ -2165,6 +2398,7 @@
     "vmov.u8    d7, #16                        \n"  // Add 16 constant
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of RGBA.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     "vmull.u8   q8, d1, d4                     \n"  // B
@@ -2172,6 +2406,7 @@
     "vmlal.u8   q8, d3, d6                     \n"  // R
     "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
     "vqadd.u8   d0, d7                         \n"
+    MEMACCESS(1)
     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
     "bgt        1b                             \n"
   : "+r"(src_rgba),  // %0
@@ -2190,6 +2425,7 @@
     "vmov.u8    d7, #16                        \n"  // Add 16 constant
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RGB24.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     "vmull.u8   q8, d0, d4                     \n"  // B
@@ -2197,6 +2433,7 @@
     "vmlal.u8   q8, d2, d6                     \n"  // R
     "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
     "vqadd.u8   d0, d7                         \n"
+    MEMACCESS(1)
     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
     "bgt        1b                             \n"
   : "+r"(src_rgb24),  // %0
@@ -2215,6 +2452,7 @@
     "vmov.u8    d7, #16                        \n"  // Add 16 constant
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RAW.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     "vmull.u8   q8, d0, d4                     \n"  // B
@@ -2222,6 +2460,7 @@
     "vmlal.u8   q8, d2, d6                     \n"  // R
     "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
     "vqadd.u8   d0, d7                         \n"
+    MEMACCESS(1)
     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
     "bgt        1b                             \n"
   : "+r"(src_raw),  // %0
@@ -2252,7 +2491,9 @@
     "vdup.8     d4, %4                         \n"
     // General purpose row blend.
   "1:                                          \n"
+    MEMACCESS(1)
     "vld1.8     {q0}, [%1]!                    \n"
+    MEMACCESS(2)
     "vld1.8     {q1}, [%2]!                    \n"
     "subs       %3, %3, #16                    \n"
     "vmull.u8   q13, d0, d4                    \n"
@@ -2261,6 +2502,7 @@
     "vmlal.u8   q14, d3, d5                    \n"
     "vrshrn.u16 d0, q13, #8                    \n"
     "vrshrn.u16 d1, q14, #8                    \n"
+    MEMACCESS(0)
     "vst1.8     {q0}, [%0]!                    \n"
     "bgt        1b                             \n"
     "b          99f                            \n"
@@ -2267,11 +2509,14 @@
 
     // Blend 25 / 75.
   "25:                                         \n"
+    MEMACCESS(1)
     "vld1.8     {q0}, [%1]!                    \n"
+    MEMACCESS(2)
     "vld1.8     {q1}, [%2]!                    \n"
     "subs       %3, %3, #16                    \n"
     "vrhadd.u8  q0, q1                         \n"
     "vrhadd.u8  q0, q1                         \n"
+    MEMACCESS(0)
     "vst1.8     {q0}, [%0]!                    \n"
     "bgt        25b                            \n"
     "b          99f                            \n"
@@ -2278,10 +2523,13 @@
 
     // Blend 50 / 50.
   "50:                                         \n"
+    MEMACCESS(1)
     "vld1.8     {q0}, [%1]!                    \n"
+    MEMACCESS(2)
     "vld1.8     {q1}, [%2]!                    \n"
     "subs       %3, %3, #16                    \n"
     "vrhadd.u8  q0, q1                         \n"
+    MEMACCESS(0)
     "vst1.8     {q0}, [%0]!                    \n"
     "bgt        50b                            \n"
     "b          99f                            \n"
@@ -2288,11 +2536,14 @@
 
     // Blend 75 / 25.
   "75:                                         \n"
+    MEMACCESS(1)
     "vld1.8     {q1}, [%1]!                    \n"
+    MEMACCESS(2)
     "vld1.8     {q0}, [%2]!                    \n"
     "subs       %3, %3, #16                    \n"
     "vrhadd.u8  q0, q1                         \n"
     "vrhadd.u8  q0, q1                         \n"
+    MEMACCESS(0)
     "vst1.8     {q0}, [%0]!                    \n"
     "bgt        75b                            \n"
     "b          99f                            \n"
@@ -2299,8 +2550,10 @@
 
     // Blend 100 / 0 - Copy row unchanged.
   "100:                                        \n"
+    MEMACCESS(1)
     "vld1.8     {q0}, [%1]!                    \n"
     "subs       %3, %3, #16                    \n"
+    MEMACCESS(0)
     "vst1.8     {q0}, [%0]!                    \n"
     "bgt        100b                           \n"
 
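The specialized cases of InterpolateRow_NEON above lean on vrhadd.u8, a rounding halving add. One application gives the 50/50 blend; applying it twice, with the second average taken against the dominant row, approximates 25/75 and 75/25. A scalar sketch of the math:

    #include <stdint.h>

    /* vrhadd.u8 equivalent: (a + b + 1) >> 1. */
    static uint8_t RoundHalvingAdd(uint8_t a, uint8_t b) {
      return (uint8_t)((a + b + 1) >> 1);
    }

    /* The "25" case above computes avg(avg(row0, row1), row1),
     * i.e. roughly row0/4 + 3*row1/4; the "75" case swaps the loads. */
    static uint8_t Blend25_75(uint8_t row0, uint8_t row1) {
      return RoundHalvingAdd(RoundHalvingAdd(row0, row1), row1);
    }
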
@@ -2323,7 +2576,9 @@
     "blt        89f                            \n"
     // Blend 8 pixels.
   "8:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB0.
+    MEMACCESS(1)
     "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 pixels of ARGB1.
     "subs       %3, %3, #8                     \n"  // 8 processed per loop.
     "vmull.u8   q10, d4, d3                    \n"  // db * a
@@ -2337,6 +2592,7 @@
     "vqadd.u8   q0, q0, q2                     \n"  // + sbg
     "vqadd.u8   d2, d2, d6                     \n"  // + sr
     "vmov.u8    d3, #255                       \n"  // a = 255
+    MEMACCESS(2)
     "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 pixels of ARGB.
     "bge        8b                             \n"
 
@@ -2346,7 +2602,9 @@
 
     // Blend 1 pixels.
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n"  // load 1 pixel ARGB0.
+    MEMACCESS(1)
     "vld4.8     {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n"  // load 1 pixel ARGB1.
     "subs       %3, %3, #1                     \n"  // 1 processed per loop.
     "vmull.u8   q10, d4, d3                    \n"  // db * a
@@ -2360,6 +2618,7 @@
     "vqadd.u8   q0, q0, q2                     \n"  // + sbg
     "vqadd.u8   d2, d2, d6                     \n"  // + sr
     "vmov.u8    d3, #255                       \n"  // a = 255
+    MEMACCESS(2)
     "vst4.8     {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n"  // store 1 pixel.
     "bge        1b                             \n"
 
@@ -2379,6 +2638,7 @@
   asm volatile (
     // Attenuate 8 pixels.
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     "vmull.u8   q10, d0, d3                    \n"  // b * a
@@ -2387,6 +2647,7 @@
     "vqrshrn.u16 d0, q10, #8                   \n"  // b >>= 8
     "vqrshrn.u16 d1, q11, #8                   \n"  // g >>= 8
     "vqrshrn.u16 d2, q12, #8                   \n"  // r >>= 8
+    MEMACCESS(1)
     "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
     "bgt        1b                             \n"
   : "+r"(src_argb),   // %0
@@ -2410,6 +2671,7 @@
     // 8 pixel loop.
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d2, d4, d6}, [%0]         \n"  // load 8 pixels of ARGB.
     "subs       %1, %1, #8                     \n"  // 8 processed per loop.
     "vmovl.u8   q0, d0                         \n"  // b (0 .. 255)
@@ -2427,6 +2689,7 @@
     "vqmovn.u16 d0, q0                         \n"
     "vqmovn.u16 d2, q1                         \n"
     "vqmovn.u16 d4, q2                         \n"
+    MEMACCESS(0)
     "vst4.8     {d0, d2, d4, d6}, [%0]!        \n"  // store 8 pixels of ARGB.
     "bgt        1b                             \n"
   : "+r"(dst_argb),       // %0
@@ -2451,6 +2714,7 @@
     // 8 pixel loop.
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d20, d22, d24, d26}, [%0]!    \n"  // load 8 pixels of ARGB.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     "vmovl.u8   q10, d20                       \n"  // b (0 .. 255)
@@ -2465,6 +2729,7 @@
     "vqmovn.u16 d22, q11                       \n"
     "vqmovn.u16 d24, q12                       \n"
     "vqmovn.u16 d26, q13                       \n"
+    MEMACCESS(1)
     "vst4.8     {d20, d22, d24, d26}, [%1]!    \n"  // store 8 pixels of ARGB.
     "bgt        1b                             \n"
   : "+r"(src_argb),       // %0
@@ -2485,6 +2750,7 @@
     "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     "vmull.u8   q2, d0, d24                    \n"  // B
@@ -2493,6 +2759,7 @@
     "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit B
     "vmov       d1, d0                         \n"  // G
     "vmov       d2, d0                         \n"  // R
+    MEMACCESS(1)
     "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 ARGB pixels.
     "bgt        1b                             \n"
   : "+r"(src_argb),  // %0
@@ -2520,6 +2787,7 @@
     "vmov.u8    d30, #50                       \n"  // BR coefficient
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d1, d2, d3}, [%0]         \n"  // load 8 ARGB pixels.
     "subs       %1, %1, #8                     \n"  // 8 processed per loop.
     "vmull.u8   q2, d0, d20                    \n"  // B to Sepia B
@@ -2534,6 +2802,7 @@
     "vqshrn.u16 d0, q2, #7                     \n"  // 16 bit to 8 bit B
     "vqshrn.u16 d1, q3, #7                     \n"  // 16 bit to 8 bit G
     "vqshrn.u16 d2, q8, #7                     \n"  // 16 bit to 8 bit R
+    MEMACCESS(0)
     "vst4.8     {d0, d1, d2, d3}, [%0]!        \n"  // store 8 ARGB pixels.
     "bgt        1b                             \n"
   : "+r"(dst_argb),  // %0
@@ -2550,6 +2819,7 @@
 void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
                              const int8* matrix_argb, int width) {
   asm volatile (
+    MEMACCESS(3)
     "vld1.8     {q2}, [%3]                     \n"  // load 3 ARGB vectors.
     "vmovl.s8   q0, d4                         \n"  // B,G coefficients s16.
     "vmovl.s8   q1, d5                         \n"  // R,A coefficients s16.
@@ -2556,6 +2826,7 @@
 
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d16, d18, d20, d22}, [%0]!    \n"  // load 8 ARGB pixels.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     "vmovl.u8   q8, d16                        \n"  // b (0 .. 255) 16 bit
@@ -2594,6 +2865,7 @@
     "vqshrun.s16 d18, q13, #6                  \n"  // 16 bit to 8 bit G
     "vqshrun.s16 d20, q14, #6                  \n"  // 16 bit to 8 bit R
     "vqshrun.s16 d22, q15, #6                  \n"  // 16 bit to 8 bit A
+    MEMACCESS(1)
     "vst4.8     {d16, d18, d20, d22}, [%1]!    \n"  // store 8 ARGB pixels.
     "bgt        1b                             \n"
   : "+r"(src_argb),   // %0
@@ -2614,7 +2886,9 @@
     // 8 pixel loop.
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(1)
     "vld4.8     {d1, d3, d5, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
     "subs       %3, %3, #8                     \n"  // 8 processed per loop.
     "vmull.u8   q0, d0, d1                     \n"  // multiply B
@@ -2625,6 +2899,7 @@
     "vrshrn.u16 d1, q1, #8                     \n"  // 16 bit to 8 bit G
     "vrshrn.u16 d2, q2, #8                     \n"  // 16 bit to 8 bit R
     "vrshrn.u16 d3, q3, #8                     \n"  // 16 bit to 8 bit A
+    MEMACCESS(2)
     "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
     "bgt        1b                             \n"
 
@@ -2645,11 +2920,14 @@
     // 8 pixel loop.
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(1)
     "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
     "subs       %3, %3, #8                     \n"  // 8 processed per loop.
     "vqadd.u8   q0, q0, q2                     \n"  // add B, G
     "vqadd.u8   q1, q1, q3                     \n"  // add R, A
+    MEMACCESS(2)
     "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
     "bgt        1b                             \n"
 
@@ -2669,11 +2947,14 @@
     // 8 pixel loop.
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(1)
     "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
     "subs       %3, %3, #8                     \n"  // 8 processed per loop.
     "vqsub.u8   q0, q0, q2                     \n"  // subtract B, G
     "vqsub.u8   q1, q1, q3                     \n"  // subtract R, A
+    MEMACCESS(2)
     "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
     "bgt        1b                             \n"
 
@@ -2698,12 +2979,15 @@
     // 8 pixel loop.
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {d0}, [%0]!                    \n"  // load 8 sobelx.
+    MEMACCESS(1)
     "vld1.8     {d1}, [%1]!                    \n"  // load 8 sobely.
     "subs       %3, %3, #8                     \n"  // 8 processed per loop.
     "vqadd.u8   d0, d0, d1                     \n"  // add
     "vmov.u8    d1, d0                         \n"
     "vmov.u8    d2, d0                         \n"
+    MEMACCESS(2)
     "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
     "bgt        1b                             \n"
   : "+r"(src_sobelx),  // %0
@@ -2722,10 +3006,13 @@
     // 16 pixel loop.
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {q0}, [%0]!                    \n"  // load 16 sobelx.
+    MEMACCESS(1)
     "vld1.8     {q1}, [%1]!                    \n"  // load 16 sobely.
     "subs       %3, %3, #16                    \n"  // 16 processed per loop.
     "vqadd.u8   q0, q0, q1                     \n"  // add
+    MEMACCESS(2)
     "vst1.8     {q0}, [%2]!                    \n"  // store 16 pixels.
     "bgt        1b                             \n"
   : "+r"(src_sobelx),  // %0
@@ -2749,10 +3036,13 @@
     // 8 pixel loop.
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {d2}, [%0]!                    \n"  // load 8 sobelx.
+    MEMACCESS(1)
     "vld1.8     {d0}, [%1]!                    \n"  // load 8 sobely.
     "subs       %3, %3, #8                     \n"  // 8 processed per loop.
     "vqadd.u8   d1, d0, d2                     \n"  // add
+    MEMACCESS(2)
     "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
     "bgt        1b                             \n"
   : "+r"(src_sobelx),  // %0
@@ -2773,15 +3063,21 @@
   asm volatile (
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {d0}, [%0],%5                  \n"  // top
+    MEMACCESS(0)
     "vld1.8     {d1}, [%0],%6                  \n"
     "vsubl.u8   q0, d0, d1                     \n"
+    MEMACCESS(1)
     "vld1.8     {d2}, [%1],%5                  \n"  // center * 2
+    MEMACCESS(1)
     "vld1.8     {d3}, [%1],%6                  \n"
     "vsubl.u8   q1, d2, d3                     \n"
     "vadd.s16   q0, q0, q1                     \n"
     "vadd.s16   q0, q0, q1                     \n"
+    MEMACCESS(2)
     "vld1.8     {d2}, [%2],%5                  \n"  // bottom
+    MEMACCESS(2)
     "vld1.8     {d3}, [%2],%6                  \n"
     "subs       %4, %4, #8                     \n"  // 8 pixels
     "vsubl.u8   q1, d2, d3                     \n"
@@ -2788,6 +3084,7 @@
     "vadd.s16   q0, q0, q1                     \n"
     "vabs.s16   q0, q0                         \n"
     "vqmovn.u16 d0, q0                         \n"
+    MEMACCESS(3)
     "vst1.8     {d0}, [%3]!                    \n"  // store 8 sobelx
     "bgt        1b                             \n"
   : "+r"(src_y0),      // %0
@@ -2810,15 +3107,21 @@
   asm volatile (
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {d0}, [%0],%4                  \n"  // left
+    MEMACCESS(1)
     "vld1.8     {d1}, [%1],%4                  \n"
     "vsubl.u8   q0, d0, d1                     \n"
+    MEMACCESS(0)
     "vld1.8     {d2}, [%0],%4                  \n"  // center * 2
+    MEMACCESS(1)
     "vld1.8     {d3}, [%1],%4                  \n"
     "vsubl.u8   q1, d2, d3                     \n"
     "vadd.s16   q0, q0, q1                     \n"
     "vadd.s16   q0, q0, q1                     \n"
+    MEMACCESS(0)
     "vld1.8     {d2}, [%0],%5                  \n"  // right
+    MEMACCESS(1)
     "vld1.8     {d3}, [%1],%5                  \n"
     "subs       %3, %3, #8                     \n"  // 8 pixels
     "vsubl.u8   q1, d2, d3                     \n"
@@ -2825,6 +3128,7 @@
     "vadd.s16   q0, q0, q1                     \n"
     "vabs.s16   q0, q0                         \n"
     "vqmovn.u16 d0, q0                         \n"
+    MEMACCESS(2)
     "vst1.8     {d0}, [%2]!                    \n"  // store 8 sobely
     "bgt        1b                             \n"
   : "+r"(src_y0),      // %0
--- /dev/null
+++ b/third_party/libyuv/source/row_neon64.cc
@@ -1,0 +1,3323 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon on 64 bit ARM (aarch64).
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+// Read 8 Y, 4 U and 4 V from 422
+#define READYUV422                                                             \
+    MEMACCESS(0)                                                               \
+    "vld1.8     {d0}, [%0]!                    \n"                             \
+    MEMACCESS(1)                                                               \
+    "vld1.32    {d2[0]}, [%1]!                 \n"                             \
+    MEMACCESS(2)                                                               \
+    "vld1.32    {d2[1]}, [%2]!                 \n"
+
+// Read 8 Y, 2 U and 2 V from 411
+#define READYUV411                                                             \
+    MEMACCESS(0)                                                               \
+    "vld1.8     {d0}, [%0]!                    \n"                             \
+    MEMACCESS(1)                                                               \
+    "vld1.16    {d2[0]}, [%1]!                 \n"                             \
+    MEMACCESS(2)                                                               \
+    "vld1.16    {d2[1]}, [%2]!                 \n"                             \
+    "vmov.u8    d3, d2                         \n"                             \
+    "vzip.u8    d2, d3                         \n"
+
+// Read 8 Y, 8 U and 8 V from 444
+#define READYUV444                                                             \
+    MEMACCESS(0)                                                               \
+    "vld1.8     {d0}, [%0]!                    \n"                             \
+    MEMACCESS(1)                                                               \
+    "vld1.8     {d2}, [%1]!                    \n"                             \
+    MEMACCESS(2)                                                               \
+    "vld1.8     {d3}, [%2]!                    \n"                             \
+    "vpaddl.u8  q1, q1                         \n"                             \
+    "vrshrn.u16 d2, q1, #1                     \n"
+
+// Read 8 Y, and set 4 U and 4 V to 128
+#define READYUV400                                                             \
+    MEMACCESS(0)                                                               \
+    "vld1.8     {d0}, [%0]!                    \n"                             \
+    "vmov.u8    d2, #128                       \n"
+
+// Read 8 Y and 4 UV from NV12
+#define READNV12                                                               \
+    MEMACCESS(0)                                                               \
+    "vld1.8     {d0}, [%0]!                    \n"                             \
+    MEMACCESS(1)                                                               \
+    "vld1.8     {d2}, [%1]!                    \n"                             \
+    "vmov.u8    d3, d2                         \n"/* split odd/even uv apart */\
+    "vuzp.u8    d2, d3                         \n"                             \
+    "vtrn.u32   d2, d3                         \n"
+
+// Read 8 Y and 4 VU from NV21
+#define READNV21                                                               \
+    MEMACCESS(0)                                                               \
+    "vld1.8     {d0}, [%0]!                    \n"                             \
+    MEMACCESS(1)                                                               \
+    "vld1.8     {d2}, [%1]!                    \n"                             \
+    "vmov.u8    d3, d2                         \n"/* split odd/even uv apart */\
+    "vuzp.u8    d3, d2                         \n"                             \
+    "vtrn.u32   d2, d3                         \n"
+
+// Read 8 YUY2
+#define READYUY2                                                               \
+    MEMACCESS(0)                                                               \
+    "vld2.8     {d0, d2}, [%0]!                \n"                             \
+    "vmov.u8    d3, d2                         \n"                             \
+    "vuzp.u8    d2, d3                         \n"                             \
+    "vtrn.u32   d2, d3                         \n"
+
+// Read 8 UYVY
+#define READUYVY                                                               \
+    MEMACCESS(0)                                                               \
+    "vld2.8     {d2, d3}, [%0]!                \n"                             \
+    "vmov.u8    d0, d3                         \n"                             \
+    "vmov.u8    d3, d2                         \n"                             \
+    "vuzp.u8    d2, d3                         \n"                             \
+    "vtrn.u32   d2, d3                         \n"
+
+#define YUV422TORGB                                                            \
+    "veor.u8    d2, d26                        \n"/*subtract 128 from u and v*/\
+    "vmull.s8   q8, d2, d24                    \n"/*  u/v B/R component      */\
+    "vmull.s8   q9, d2, d25                    \n"/*  u/v G component        */\
+    "vmov.u8    d1, #0                         \n"/*  split odd/even y apart */\
+    "vtrn.u8    d0, d1                         \n"                             \
+    "vsub.s16   q0, q0, q15                    \n"/*  offset y               */\
+    "vmul.s16   q0, q0, q14                    \n"                             \
+    "vadd.s16   d18, d19                       \n"                             \
+    "vqadd.s16  d20, d0, d16                   \n" /* B */                     \
+    "vqadd.s16  d21, d1, d16                   \n"                             \
+    "vqadd.s16  d22, d0, d17                   \n" /* R */                     \
+    "vqadd.s16  d23, d1, d17                   \n"                             \
+    "vqadd.s16  d16, d0, d18                   \n" /* G */                     \
+    "vqadd.s16  d17, d1, d18                   \n"                             \
+    "vqshrun.s16 d0, q10, #6                   \n" /* B */                     \
+    "vqshrun.s16 d1, q11, #6                   \n" /* G */                     \
+    "vqshrun.s16 d2, q8, #6                    \n" /* R */                     \
+    "vmovl.u8   q10, d0                        \n"/*  set up for reinterleave*/\
+    "vmovl.u8   q11, d1                        \n"                             \
+    "vmovl.u8   q8, d2                         \n"                             \
+    "vtrn.u8    d20, d21                       \n"                             \
+    "vtrn.u8    d22, d23                       \n"                             \
+    "vtrn.u8    d16, d17                       \n"                             \
+    "vmov.u8    d21, d16                       \n"
+
+static vec8 kUVToRB  = { 127, 127, 127, 127, 102, 102, 102, 102,
+                         0, 0, 0, 0, 0, 0, 0, 0 };
+static vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52,
+                       0, 0, 0, 0, 0, 0, 0, 0 };
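
These tables, together with the #74 scale and #16 offset loaded into q14/q15 in each converter below, encode approximately BT.601 YUV-to-RGB coefficients in 6 bit fixed point; YUV422TORGB narrows back to bytes with a saturating right shift by 6 (vqshrun.s16 #6). Per pixel the arithmetic amounts to the following sketch (Clamp255 as in the Sobel sketch above stands in for the saturating narrow):

    // One pixel of YUV422TORGB in scalar form (sketch; y, u, v are the
    // unsigned 8 bit samples for this pixel).
    int y1 = (y - 16) * 74;   // q15 holds the #16 offset, q14 the #74 scale
    int u1 = u - 128;         // the veor with d26 (#128) recenters chroma
    int v1 = v - 128;
    uint8 b = Clamp255((y1 + 127 * u1) >> 6);           // kUVToRB[0..3]
    uint8 g = Clamp255((y1 - 25 * u1 - 52 * v1) >> 6);  // kUVToG
    uint8 r = Clamp255((y1 + 102 * v1) >> 6);           // kUVToRB[4..7]
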
+
+#ifdef HAS_I444TOARGBROW_NEON
+void I444ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        int width) {
+  asm volatile (
+    MEMACCESS(5)
+    "vld1.8     {d24}, [%5]                    \n"
+    MEMACCESS(6)
+    "vld1.8     {d25}, [%6]                    \n"
+    "vmov.u8    d26, #128                      \n"
+    "vmov.u16   q14, #74                       \n"
+    "vmov.u16   q15, #16                       \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READYUV444
+    YUV422TORGB
+    "subs       %4, %4, #8                     \n"
+    "vmov.u8    d23, #255                      \n"
+    MEMACCESS(3)
+    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
+    "bgt        1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_argb),  // %3
+      "+r"(width)      // %4
+    : "r"(&kUVToRB),   // %5
+      "r"(&kUVToG)     // %6
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+#endif  // HAS_I444TOARGBROW_NEON
+
+#ifdef HAS_I422TOARGBROW_NEON
+void I422ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        int width) {
+  asm volatile (
+    MEMACCESS(5)
+    "vld1.8     {d24}, [%5]                    \n"
+    MEMACCESS(6)
+    "vld1.8     {d25}, [%6]                    \n"
+    "vmov.u8    d26, #128                      \n"
+    "vmov.u16   q14, #74                       \n"
+    "vmov.u16   q15, #16                       \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READYUV422
+    YUV422TORGB
+    "subs       %4, %4, #8                     \n"
+    "vmov.u8    d23, #255                      \n"
+    MEMACCESS(3)
+    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
+    "bgt        1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_argb),  // %3
+      "+r"(width)      // %4
+    : "r"(&kUVToRB),   // %5
+      "r"(&kUVToG)     // %6
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+#endif  // HAS_I422TOARGBROW_NEON
+
+#ifdef HAS_I411TOARGBROW_NEON
+void I411ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        int width) {
+  asm volatile (
+    MEMACCESS(5)
+    "vld1.8     {d24}, [%5]                    \n"
+    MEMACCESS(6)
+    "vld1.8     {d25}, [%6]                    \n"
+    "vmov.u8    d26, #128                      \n"
+    "vmov.u16   q14, #74                       \n"
+    "vmov.u16   q15, #16                       \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READYUV411
+    YUV422TORGB
+    "subs       %4, %4, #8                     \n"
+    "vmov.u8    d23, #255                      \n"
+    MEMACCESS(3)
+    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
+    "bgt        1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_argb),  // %3
+      "+r"(width)      // %4
+    : "r"(&kUVToRB),   // %5
+      "r"(&kUVToG)     // %6
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+#endif  // HAS_I411TOARGBROW_NEON
+
+#ifdef HAS_I422TOBGRAROW_NEON
+void I422ToBGRARow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_bgra,
+                        int width) {
+  asm volatile (
+    MEMACCESS(5)
+    "vld1.8     {d24}, [%5]                    \n"
+    MEMACCESS(6)
+    "vld1.8     {d25}, [%6]                    \n"
+    "vmov.u8    d26, #128                      \n"
+    "vmov.u16   q14, #74                       \n"
+    "vmov.u16   q15, #16                       \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READYUV422
+    YUV422TORGB
+    "subs       %4, %4, #8                     \n"
+    "vswp.u8    d20, d22                       \n"
+    "vmov.u8    d19, #255                      \n"
+    MEMACCESS(3)
+    "vst4.8     {d19, d20, d21, d22}, [%3]!    \n"
+    "bgt        1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_bgra),  // %3
+      "+r"(width)      // %4
+    : "r"(&kUVToRB),   // %5
+      "r"(&kUVToG)     // %6
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+#endif  // HAS_I422TOBGRAROW_NEON
+
+#ifdef HAS_I422TOABGRROW_NEON
+void I422ToABGRRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_abgr,
+                        int width) {
+  asm volatile (
+    MEMACCESS(5)
+    "vld1.8     {d24}, [%5]                    \n"
+    MEMACCESS(6)
+    "vld1.8     {d25}, [%6]                    \n"
+    "vmov.u8    d26, #128                      \n"
+    "vmov.u16   q14, #74                       \n"
+    "vmov.u16   q15, #16                       \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READYUV422
+    YUV422TORGB
+    "subs       %4, %4, #8                     \n"
+    "vswp.u8    d20, d22                       \n"
+    "vmov.u8    d23, #255                      \n"
+    MEMACCESS(3)
+    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
+    "bgt        1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_abgr),  // %3
+      "+r"(width)      // %4
+    : "r"(&kUVToRB),   // %5
+      "r"(&kUVToG)     // %6
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+#endif  // HAS_I422TOABGRROW_NEON
+
+#ifdef HAS_I422TORGBAROW_NEON
+void I422ToRGBARow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_rgba,
+                        int width) {
+  asm volatile (
+    MEMACCESS(5)
+    "vld1.8     {d24}, [%5]                    \n"
+    MEMACCESS(6)
+    "vld1.8     {d25}, [%6]                    \n"
+    "vmov.u8    d26, #128                      \n"
+    "vmov.u16   q14, #74                       \n"
+    "vmov.u16   q15, #16                       \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READYUV422
+    YUV422TORGB
+    "subs       %4, %4, #8                     \n"
+    "vmov.u8    d19, #255                      \n"
+    MEMACCESS(3)
+    "vst4.8     {d19, d20, d21, d22}, [%3]!    \n"
+    "bgt        1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_rgba),  // %3
+      "+r"(width)      // %4
+    : "r"(&kUVToRB),   // %5
+      "r"(&kUVToG)     // %6
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+#endif  // HAS_I422TORGBAROW_NEON
+
+#ifdef HAS_I422TORGB24ROW_NEON
+void I422ToRGB24Row_NEON(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_rgb24,
+                         int width) {
+  asm volatile (
+    MEMACCESS(5)
+    "vld1.8     {d24}, [%5]                    \n"
+    MEMACCESS(6)
+    "vld1.8     {d25}, [%6]                    \n"
+    "vmov.u8    d26, #128                      \n"
+    "vmov.u16   q14, #74                       \n"
+    "vmov.u16   q15, #16                       \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READYUV422
+    YUV422TORGB
+    "subs       %4, %4, #8                     \n"
+    MEMACCESS(3)
+    "vst3.8     {d20, d21, d22}, [%3]!         \n"
+    "bgt        1b                             \n"
+    : "+r"(src_y),      // %0
+      "+r"(src_u),      // %1
+      "+r"(src_v),      // %2
+      "+r"(dst_rgb24),  // %3
+      "+r"(width)       // %4
+    : "r"(&kUVToRB),    // %5
+      "r"(&kUVToG)      // %6
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+#endif  // HAS_I422TORGB24ROW_NEON
+
+#ifdef HAS_I422TORAWROW_NEON
+void I422ToRAWRow_NEON(const uint8* src_y,
+                       const uint8* src_u,
+                       const uint8* src_v,
+                       uint8* dst_raw,
+                       int width) {
+  asm volatile (
+    MEMACCESS(5)
+    "vld1.8     {d24}, [%5]                    \n"
+    MEMACCESS(6)
+    "vld1.8     {d25}, [%6]                    \n"
+    "vmov.u8    d26, #128                      \n"
+    "vmov.u16   q14, #74                       \n"
+    "vmov.u16   q15, #16                       \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READYUV422
+    YUV422TORGB
+    "subs       %4, %4, #8                     \n"
+    "vswp.u8    d20, d22                       \n"
+    MEMACCESS(3)
+    "vst3.8     {d20, d21, d22}, [%3]!         \n"
+    "bgt        1b                             \n"
+    : "+r"(src_y),    // %0
+      "+r"(src_u),    // %1
+      "+r"(src_v),    // %2
+      "+r"(dst_raw),  // %3
+      "+r"(width)     // %4
+    : "r"(&kUVToRB),  // %5
+      "r"(&kUVToG)    // %6
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+#endif  // HAS_I422TORAWROW_NEON
+
+#define ARGBTORGB565                                                           \
+    "vshr.u8    d20, d20, #3                   \n"  /* B                    */ \
+    "vshr.u8    d21, d21, #2                   \n"  /* G                    */ \
+    "vshr.u8    d22, d22, #3                   \n"  /* R                    */ \
+    "vmovl.u8   q8, d20                        \n"  /* B                    */ \
+    "vmovl.u8   q9, d21                        \n"  /* G                    */ \
+    "vmovl.u8   q10, d22                       \n"  /* R                    */ \
+    "vshl.u16   q9, q9, #5                     \n"  /* G                    */ \
+    "vshl.u16   q10, q10, #11                  \n"  /* R                    */ \
+    "vorr       q0, q8, q9                     \n"  /* BG                   */ \
+    "vorr       q0, q0, q10                    \n"  /* BGR                  */
+
+#ifdef HAS_I422TORGB565ROW_NEON
+void I422ToRGB565Row_NEON(const uint8* src_y,
+                          const uint8* src_u,
+                          const uint8* src_v,
+                          uint8* dst_rgb565,
+                          int width) {
+  asm volatile (
+    MEMACCESS(5)
+    "vld1.8     {d24}, [%5]                    \n"
+    MEMACCESS(6)
+    "vld1.8     {d25}, [%6]                    \n"
+    "vmov.u8    d26, #128                      \n"
+    "vmov.u16   q14, #74                       \n"
+    "vmov.u16   q15, #16                       \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READYUV422
+    YUV422TORGB
+    "subs       %4, %4, #8                     \n"
+    ARGBTORGB565
+    MEMACCESS(3)
+    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels RGB565.
+    "bgt        1b                             \n"
+    : "+r"(src_y),    // %0
+      "+r"(src_u),    // %1
+      "+r"(src_v),    // %2
+      "+r"(dst_rgb565),  // %3
+      "+r"(width)     // %4
+    : "r"(&kUVToRB),  // %5
+      "r"(&kUVToG)    // %6
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+#endif  // HAS_I422TORGB565ROW_NEON
+
+#define ARGBTOARGB1555                                                         \
+    "vshr.u8    q10, q10, #3                   \n"  /* B                    */ \
+    "vshr.u8    d22, d22, #3                   \n"  /* R                    */ \
+    "vshr.u8    d23, d23, #7                   \n"  /* A                    */ \
+    "vmovl.u8   q8, d20                        \n"  /* B                    */ \
+    "vmovl.u8   q9, d21                        \n"  /* G                    */ \
+    "vmovl.u8   q10, d22                       \n"  /* R                    */ \
+    "vmovl.u8   q11, d23                       \n"  /* A                    */ \
+    "vshl.u16   q9, q9, #5                     \n"  /* G                    */ \
+    "vshl.u16   q10, q10, #10                  \n"  /* R                    */ \
+    "vshl.u16   q11, q11, #15                  \n"  /* A                    */ \
+    "vorr       q0, q8, q9                     \n"  /* BG                   */ \
+    "vorr       q1, q10, q11                   \n"  /* RA                   */ \
+    "vorr       q0, q0, q1                     \n"  /* BGRA                 */
+
+#ifdef HAS_I422TOARGB1555ROW_NEON
+void I422ToARGB1555Row_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb1555,
+                            int width) {
+  asm volatile (
+    MEMACCESS(5)
+    "vld1.8     {d24}, [%5]                    \n"
+    MEMACCESS(6)
+    "vld1.8     {d25}, [%6]                    \n"
+    "vmov.u8    d26, #128                      \n"
+    "vmov.u16   q14, #74                       \n"
+    "vmov.u16   q15, #16                       \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READYUV422
+    YUV422TORGB
+    "subs       %4, %4, #8                     \n"
+    "vmov.u8    d23, #255                      \n"
+    ARGBTOARGB1555
+    MEMACCESS(3)
+    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels ARGB1555.
+    "bgt        1b                             \n"
+    : "+r"(src_y),    // %0
+      "+r"(src_u),    // %1
+      "+r"(src_v),    // %2
+      "+r"(dst_argb1555),  // %3
+      "+r"(width)     // %4
+    : "r"(&kUVToRB),  // %5
+      "r"(&kUVToG)    // %6
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+#endif  // HAS_I422TOARGB1555ROW_NEON
+
+#define ARGBTOARGB4444                                                         \
+    "vshr.u8    d20, d20, #4                   \n"  /* B                    */ \
+    "vbic.32    d21, d21, d4                   \n"  /* G                    */ \
+    "vshr.u8    d22, d22, #4                   \n"  /* R                    */ \
+    "vbic.32    d23, d23, d4                   \n"  /* A                    */ \
+    "vorr       d0, d20, d21                   \n"  /* BG                   */ \
+    "vorr       d1, d22, d23                   \n"  /* RA                   */ \
+    "vzip.u8    d0, d1                         \n"  /* BGRA                 */
+
+#ifdef HAS_I422TOARGB4444ROW_NEON
+void I422ToARGB4444Row_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb4444,
+                            int width) {
+  asm volatile (
+    MEMACCESS(5)
+    "vld1.8     {d24}, [%5]                    \n"
+    MEMACCESS(6)
+    "vld1.8     {d25}, [%6]                    \n"
+    "vmov.u8    d26, #128                      \n"
+    "vmov.u16   q14, #74                       \n"
+    "vmov.u16   q15, #16                       \n"
+    "vmov.u8    d4, #0x0f                      \n"  // bits to clear with vbic.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READYUV422
+    YUV422TORGB
+    "subs       %4, %4, #8                     \n"
+    "vmov.u8    d23, #255                      \n"
+    ARGBTOARGB4444
+    MEMACCESS(3)
+    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels ARGB4444.
+    "bgt        1b                             \n"
+    : "+r"(src_y),    // %0
+      "+r"(src_u),    // %1
+      "+r"(src_v),    // %2
+      "+r"(dst_argb4444),  // %3
+      "+r"(width)     // %4
+    : "r"(&kUVToRB),  // %5
+      "r"(&kUVToG)    // %6
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+#endif  // HAS_I422TOARGB4444ROW_NEON
+
+#ifdef HAS_YTOARGBROW_NEON
+void YToARGBRow_NEON(const uint8* src_y,
+                     uint8* dst_argb,
+                     int width) {
+  asm volatile (
+    MEMACCESS(3)
+    "vld1.8     {d24}, [%3]                    \n"
+    MEMACCESS(4)
+    "vld1.8     {d25}, [%4]                    \n"
+    "vmov.u8    d26, #128                      \n"
+    "vmov.u16   q14, #74                       \n"
+    "vmov.u16   q15, #16                       \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READYUV400
+    YUV422TORGB
+    "subs       %2, %2, #8                     \n"
+    "vmov.u8    d23, #255                      \n"
+    MEMACCESS(1)
+    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
+    "bgt        1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(dst_argb),  // %1
+      "+r"(width)      // %2
+    : "r"(&kUVToRB),   // %3
+      "r"(&kUVToG)     // %4
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+#endif  // HAS_YTOARGBROW_NEON
+
+#ifdef HAS_I400TOARGBROW_NEON
+void I400ToARGBRow_NEON(const uint8* src_y,
+                        uint8* dst_argb,
+                        int width) {
+  asm volatile (
+    ".p2align   2                              \n"
+    "vmov.u8    d23, #255                      \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {d20}, [%0]!                   \n"
+    "vmov       d21, d20                       \n"
+    "vmov       d22, d20                       \n"
+    "subs       %2, %2, #8                     \n"
+    MEMACCESS(1)
+    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
+    "bgt        1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(dst_argb),  // %1
+      "+r"(width)      // %2
+    :
+    : "cc", "memory", "d20", "d21", "d22", "d23"
+  );
+}
+#endif  // HAS_I400TOARGBROW_NEON
+
+#ifdef HAS_NV12TOARGBROW_NEON
+void NV12ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_uv,
+                        uint8* dst_argb,
+                        int width) {
+  asm volatile (
+    MEMACCESS(4)
+    "vld1.8     {d24}, [%4]                    \n"
+    MEMACCESS(5)
+    "vld1.8     {d25}, [%5]                    \n"
+    "vmov.u8    d26, #128                      \n"
+    "vmov.u16   q14, #74                       \n"
+    "vmov.u16   q15, #16                       \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READNV12
+    YUV422TORGB
+    "subs       %3, %3, #8                     \n"
+    "vmov.u8    d23, #255                      \n"
+    MEMACCESS(2)
+    "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"
+    "bgt        1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_uv),    // %1
+      "+r"(dst_argb),  // %2
+      "+r"(width)      // %3
+    : "r"(&kUVToRB),   // %4
+      "r"(&kUVToG)     // %5
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+#endif  // HAS_NV12TOARGBROW_NEON
+
+#ifdef HAS_NV21TOARGBROW_NEON
+void NV21ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_uv,
+                        uint8* dst_argb,
+                        int width) {
+  asm volatile (
+    MEMACCESS(4)
+    "vld1.8     {d24}, [%4]                    \n"
+    MEMACCESS(5)
+    "vld1.8     {d25}, [%5]                    \n"
+    "vmov.u8    d26, #128                      \n"
+    "vmov.u16   q14, #74                       \n"
+    "vmov.u16   q15, #16                       \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READNV21
+    YUV422TORGB
+    "subs       %3, %3, #8                     \n"
+    "vmov.u8    d23, #255                      \n"
+    MEMACCESS(2)
+    "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"
+    "bgt        1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_uv),    // %1
+      "+r"(dst_argb),  // %2
+      "+r"(width)      // %3
+    : "r"(&kUVToRB),   // %4
+      "r"(&kUVToG)     // %5
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+#endif  // HAS_NV21TOARGBROW_NEON
+
+#ifdef HAS_NV12TORGB565ROW_NEON
+void NV12ToRGB565Row_NEON(const uint8* src_y,
+                          const uint8* src_uv,
+                          uint8* dst_rgb565,
+                          int width) {
+  asm volatile (
+    MEMACCESS(4)
+    "vld1.8     {d24}, [%4]                    \n"
+    MEMACCESS(5)
+    "vld1.8     {d25}, [%5]                    \n"
+    "vmov.u8    d26, #128                      \n"
+    "vmov.u16   q14, #74                       \n"
+    "vmov.u16   q15, #16                       \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READNV12
+    YUV422TORGB
+    "subs       %3, %3, #8                     \n"
+    ARGBTORGB565
+    MEMACCESS(2)
+    "vst1.8     {q0}, [%2]!                    \n"  // store 8 pixels RGB565.
+    "bgt        1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_uv),    // %1
+      "+r"(dst_rgb565),  // %2
+      "+r"(width)      // %3
+    : "r"(&kUVToRB),   // %4
+      "r"(&kUVToG)     // %5
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+#endif  // HAS_NV12TORGB565ROW_NEON
+
+#ifdef HAS_NV21TORGB565ROW_NEON
+void NV21ToRGB565Row_NEON(const uint8* src_y,
+                          const uint8* src_uv,
+                          uint8* dst_rgb565,
+                          int width) {
+  asm volatile (
+    MEMACCESS(4)
+    "vld1.8     {d24}, [%4]                    \n"
+    MEMACCESS(5)
+    "vld1.8     {d25}, [%5]                    \n"
+    "vmov.u8    d26, #128                      \n"
+    "vmov.u16   q14, #74                       \n"
+    "vmov.u16   q15, #16                       \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READNV21
+    YUV422TORGB
+    "subs       %3, %3, #8                     \n"
+    ARGBTORGB565
+    MEMACCESS(2)
+    "vst1.8     {q0}, [%2]!                    \n"  // store 8 pixels RGB565.
+    "bgt        1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_uv),    // %1
+      "+r"(dst_rgb565),  // %2
+      "+r"(width)      // %3
+    : "r"(&kUVToRB),   // %4
+      "r"(&kUVToG)     // %5
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+#endif  // HAS_NV21TORGB565ROW_NEON
+
+#ifdef HAS_YUY2TOARGBROW_NEON
+void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
+                        uint8* dst_argb,
+                        int width) {
+  asm volatile (
+    MEMACCESS(3)
+    "vld1.8     {d24}, [%3]                    \n"
+    MEMACCESS(4)
+    "vld1.8     {d25}, [%4]                    \n"
+    "vmov.u8    d26, #128                      \n"
+    "vmov.u16   q14, #74                       \n"
+    "vmov.u16   q15, #16                       \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READYUY2
+    YUV422TORGB
+    "subs       %2, %2, #8                     \n"
+    "vmov.u8    d23, #255                      \n"
+    MEMACCESS(1)
+    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
+    "bgt        1b                             \n"
+    : "+r"(src_yuy2),  // %0
+      "+r"(dst_argb),  // %1
+      "+r"(width)      // %2
+    : "r"(&kUVToRB),   // %3
+      "r"(&kUVToG)     // %4
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+#endif  // HAS_YUY2TOARGBROW_NEON
+
+#ifdef HAS_UYVYTOARGBROW_NEON
+void UYVYToARGBRow_NEON(const uint8* src_uyvy,
+                        uint8* dst_argb,
+                        int width) {
+  asm volatile (
+    MEMACCESS(3)
+    "vld1.8     {d24}, [%3]                    \n"
+    MEMACCESS(4)
+    "vld1.8     {d25}, [%4]                    \n"
+    "vmov.u8    d26, #128                      \n"
+    "vmov.u16   q14, #74                       \n"
+    "vmov.u16   q15, #16                       \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READUYVY
+    YUV422TORGB
+    "subs       %2, %2, #8                     \n"
+    "vmov.u8    d23, #255                      \n"
+    MEMACCESS(1)
+    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
+    "bgt        1b                             \n"
+    : "+r"(src_uyvy),  // %0
+      "+r"(dst_argb),  // %1
+      "+r"(width)      // %2
+    : "r"(&kUVToRB),   // %3
+      "r"(&kUVToG)     // %4
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+#endif  // HAS_UYVYTOARGBROW_NEON
+
+// Reads 16 pairs of UV and writes even values to dst_u and odd to dst_v.
+#ifdef HAS_SPLITUVROW_NEON
+void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                     int width) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pairs of UV
+    "subs       %3, %3, #16                    \n"  // 16 processed per loop
+    MEMACCESS(1)
+    "vst1.8     {q0}, [%1]!                    \n"  // store U
+    MEMACCESS(2)
+    "vst1.8     {q1}, [%2]!                    \n"  // store V
+    "bgt        1b                             \n"
+    : "+r"(src_uv),  // %0
+      "+r"(dst_u),   // %1
+      "+r"(dst_v),   // %2
+      "+r"(width)    // %3  // Output registers
+    :                       // Input registers
+    : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
+#endif  // HAS_SPLITUVROW_NEON
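
The deinterleave itself is free: vld2.8 splits even bytes into q0 and odd bytes into q1 as part of the load, and MergeUVRow below is the inverse, with vst2.u8 interleaving on store. A scalar equivalent (the _C reference row in row_common.cc has this shape; the name here is illustrative):

    // Scalar sketch of SplitUVRow: even bytes to U, odd bytes to V.
    void SplitUVRow_Sketch(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                           int width) {
      for (int i = 0; i < width; ++i) {
        dst_u[i] = src_uv[2 * i];
        dst_v[i] = src_uv[2 * i + 1];
      }
    }
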
+
+// Reads 16 U's and V's and writes out 16 pairs of UV.
+#ifdef HAS_MERGEUVROW_NEON
+void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                     int width) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load U
+    MEMACCESS(1)
+    "vld1.8     {q1}, [%1]!                    \n"  // load V
+    "subs       %3, %3, #16                    \n"  // 16 processed per loop
+    MEMACCESS(2)
+    "vst2.u8    {q0, q1}, [%2]!                \n"  // store 16 pairs of UV
+    "bgt        1b                             \n"
+    :
+      "+r"(src_u),   // %0
+      "+r"(src_v),   // %1
+      "+r"(dst_uv),  // %2
+      "+r"(width)    // %3  // Output registers
+    :                       // Input registers
+    : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
+#endif  // HAS_MERGEUVROW_NEON
+
+// Copy multiples of 32 bytes.  vld1.8 allows unaligned accesses and is fastest on A15.
+#ifdef HAS_COPYROW_NEON
+void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 32
+    "subs       %2, %2, #32                    \n"  // 32 processed per loop
+    MEMACCESS(1)
+    "vst1.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 32
+    "bgt        1b                             \n"
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(count)  // %2  // Output registers
+  :                     // Input registers
+  : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
+#endif  // HAS_COPYROW_NEON
+
+// SetRow writes 'count' bytes using a 32 bit value repeated.
+#ifdef HAS_SETROW_NEON
+void SetRow_NEON(uint8* dst, uint32 v32, int count) {
+  asm volatile (
+    "vdup.u32  q0, %2                          \n"  // duplicate 4 ints
+    "1:                                        \n"
+    "subs      %1, %1, #16                     \n"  // 16 bytes per loop
+    MEMACCESS(0)
+    "vst1.8    {q0}, [%0]!                     \n"  // store
+    "bgt       1b                              \n"
+  : "+r"(dst),   // %0
+    "+r"(count)  // %1
+  : "r"(v32)     // %2
+  : "cc", "memory", "q0"
+  );
+}
+#endif  // HAS_SETROW_NEON
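
The loop stores a whole q register per iteration, so the count is consumed 16 bytes at a time and appears to assume a multiple of 16. The scalar intent (sketch, illustrative name):

    // Replicate a 32 bit pattern across the row; assumes 4 byte alignment,
    // as storing v32 whole implies.
    void SetRow_Sketch(uint8* dst, uint32 v32, int count) {
      uint32* d = (uint32*)dst;
      for (int i = 0; i < count / 4; ++i) {
        d[i] = v32;
      }
    }
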
+
+// TODO(fbarchard): Make fully assembler
+// ARGBSetRows writes 'width' 32 bit pixels per row, repeated for 'height' rows.
+#ifdef HAS_ARGBSETROWS_NEON
+void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width,
+                      int dst_stride, int height) {
+  for (int y = 0; y < height; ++y) {
+    SetRow_NEON(dst, v32, width << 2);
+    dst += dst_stride;
+  }
+}
+#endif  // HAS_ARGBSETROWS_NEON
+
+#ifdef HAS_MIRRORROW_NEON
+void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
+  asm volatile (
+    // Start at end of source row.
+    "mov        r3, #-16                       \n"
+    "add        %0, %0, %2                     \n"
+    "sub        %0, #16                        \n"
+
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
+    "subs       %2, #16                        \n"  // 16 pixels per loop.
+    "vrev64.8   q0, q0                         \n"
+    MEMACCESS(1)
+    "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"
+    "bgt        1b                             \n"
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(width)  // %2
+  :
+  : "cc", "memory", "r3", "q0"
+  );
+}
+#endif  // HAS_MIRRORROW_NEON
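
vrev64.8 only reverses within each 64 bit half, so the store writes d1 before d0 to swap the halves and complete the 16 byte reversal. Scalar equivalent (sketch, illustrative name):

    // Byte-reverse the row.
    void MirrorRow_Sketch(const uint8* src, uint8* dst, int width) {
      for (int i = 0; i < width; ++i) {
        dst[i] = src[width - 1 - i];
      }
    }
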
+
+#ifdef HAS_MIRRORUVROW_NEON
+void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                      int width) {
+  asm volatile (
+    // Start at end of source row.
+    "mov        r12, #-16                      \n"
+    "add        %0, %0, %3, lsl #1             \n"
+    "sub        %0, #16                        \n"
+
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld2.8     {d0, d1}, [%0], r12            \n"  // src -= 16
+    "subs       %3, #8                         \n"  // 8 pixels per loop.
+    "vrev64.8   q0, q0                         \n"
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // dst += 8
+    MEMACCESS(2)
+    "vst1.8     {d1}, [%2]!                    \n"
+    "bgt        1b                             \n"
+  : "+r"(src_uv),  // %0
+    "+r"(dst_u),   // %1
+    "+r"(dst_v),   // %2
+    "+r"(width)    // %3
+  :
+  : "cc", "memory", "r12", "q0"
+  );
+}
+#endif  // HAS_MIRRORUVROW_NEON
+
+#ifdef HAS_ARGBMIRRORROW_NEON
+void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
+  asm volatile (
+    // Start at end of source row.
+    "mov        r3, #-16                       \n"
+    "add        %0, %0, %2, lsl #2             \n"
+    "sub        %0, #16                        \n"
+
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
+    "subs       %2, #4                         \n"  // 4 pixels per loop.
+    "vrev64.32  q0, q0                         \n"
+    MEMACCESS(1)
+    "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"
+    "bgt        1b                             \n"
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(width)  // %2
+  :
+  : "cc", "memory", "r3", "q0"
+  );
+}
+#endif  // HAS_ARGBMIRRORROW_NEON
+
+#ifdef HAS_RGB24TOARGBROW_NEON
+void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
+  asm volatile (
+    "vmov.u8    d4, #255                       \n"  // Alpha
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RGB24.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    MEMACCESS(1)
+    "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"
+  : "+r"(src_rgb24),  // %0
+    "+r"(dst_argb),   // %1
+    "+r"(pix)         // %2
+  :
+  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
+  );
+}
+#endif  // HAS_RGB24TOARGBROW_NEON
+
+#ifdef HAS_RAWTOARGBROW_NEON
+void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
+  asm volatile (
+    "vmov.u8    d4, #255                       \n"  // Alpha
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RAW.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vswp.u8    d1, d3                         \n"  // swap R, B
+    MEMACCESS(1)
+    "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"
+  : "+r"(src_raw),   // %0
+    "+r"(dst_argb),  // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
+  );
+}
+#endif  // HAS_RAWTOARGBROW_NEON
+
+#define RGB565TOARGB                                                           \
+    "vshrn.u16  d6, q0, #5                     \n"  /* G xxGGGGGG           */ \
+    "vuzp.u8    d0, d1                         \n"  /* d0 xxxBBBBB RRRRRxxx */ \
+    "vshl.u8    d6, d6, #2                     \n"  /* G GGGGGG00 upper 6   */ \
+    "vshr.u8    d1, d1, #3                     \n"  /* R 000RRRRR lower 5   */ \
+    "vshl.u8    q0, q0, #3                     \n"  /* B,R BBBBB000 upper 5 */ \
+    "vshr.u8    q2, q0, #5                     \n"  /* B,R 00000BBB lower 3 */ \
+    "vorr.u8    d0, d0, d4                     \n"  /* B                    */ \
+    "vshr.u8    d4, d6, #6                     \n"  /* G 000000GG lower 2   */ \
+    "vorr.u8    d2, d1, d5                     \n"  /* R                    */ \
+    "vorr.u8    d1, d4, d6                     \n"  /* G                    */
+
+#ifdef HAS_RGB565TOARGBROW_NEON
+void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
+  asm volatile (
+    "vmov.u8    d3, #255                       \n"  // Alpha
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    RGB565TOARGB
+    MEMACCESS(1)
+    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"
+  : "+r"(src_rgb565),  // %0
+    "+r"(dst_argb),    // %1
+    "+r"(pix)          // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
+  );
+}
+#endif  // HAS_RGB565TOARGBROW_NEON
+
+#define ARGB1555TOARGB                                                         \
+    "vshrn.u16  d7, q0, #8                     \n"  /* A Arrrrrxx           */ \
+    "vshr.u8    d6, d7, #2                     \n"  /* R xxxRRRRR           */ \
+    "vshrn.u16  d5, q0, #5                     \n"  /* G xxxGGGGG           */ \
+    "vmovn.u16  d4, q0                         \n"  /* B xxxBBBBB           */ \
+    "vshr.u8    d7, d7, #7                     \n"  /* A 0000000A           */ \
+    "vneg.s8    d7, d7                         \n"  /* A AAAAAAAA upper 8   */ \
+    "vshl.u8    d6, d6, #3                     \n"  /* R RRRRR000 upper 5   */ \
+    "vshr.u8    q1, q3, #5                     \n"  /* R,A 00000RRR lower 3 */ \
+    "vshl.u8    q0, q2, #3                     \n"  /* B,G BBBBB000 upper 5 */ \
+    "vshr.u8    q2, q0, #5                     \n"  /* B,G 00000BBB lower 3 */ \
+    "vorr.u8    q1, q1, q3                     \n"  /* R,A                  */ \
+    "vorr.u8    q0, q0, q2                     \n"  /* B,G                  */ \
+
+// RGB555TOARGB is the same as ARGB1555TOARGB but ignores alpha.
+#define RGB555TOARGB                                                           \
+    "vshrn.u16  d6, q0, #5                     \n"  /* G xxxGGGGG           */ \
+    "vuzp.u8    d0, d1                         \n"  /* d0 xxxBBBBB xRRRRRxx */ \
+    "vshl.u8    d6, d6, #3                     \n"  /* G GGGGG000 upper 5   */ \
+    "vshr.u8    d1, d1, #2                     \n"  /* R 00xRRRRR lower 5   */ \
+    "vshl.u8    q0, q0, #3                     \n"  /* B,R BBBBB000 upper 5 */ \
+    "vshr.u8    q2, q0, #5                     \n"  /* B,R 00000BBB lower 3 */ \
+    "vorr.u8    d0, d0, d4                     \n"  /* B                    */ \
+    "vshr.u8    d4, d6, #5                     \n"  /* G 00000GGG lower 3   */ \
+    "vorr.u8    d2, d1, d5                     \n"  /* R                    */ \
+    "vorr.u8    d1, d4, d6                     \n"  /* G                    */
+
+#ifdef HAS_ARGB1555TOARGBROW_NEON
+void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
+                            int pix) {
+  asm volatile (
+    "vmov.u8    d3, #255                       \n"  // Alpha
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    ARGB1555TOARGB
+    MEMACCESS(1)
+    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"
+  : "+r"(src_argb1555),  // %0
+    "+r"(dst_argb),    // %1
+    "+r"(pix)          // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
+  );
+}
+#endif  // HAS_ARGB1555TOARGBROW_NEON
+
+#define ARGB4444TOARGB                                                         \
+    "vuzp.u8    d0, d1                         \n"  /* d0 BG, d1 RA         */ \
+    "vshl.u8    q2, q0, #4                     \n"  /* B,R BBBB0000         */ \
+    "vshr.u8    q1, q0, #4                     \n"  /* G,A 0000GGGG         */ \
+    "vshr.u8    q0, q2, #4                     \n"  /* B,R 0000BBBB         */ \
+    "vorr.u8    q0, q0, q2                     \n"  /* B,R BBBBBBBB         */ \
+    "vshl.u8    q2, q1, #4                     \n"  /* G,A GGGG0000         */ \
+    "vorr.u8    q1, q1, q2                     \n"  /* G,A GGGGGGGG         */ \
+    "vswp.u8    d1, d2                         \n"  /* B,R,G,A -> B,G,R,A   */
+
+#ifdef HAS_ARGB4444TOARGBROW_NEON
+void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
+                            int pix) {
+  asm volatile (
+    "vmov.u8    d3, #255                       \n"  // Alpha
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    ARGB4444TOARGB
+    MEMACCESS(1)
+    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"
+  : "+r"(src_argb4444),  // %0
+    "+r"(dst_argb),    // %1
+    "+r"(pix)          // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2"  // Clobber List
+  );
+}
+#endif  // HAS_ARGB4444TOARGBROW_NEON
+
+#ifdef HAS_ARGBTORGB24ROW_NEON
+void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    MEMACCESS(1)
+    "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RGB24.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),   // %0
+    "+r"(dst_rgb24),  // %1
+    "+r"(pix)         // %2
+  :
+  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
+  );
+}
+#endif  // HAS_ARGBTORGB24ROW_NEON
+
+#ifdef HAS_ARGBTORAWROW_NEON
+void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vswp.u8    d1, d3                         \n"  // swap R, B
+    MEMACCESS(1)
+    "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RAW.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_raw),   // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
+  );
+}
+#endif  // HAS_ARGBTORAWROW_NEON
+
+#ifdef HAS_YUY2TOYROW_NEON
+void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of YUY2.
+    "subs       %2, %2, #16                    \n"  // 16 processed per loop.
+    MEMACCESS(1)
+    "vst1.8     {q0}, [%1]!                    \n"  // store 16 pixels of Y.
+    "bgt        1b                             \n"
+  : "+r"(src_yuy2),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
+#endif  // HAS_YUY2TOYROW_NEON
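+
+// YUY2 packs 2 pixels into 4 bytes as Y0 U Y1 V, so vld2.8 deinterleaves
+// all the luma into q0 and the U/V bytes into q1.  A scalar sketch:
+//   for (int i = 0; i < pix; ++i) {
+//     dst_y[i] = src_yuy2[i * 2];  // every even byte is Y
+//   }
+// UYVY below is the mirror image: luma sits in the odd bytes (q1).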
+
+#ifdef HAS_UYVYTOYROW_NEON
+void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of UYVY.
+    "subs       %2, %2, #16                    \n"  // 16 processed per loop.
+    MEMACCESS(1)
+    "vst1.8     {q1}, [%1]!                    \n"  // store 16 pixels of Y.
+    "bgt        1b                             \n"
+  : "+r"(src_uyvy),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
+#endif  // HAS_UYVYTOYROW_NEON
+
+#ifdef HAS_YUY2TOUV422ROW_NEON
+void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
+                         int pix) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
+    "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
+    MEMACCESS(1)
+    "vst1.8     {d1}, [%1]!                    \n"  // store 8 U.
+    MEMACCESS(2)
+    "vst1.8     {d3}, [%2]!                    \n"  // store 8 V.
+    "bgt        1b                             \n"
+  : "+r"(src_yuy2),  // %0
+    "+r"(dst_u),     // %1
+    "+r"(dst_v),     // %2
+    "+r"(pix)        // %3
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
+  );
+}
+#endif  // HAS_YUY2TOUV422ROW_NEON
+
+#ifdef HAS_UYVYTOUV422ROW_NEON
+void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
+                         int pix) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
+    "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 U.
+    MEMACCESS(2)
+    "vst1.8     {d2}, [%2]!                    \n"  // store 8 V.
+    "bgt        1b                             \n"
+  : "+r"(src_uyvy),  // %0
+    "+r"(dst_u),     // %1
+    "+r"(dst_v),     // %2
+    "+r"(pix)        // %3
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
+  );
+}
+#endif  // HAS_UYVYTOUV422ROW_NEON
+
+#ifdef HAS_YUY2TOUVROW_NEON
+void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
+                      uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // stride + src_yuy2
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
+    "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
+    MEMACCESS(1)
+    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row YUY2.
+    "vrhadd.u8  d1, d1, d5                     \n"  // average rows of U
+    "vrhadd.u8  d3, d3, d7                     \n"  // average rows of V
+    MEMACCESS(2)
+    "vst1.8     {d1}, [%2]!                    \n"  // store 8 U.
+    MEMACCESS(3)
+    "vst1.8     {d3}, [%3]!                    \n"  // store 8 V.
+    "bgt        1b                             \n"
+  : "+r"(src_yuy2),     // %0
+    "+r"(stride_yuy2),  // %1
+    "+r"(dst_u),        // %2
+    "+r"(dst_v),        // %3
+    "+r"(pix)           // %4
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"  // Clobber List
+  );
+}
+#endif  // HAS_YUY2TOUVROW_NEON
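+
+// For 4:2:0 output the chroma of two adjacent rows is averaged; vrhadd.u8
+// is a rounding halving add, so per sample the loop above computes
+//   u = (u_row0 + u_row1 + 1) >> 1;  // likewise for v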
+
+#ifdef HAS_UYVYTOUVROW_NEON
+void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // stride + src_uyvy
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
+    "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
+    MEMACCESS(1)
+    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row UYVY.
+    "vrhadd.u8  d0, d0, d4                     \n"  // average rows of U
+    "vrhadd.u8  d2, d2, d6                     \n"  // average rows of V
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 U.
+    MEMACCESS(3)
+    "vst1.8     {d2}, [%3]!                    \n"  // store 8 V.
+    "bgt        1b                             \n"
+  : "+r"(src_uyvy),     // %0
+    "+r"(stride_uyvy),  // %1
+    "+r"(dst_u),        // %2
+    "+r"(dst_v),        // %3
+    "+r"(pix)           // %4
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"  // Clobber List
+  );
+}
+#endif  // HAS_UYVYTOUVROW_NEON
+
+#ifdef HAS_HALFROW_NEON
+void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
+                  uint8* dst_uv, int pix) {
+  asm volatile (
+    // compute the row 2 pointer: src + stride
+    "add        %1, %0                         \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load row 1 16 pixels.
+    "subs       %3, %3, #16                    \n"  // 16 processed per loop
+    MEMACCESS(1)
+    "vld1.8     {q1}, [%1]!                    \n"  // load row 2 16 pixels.
+    "vrhadd.u8  q0, q1                         \n"  // average row 1 and 2
+    MEMACCESS(2)
+    "vst1.8     {q0}, [%2]!                    \n"
+    "bgt        1b                             \n"
+  : "+r"(src_uv),         // %0
+    "+r"(src_uv_stride),  // %1
+    "+r"(dst_uv),         // %2
+    "+r"(pix)             // %3
+  :
+  : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
+#endif  // HAS_HALFROW_NEON
+
+// Select 2 channels from ARGB on alternating pixels.  e.g.  BGBGBGBG
+#ifdef HAS_ARGBTOBAYERROW_NEON
+void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
+                         uint32 selector, int pix) {
+  asm volatile (
+    "vmov.u32   d6[0], %3                      \n"  // selector
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0, q1}, [%0]!                \n"  // load row 8 pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop
+    "vtbl.8     d4, {d0, d1}, d6               \n"  // look up 4 pixels
+    "vtbl.8     d5, {d2, d3}, d6               \n"  // look up 4 pixels
+    "vtrn.u32   d4, d5                         \n"  // combine 8 pixels
+    MEMACCESS(1)
+    "vst1.8     {d4}, [%1]!                    \n"  // store 8.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),   // %0
+    "+r"(dst_bayer),  // %1
+    "+r"(pix)         // %2
+  : "r"(selector)     // %3
+  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
+  );
+}
+#endif  // HAS_ARGBTOBAYERROW_NEON
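+
+// vtbl.8 treats d6 as byte indices into the 16 loaded bytes, so the 32-bit
+// selector names 4 source bytes per group of 4 ARGB pixels.  For example,
+// a (hypothetical) selector of 0x0d090501 picks byte 1 of each pixel, i.e.
+// the G channel, yielding GGGG per group.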
+
+// Select G channels from ARGB.  e.g.  GGGGGGGG
+#ifdef HAS_ARGBTOBAYERGGROW_NEON
+void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
+                           uint32 /*selector*/, int pix) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load row 8 pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop
+    MEMACCESS(1)
+    "vst1.8     {d1}, [%1]!                    \n"  // store 8 G's.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),   // %0
+    "+r"(dst_bayer),  // %1
+    "+r"(pix)         // %2
+  :
+  : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
+#endif  // HAS_ARGBTOBAYERGGROW_NEON
+
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+#ifdef HAS_ARGBSHUFFLEROW_NEON
+void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
+                         const uint8* shuffler, int pix) {
+  asm volatile (
+    MEMACCESS(3)
+    "vld1.8     {q2}, [%3]                     \n"  // shuffler
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 4 pixels.
+    "subs       %2, %2, #4                     \n"  // 4 processed per loop
+    "vtbl.8     d2, {d0, d1}, d4               \n"  // look up 2 first pixels
+    "vtbl.8     d3, {d0, d1}, d5               \n"  // look up 2 next pixels
+    MEMACCESS(1)
+    "vst1.8     {q1}, [%1]!                    \n"  // store 4.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(pix)        // %2
+  : "r"(shuffler)    // %3
+  : "cc", "memory", "q0", "q1", "q2"  // Clobber List
+  );
+}
+#endif  // HAS_ARGBSHUFFLEROW_NEON
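+
+// The shuffler is a 16-entry byte-index table covering 4 ARGB pixels.  As
+// an illustration, a table that swaps channels 0 and 2 of every pixel
+// (B <-> R) would be
+//   static const uint8 kShuffleSwapBR[16] =
+//       {2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15};
+//   ARGBShuffleRow_NEON(src_argb, dst_argb, kShuffleSwapBR, width);
+// (kShuffleSwapBR is illustrative, not a table defined by this file.)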
+
+#ifdef HAS_I422TOYUY2ROW_NEON
+void I422ToYUY2Row_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_yuy2, int width) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld2.8     {d0, d2}, [%0]!                \n"  // load 16 Ys
+    MEMACCESS(1)
+    "vld1.8     {d1}, [%1]!                    \n"  // load 8 Us
+    MEMACCESS(2)
+    "vld1.8     {d3}, [%2]!                    \n"  // load 8 Vs
+    "subs       %4, %4, #16                    \n"  // 16 pixels
+    MEMACCESS(3)
+    "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 YUY2/16 pixels.
+    "bgt        1b                             \n"
+  : "+r"(src_y),     // %0
+    "+r"(src_u),     // %1
+    "+r"(src_v),     // %2
+    "+r"(dst_yuy2),  // %3
+    "+r"(width)      // %4
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3"
+  );
+}
+#endif  // HAS_I422TOYUY2ROW_NEON
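+
+// vld2.8 splits the 16 Ys into even (d0) and odd (d2) lanes, and vst4.8
+// re-interleaves d0..d3 lane by lane, so the bytes reach memory in the
+// order Y U Y V Y U Y V ... which is exactly YUY2.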
+
+#ifdef HAS_I422TOUYVYROW_NEON
+void I422ToUYVYRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_uyvy, int width) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld2.8     {d1, d3}, [%0]!                \n"  // load 16 Ys
+    MEMACCESS(1)
+    "vld1.8     {d0}, [%1]!                    \n"  // load 8 Us
+    MEMACCESS(2)
+    "vld1.8     {d2}, [%2]!                    \n"  // load 8 Vs
+    "subs       %4, %4, #16                    \n"  // 16 pixels
+    MEMACCESS(3)
+    "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 UYVY/16 pixels.
+    "bgt        1b                             \n"
+  : "+r"(src_y),     // %0
+    "+r"(src_u),     // %1
+    "+r"(src_v),     // %2
+    "+r"(dst_uyvy),  // %3
+    "+r"(width)      // %4
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3"
+  );
+}
+#endif  // HAS_I422TOUYVYROW_NEON
+
+#ifdef HAS_ARGBTORGB565ROW_NEON
+void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    ARGBTORGB565
+    MEMACCESS(1)
+    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels RGB565.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_rgb565),  // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
+  );
+}
+#endif  // HAS_ARGBTORGB565ROW_NEON
+
+#ifdef HAS_ARGBTOARGB1555ROW_NEON
+void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
+                            int pix) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    ARGBTOARGB1555
+    MEMACCESS(1)
+    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels ARGB1555.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb1555),  // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
+  );
+}
+#endif  // HAS_ARGBTOARGB1555ROW_NEON
+
+#ifdef HAS_ARGBTOARGB4444ROW_NEON
+void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
+                            int pix) {
+  asm volatile (
+    "vmov.u8    d4, #0x0f                      \n"  // bits to clear with vbic.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    ARGBTOARGB4444
+    MEMACCESS(1)
+    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels ARGB4444.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),      // %0
+    "+r"(dst_argb4444),  // %1
+    "+r"(pix)            // %2
+  :
+  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
+  );
+}
+#endif  // HAS_ARGBTOARGB4444ROW_NEON
+
+#ifdef HAS_ARGBTOYROW_NEON
+void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
+  asm volatile (
+    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
+    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
+    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
+    "vmov.u8    d27, #16                       \n"  // Add 16 constant
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q2, d0, d24                    \n"  // B
+    "vmlal.u8   q2, d1, d25                    \n"  // G
+    "vmlal.u8   q2, d2, d26                    \n"  // R
+    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
+    "vqadd.u8   d0, d27                        \n"
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+  );
+}
+#endif  // HAS_ARGBTOYROW_NEON
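+
+// Fixed-point BT.601 video-range luma: the byte constants are the RGB->Y
+// weights scaled by 128, so per pixel the loop computes, with rounding,
+//   y = ((13 * b + 65 * g + 33 * r + 64) >> 7) + 16;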
+
+#ifdef HAS_ARGBTOYJROW_NEON
+void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
+  asm volatile (
+    "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient
+    "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient
+    "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q2, d0, d24                    \n"  // B
+    "vmlal.u8   q2, d1, d25                    \n"  // G
+    "vmlal.u8   q2, d2, d26                    \n"  // R
+    "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit Y
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+  );
+}
+#endif  // HAS_ARGBTOYJROW_NEON
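+
+// The JPEG (full-range) variant: the weights sum to exactly 128 and no
+// +16 offset is added, so per pixel
+//   y = (15 * b + 75 * g + 38 * r + 64) >> 7;  // 0..255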
+
+// 8x1 pixels -> 8x1.  pix is number of argb pixels. e.g. 8.
+#ifdef HAS_ARGBTOUV444ROW_NEON
+void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                         int pix) {
+  asm volatile (
+    "vmov.u8    d24, #112                      \n"  // UB / VR 0.875 coefficient
+    "vmov.u8    d25, #74                       \n"  // UG -0.5781 coefficient
+    "vmov.u8    d26, #38                       \n"  // UR -0.2969 coefficient
+    "vmov.u8    d27, #18                       \n"  // VB -0.1406 coefficient
+    "vmov.u8    d28, #94                       \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q2, d0, d24                    \n"  // B
+    "vmlsl.u8   q2, d1, d25                    \n"  // G
+    "vmlsl.u8   q2, d2, d26                    \n"  // R
+    "vadd.u16   q2, q2, q15                    \n"  // +128 -> unsigned
+
+    "vmull.u8   q3, d2, d24                    \n"  // R
+    "vmlsl.u8   q3, d1, d28                    \n"  // G
+    "vmlsl.u8   q3, d0, d27                    \n"  // B
+    "vadd.u16   q3, q3, q15                    \n"  // +128 -> unsigned
+
+    "vqshrn.u16  d0, q2, #8                    \n"  // 16 bit to 8 bit U
+    "vqshrn.u16  d1, q3, #8                    \n"  // 16 bit to 8 bit V
+
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
+    MEMACCESS(2)
+    "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_u),     // %1
+    "+r"(dst_v),     // %2
+    "+r"(pix)        // %3
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15"
+  );
+}
+#endif  // HAS_ARGBTOUV444ROW_NEON
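+
+// Per sample the arithmetic above is, modulo saturation,
+//   u = (0x8080 + 112 * b - 74 * g - 38 * r) >> 8;
+//   v = (0x8080 + 112 * r - 94 * g - 18 * b) >> 8;
+// where 0x8080 supplies the +128 chroma bias plus a half for rounding and
+// vqshrn.u16 clamps the result to 0..255.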
+
+// 16x1 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
+#ifdef HAS_ARGBTOUV422ROW_NEON
+void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                         int pix) {
+  asm volatile (
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(0)
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
+
+    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+
+    "subs       %3, %3, #16                    \n"  // 16 processed per loop.
+    "vmul.s16   q8, q0, q10                    \n"  // B
+    "vmls.s16   q8, q1, q11                    \n"  // G
+    "vmls.s16   q8, q2, q12                    \n"  // R
+    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
+
+    "vmul.s16   q9, q2, q10                    \n"  // R
+    "vmls.s16   q9, q1, q14                    \n"  // G
+    "vmls.s16   q9, q0, q13                    \n"  // B
+    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
+
+    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
+    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
+    MEMACCESS(2)
+    "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_u),     // %1
+    "+r"(dst_v),     // %2
+    "+r"(pix)        // %3
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+#endif  // HAS_ARGBTOUV422ROW_NEON
+
+// 32x1 pixels -> 8x1.  pix is number of argb pixels. e.g. 32.
+#ifdef HAS_ARGBTOUV411ROW_NEON
+void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                         int pix) {
+  asm volatile (
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(0)
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
+    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(0)
+    "vld4.8     {d8, d10, d12, d14}, [%0]!     \n"  // load 8 more ARGB pixels.
+    MEMACCESS(0)
+    "vld4.8     {d9, d11, d13, d15}, [%0]!     \n"  // load last 8 ARGB pixels.
+    "vpaddl.u8  q4, q4                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q5, q5                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q6, q6                         \n"  // R 16 bytes -> 8 shorts.
+
+    "vpadd.u16  d0, d0, d1                     \n"  // B 16 shorts -> 8 shorts.
+    "vpadd.u16  d1, d8, d9                     \n"  // B
+    "vpadd.u16  d2, d2, d3                     \n"  // G 16 shorts -> 8 shorts.
+    "vpadd.u16  d3, d10, d11                   \n"  // G
+    "vpadd.u16  d4, d4, d5                     \n"  // R 16 shorts -> 8 shorts.
+    "vpadd.u16  d5, d12, d13                   \n"  // R
+
+    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
+    "vrshr.u16  q1, q1, #1                     \n"
+    "vrshr.u16  q2, q2, #1                     \n"
+
+    "subs       %3, %3, #32                    \n"  // 32 processed per loop.
+    "vmul.s16   q8, q0, q10                    \n"  // B
+    "vmls.s16   q8, q1, q11                    \n"  // G
+    "vmls.s16   q8, q2, q12                    \n"  // R
+    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
+    "vmul.s16   q9, q2, q10                    \n"  // R
+    "vmls.s16   q9, q1, q14                    \n"  // G
+    "vmls.s16   q9, q0, q13                    \n"  // B
+    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
+    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
+    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
+    MEMACCESS(2)
+    "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_u),     // %1
+    "+r"(dst_v),     // %2
+    "+r"(pix)        // %3
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+#endif  // HAS_ARGBTOUV411ROW_NEON
+
+// 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
+#define RGBTOUV(QB, QG, QR) \
+    "vmul.s16   q8, " #QB ", q10               \n"  /* B                    */ \
+    "vmls.s16   q8, " #QG ", q11               \n"  /* G                    */ \
+    "vmls.s16   q8, " #QR ", q12               \n"  /* R                    */ \
+    "vadd.u16   q8, q8, q15                    \n"  /* +128 -> unsigned     */ \
+    "vmul.s16   q9, " #QR ", q10               \n"  /* R                    */ \
+    "vmls.s16   q9, " #QG ", q14               \n"  /* G                    */ \
+    "vmls.s16   q9, " #QB ", q13               \n"  /* B                    */ \
+    "vadd.u16   q9, q9, q15                    \n"  /* +128 -> unsigned     */ \
+    "vqshrn.u16  d0, q8, #8                    \n"  /* 16 bit to 8 bit U    */ \
+    "vqshrn.u16  d1, q9, #8                    \n"  /* 16 bit to 8 bit V    */
+
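+// The q registers fed to RGBTOUV hold twice the subsampled channel
+// average, which the halved coefficients above cancel out; for U:
+//   u = sat8((0x8080 + 56 * (2 * b) - 37 * (2 * g) - 19 * (2 * r)) >> 8)
+//     = sat8((0x8080 + 112 * b - 74 * g - 38 * r) >> 8);
+// (b, g, r are block averages; sat8 = saturate to 0..255, done by vqshrn.)
+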
+// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
+#ifdef HAS_ARGBTOUVROW_NEON
+void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // src_stride + src_argb
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(0)
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
+    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
+    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ARGB pixels.
+    MEMACCESS(1)
+    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ARGB pixels.
+    "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
+    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
+    "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts.
+
+    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
+    "vrshr.u16  q1, q1, #1                     \n"
+    "vrshr.u16  q2, q2, #1                     \n"
+
+    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
+    RGBTOUV(q0, q1, q2)
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(src_stride_argb),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(pix)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+#endif  // HAS_ARGBTOUVROW_NEON
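+
+// The 2x2 box subsample never needs more than 16 bits: vpaddl.u8 sums each
+// horizontal pixel pair, vpadal.u8 accumulates the matching pair from the
+// next row, and vrshr #1 halves with rounding.  Per channel, roughly:
+//   b = (b00 + b01 + b10 + b11 + 1) >> 1;  // 0..510, i.e. 2x the average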
+
+// TODO(fbarchard): Subsample match C code.
+#ifdef HAS_ARGBTOUVJROW_NEON
+void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // src_stride + src_argb
+    "vmov.s16   q10, #127 / 2                  \n"  // UB / VR 0.500 coefficient
+    "vmov.s16   q11, #84 / 2                   \n"  // UG -0.33126 coefficient
+    "vmov.s16   q12, #43 / 2                   \n"  // UR -0.16874 coefficient
+    "vmov.s16   q13, #20 / 2                   \n"  // VB -0.08131 coefficient
+    "vmov.s16   q14, #107 / 2                  \n"  // VG -0.41869 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(0)
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
+    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
+    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ARGB pixels.
+    MEMACCESS(1)
+    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ARGB pixels.
+    "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
+    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
+    "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts.
+
+    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
+    "vrshr.u16  q1, q1, #1                     \n"
+    "vrshr.u16  q2, q2, #1                     \n"
+
+    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
+    RGBTOUV(q0, q1, q2)
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(src_stride_argb),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(pix)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+#endif  // HAS_ARGBTOUVJROW_NEON
+
+#ifdef HAS_BGRATOUVROW_NEON
+void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
+                      uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // src_stride + src_bgra
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 BGRA pixels.
+    MEMACCESS(0)
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 BGRA pixels.
+    "vpaddl.u8  q3, q3                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q2                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
+    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more BGRA pixels.
+    MEMACCESS(1)
+    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 BGRA pixels.
+    "vpadal.u8  q3, q7                         \n"  // B 16 bytes -> 8 shorts.
+    "vpadal.u8  q2, q6                         \n"  // G 16 bytes -> 8 shorts.
+    "vpadal.u8  q1, q5                         \n"  // R 16 bytes -> 8 shorts.
+
+    "vrshr.u16  q1, q1, #1                     \n"  // 2x average
+    "vrshr.u16  q2, q2, #1                     \n"
+    "vrshr.u16  q3, q3, #1                     \n"
+
+    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
+    RGBTOUV(q3, q2, q1)
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"
+  : "+r"(src_bgra),  // %0
+    "+r"(src_stride_bgra),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(pix)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+#endif  // HAS_BGRATOUVROW_NEON
+
+#ifdef HAS_ABGRTOUVROW_NEON
+void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
+                      uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // src_stride + src_abgr
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ABGR pixels.
+    MEMACCESS(0)
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ABGR pixels.
+    "vpaddl.u8  q2, q2                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q0, q0                         \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
+    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ABGR pixels.
+    MEMACCESS(1)
+    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ABGR pixels.
+    "vpadal.u8  q2, q6                         \n"  // B 16 bytes -> 8 shorts.
+    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
+    "vpadal.u8  q0, q4                         \n"  // R 16 bytes -> 8 shorts.
+
+    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
+    "vrshr.u16  q1, q1, #1                     \n"
+    "vrshr.u16  q2, q2, #1                     \n"
+
+    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
+    RGBTOUV(q2, q1, q0)
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"
+  : "+r"(src_abgr),  // %0
+    "+r"(src_stride_abgr),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(pix)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+#endif  // HAS_ABGRTOUVROW_NEON
+
+#ifdef HAS_RGBATOUVROW_NEON
+void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
+                      uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // src_stride + src_rgba
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 RGBA pixels.
+    MEMACCESS(0)
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 RGBA pixels.
+    "vpaddl.u8  q0, q1                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q2                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q3                         \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
+    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more RGBA pixels.
+    MEMACCESS(1)
+    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 RGBA pixels.
+    "vpadal.u8  q0, q5                         \n"  // B 16 bytes -> 8 shorts.
+    "vpadal.u8  q1, q6                         \n"  // G 16 bytes -> 8 shorts.
+    "vpadal.u8  q2, q7                         \n"  // R 16 bytes -> 8 shorts.
+
+    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
+    "vrshr.u16  q1, q1, #1                     \n"
+    "vrshr.u16  q2, q2, #1                     \n"
+
+    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
+    RGBTOUV(q0, q1, q2)
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"
+  : "+r"(src_rgba),  // %0
+    "+r"(src_stride_rgba),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(pix)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+#endif  // HAS_RGBATOUVROW_NEON
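+
+// The BGRA, ABGR and RGBA variants above differ from ARGBToUVRow_NEON only
+// in which deinterleaved register is treated as B, G and R; vld4.8 splits
+// the four byte lanes the same way regardless of channel meaning, so one
+// RGBTOUV(QB, QG, QR) covers every ordering, e.g. RGBTOUV(q2, q1, q0) for
+// ABGR, where lane 0 holds R and lane 2 holds B.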
+
+#ifdef HAS_RGB24TOUVROW_NEON
+void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
+                       uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // src_stride + src_rgb24
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RGB24 pixels.
+    MEMACCESS(0)
+    "vld3.8     {d1, d3, d5}, [%0]!            \n"  // load next 8 RGB24 pixels.
+    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
+    "vld3.8     {d8, d10, d12}, [%1]!          \n"  // load 8 more RGB24 pixels.
+    MEMACCESS(1)
+    "vld3.8     {d9, d11, d13}, [%1]!          \n"  // load last 8 RGB24 pixels.
+    "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
+    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
+    "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts.
+
+    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
+    "vrshr.u16  q1, q1, #1                     \n"
+    "vrshr.u16  q2, q2, #1                     \n"
+
+    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
+    RGBTOUV(q0, q1, q2)
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"
+  : "+r"(src_rgb24),  // %0
+    "+r"(src_stride_rgb24),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(pix)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+#endif  // HAS_RGB24TOUVROW_NEON
+
+#ifdef HAS_RAWTOUVROW_NEON
+void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
+                     uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // src_stride + src_raw
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RAW pixels.
+    MEMACCESS(0)
+    "vld3.8     {d1, d3, d5}, [%0]!            \n"  // load next 8 RAW pixels.
+    "vpaddl.u8  q2, q2                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q0, q0                         \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
+    "vld3.8     {d8, d10, d12}, [%1]!          \n"  // load 8 more RAW pixels.
+    MEMACCESS(1)
+    "vld3.8     {d9, d11, d13}, [%1]!          \n"  // load last 8 RAW pixels.
+    "vpadal.u8  q2, q6                         \n"  // B 16 bytes -> 8 shorts.
+    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
+    "vpadal.u8  q0, q4                         \n"  // R 16 bytes -> 8 shorts.
+
+    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
+    "vrshr.u16  q1, q1, #1                     \n"
+    "vrshr.u16  q2, q2, #1                     \n"
+
+    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
+    RGBTOUV(q2, q1, q0)
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"
+  : "+r"(src_raw),  // %0
+    "+r"(src_stride_raw),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(pix)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+#endif  // HAS_RAWTOUVROW_NEON
+
+// 16x2 pixels -> 8x1.  pix is number of rgb565 pixels. e.g. 16.
+#ifdef HAS_RGB565TOUVROW_NEON
+void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
+                        uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // src_stride + src_argb
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
+    RGB565TOARGB
+    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // next 8 RGB565 pixels.
+    RGB565TOARGB
+    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+
+    MEMACCESS(1)
+    "vld1.8     {q0}, [%1]!                    \n"  // load 8 RGB565 pixels.
+    RGB565TOARGB
+    "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    MEMACCESS(1)
+    "vld1.8     {q0}, [%1]!                    \n"  // next 8 RGB565 pixels.
+    RGB565TOARGB
+    "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+
+    "vrshr.u16  q4, q4, #1                     \n"  // 2x average
+    "vrshr.u16  q5, q5, #1                     \n"
+    "vrshr.u16  q6, q6, #1                     \n"
+
+    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
+    "vmul.s16   q8, q4, q10                    \n"  // B
+    "vmls.s16   q8, q5, q11                    \n"  // G
+    "vmls.s16   q8, q6, q12                    \n"  // R
+    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
+    "vmul.s16   q9, q6, q10                    \n"  // R
+    "vmls.s16   q9, q5, q14                    \n"  // G
+    "vmls.s16   q9, q4, q13                    \n"  // B
+    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
+    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
+    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"
+  : "+r"(src_rgb565),  // %0
+    "+r"(src_stride_rgb565),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(pix)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+#endif  // HAS_RGB565TOUVROW_NEON
+
+// 16x2 pixels -> 8x1.  pix is number of argb1555 pixels. e.g. 16.
+#ifdef HAS_ARGB1555TOUVROW_NEON
+void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
+                          uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // src_stride + src_argb
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
+    RGB555TOARGB
+    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB1555 pixels.
+    RGB555TOARGB
+    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+
+    MEMACCESS(1)
+    "vld1.8     {q0}, [%1]!                    \n"  // load 8 ARGB1555 pixels.
+    RGB555TOARGB
+    "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    MEMACCESS(1)
+    "vld1.8     {q0}, [%1]!                    \n"  // next 8 ARGB1555 pixels.
+    RGB555TOARGB
+    "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+
+    "vrshr.u16  q4, q4, #1                     \n"  // 2x average
+    "vrshr.u16  q5, q5, #1                     \n"
+    "vrshr.u16  q6, q6, #1                     \n"
+
+    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
+    "vmul.s16   q8, q4, q10                    \n"  // B
+    "vmls.s16   q8, q5, q11                    \n"  // G
+    "vmls.s16   q8, q6, q12                    \n"  // R
+    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
+    "vmul.s16   q9, q6, q10                    \n"  // R
+    "vmls.s16   q9, q5, q14                    \n"  // G
+    "vmls.s16   q9, q4, q13                    \n"  // B
+    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
+    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
+    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"
+  : "+r"(src_argb1555),  // %0
+    "+r"(src_stride_argb1555),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(pix)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+#endif  // HAS_ARGB1555TOUVROW_NEON
+
+// 16x2 pixels -> 8x1.  pix is number of argb4444 pixels. e.g. 16.
+#ifdef HAS_ARGB4444TOUVROW_NEON
+void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
+                          uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // src_stride + src_argb
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
+    ARGB4444TOARGB
+    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB4444 pixels.
+    ARGB4444TOARGB
+    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+
+    MEMACCESS(1)
+    "vld1.8     {q0}, [%1]!                    \n"  // load 8 ARGB4444 pixels.
+    ARGB4444TOARGB
+    "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    MEMACCESS(1)
+    "vld1.8     {q0}, [%1]!                    \n"  // next 8 ARGB4444 pixels.
+    ARGB4444TOARGB
+    "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+
+    "vrshr.u16  q4, q4, #1                     \n"  // 2x average
+    "vrshr.u16  q5, q5, #1                     \n"
+    "vrshr.u16  q6, q6, #1                     \n"
+
+    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
+    "vmul.s16   q8, q4, q10                    \n"  // B
+    "vmls.s16   q8, q5, q11                    \n"  // G
+    "vmls.s16   q8, q6, q12                    \n"  // R
+    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
+    "vmul.s16   q9, q6, q10                    \n"  // R
+    "vmls.s16   q9, q5, q14                    \n"  // G
+    "vmls.s16   q9, q4, q13                    \n"  // B
+    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
+    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
+    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"
+  : "+r"(src_argb4444),  // %0
+    "+r"(src_stride_argb4444),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(pix)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+#endif  // HAS_ARGB4444TOUVROW_NEON
+
+#ifdef HAS_RGB565TOYROW_NEON
+void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
+  asm volatile (
+    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
+    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
+    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
+    "vmov.u8    d27, #16                       \n"  // Add 16 constant
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    RGB565TOARGB
+    "vmull.u8   q2, d0, d24                    \n"  // B
+    "vmlal.u8   q2, d1, d25                    \n"  // G
+    "vmlal.u8   q2, d2, d26                    \n"  // R
+    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
+    "vqadd.u8   d0, d27                        \n"
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"
+  : "+r"(src_rgb565),  // %0
+    "+r"(dst_y),       // %1
+    "+r"(pix)          // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
+  );
+}
+#endif  // HAS_RGB565TOYROW_NEON
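+
+// RGB565TOARGB (defined earlier in this file) widens the 5- and 6-bit
+// channels with the usual top-bit replication, per channel roughly:
+//   r8 = (r5 << 3) | (r5 >> 2);
+//   g8 = (g6 << 2) | (g6 >> 4);
+//   b8 = (b5 << 3) | (b5 >> 2);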
+
+#ifdef HAS_ARGB1555TOYROW_NEON
+void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
+  asm volatile (
+    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
+    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
+    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
+    "vmov.u8    d27, #16                       \n"  // Add 16 constant
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    ARGB1555TOARGB
+    "vmull.u8   q2, d0, d24                    \n"  // B
+    "vmlal.u8   q2, d1, d25                    \n"  // G
+    "vmlal.u8   q2, d2, d26                    \n"  // R
+    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
+    "vqadd.u8   d0, d27                        \n"
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"
+  : "+r"(src_argb1555),  // %0
+    "+r"(dst_y),         // %1
+    "+r"(pix)            // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
+  );
+}
+#endif  // HAS_ARGB1555TOYROW_NEON
+
+#ifdef HAS_ARGB4444TOYROW_NEON
+void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
+  asm volatile (
+    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
+    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
+    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
+    "vmov.u8    d27, #16                       \n"  // Add 16 constant
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    ARGB4444TOARGB
+    "vmull.u8   q2, d0, d24                    \n"  // B
+    "vmlal.u8   q2, d1, d25                    \n"  // G
+    "vmlal.u8   q2, d2, d26                    \n"  // R
+    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
+    "vqadd.u8   d0, d27                        \n"
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"
+  : "+r"(src_argb4444),  // %0
+    "+r"(dst_y),         // %1
+    "+r"(pix)            // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
+  );
+}
+#endif  // HAS_ARGB4444TOYROW_NEON
+
+#ifdef HAS_BGRATOYROW_NEON
+void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
+  asm volatile (
+    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
+    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
+    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
+    "vmov.u8    d7, #16                        \n"  // Add 16 constant
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of BGRA.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q8, d1, d4                     \n"  // R
+    "vmlal.u8   q8, d2, d5                     \n"  // G
+    "vmlal.u8   q8, d3, d6                     \n"  // B
+    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
+    "vqadd.u8   d0, d7                         \n"
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"
+  : "+r"(src_bgra),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+  );
+}
+#endif  // HAS_BGRATOYROW_NEON
+
+#ifdef HAS_ABGRTOYROW_NEON
+void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
+  asm volatile (
+    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
+    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
+    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
+    "vmov.u8    d7, #16                        \n"  // Add 16 constant
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ABGR.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q8, d0, d4                     \n"  // R
+    "vmlal.u8   q8, d1, d5                     \n"  // G
+    "vmlal.u8   q8, d2, d6                     \n"  // B
+    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
+    "vqadd.u8   d0, d7                         \n"
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"
+  : "+r"(src_abgr),  // %0
+    "+r"(dst_y),  // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+  );
+}
+#endif  // HAS_ABGRTOYROW_NEON
+
+#ifdef HAS_RGBATOYROW_NEON
+void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
+  asm volatile (
+    "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient
+    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
+    "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient
+    "vmov.u8    d7, #16                        \n"  // Add 16 constant
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of RGBA.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q8, d1, d4                     \n"  // B
+    "vmlal.u8   q8, d2, d5                     \n"  // G
+    "vmlal.u8   q8, d3, d6                     \n"  // R
+    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
+    "vqadd.u8   d0, d7                         \n"
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"
+  : "+r"(src_rgba),  // %0
+    "+r"(dst_y),  // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+  );
+}
+#endif  // HAS_RGBATOYROW_NEON
+
+#ifdef HAS_RGB24TOYROW_NEON
+void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
+  asm volatile (
+    "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient
+    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
+    "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient
+    "vmov.u8    d7, #16                        \n"  // Add 16 constant
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RGB24.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q8, d0, d4                     \n"  // B
+    "vmlal.u8   q8, d1, d5                     \n"  // G
+    "vmlal.u8   q8, d2, d6                     \n"  // R
+    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
+    "vqadd.u8   d0, d7                         \n"
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"
+  : "+r"(src_rgb24),  // %0
+    "+r"(dst_y),  // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+  );
+}
+#endif  // HAS_RGB24TOYROW_NEON
+
+#ifdef HAS_RAWTOYROW_NEON
+void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
+  asm volatile (
+    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
+    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
+    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
+    "vmov.u8    d7, #16                        \n"  // Add 16 constant
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RAW.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q8, d0, d4                     \n"  // B
+    "vmlal.u8   q8, d1, d5                     \n"  // G
+    "vmlal.u8   q8, d2, d6                     \n"  // R
+    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
+    "vqadd.u8   d0, d7                         \n"
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"
+  : "+r"(src_raw),  // %0
+    "+r"(dst_y),  // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+  );
+}
+#endif  // HAS_RAWTOYROW_NEON
+
+// Bilinear filter 16x2 -> 16x1
+#ifdef HAS_INTERPOLATEROW_NEON
+void InterpolateRow_NEON(uint8* dst_ptr,
+                         const uint8* src_ptr, ptrdiff_t src_stride,
+                         int dst_width, int source_y_fraction) {
+  asm volatile (
+    "cmp        %4, #0                         \n"
+    "beq        100f                           \n"
+    "add        %2, %1                         \n"
+    "cmp        %4, #64                        \n"
+    "beq        75f                            \n"
+    "cmp        %4, #128                       \n"
+    "beq        50f                            \n"
+    "cmp        %4, #192                       \n"
+    "beq        25f                            \n"
+
+    "vdup.8     d5, %4                         \n"
+    "rsb        %4, #256                       \n"
+    "vdup.8     d4, %4                         \n"
+    // General purpose row blend.
+  "1:                                          \n"
+    MEMACCESS(1)
+    "vld1.8     {q0}, [%1]!                    \n"
+    MEMACCESS(2)
+    "vld1.8     {q1}, [%2]!                    \n"
+    "subs       %3, %3, #16                    \n"
+    "vmull.u8   q13, d0, d4                    \n"
+    "vmull.u8   q14, d1, d4                    \n"
+    "vmlal.u8   q13, d2, d5                    \n"
+    "vmlal.u8   q14, d3, d5                    \n"
+    "vrshrn.u16 d0, q13, #8                    \n"
+    "vrshrn.u16 d1, q14, #8                    \n"
+    MEMACCESS(0)
+    "vst1.8     {q0}, [%0]!                    \n"
+    "bgt        1b                             \n"
+    "b          99f                            \n"
+
+    // Blend 25 / 75.
+  "25:                                         \n"
+    MEMACCESS(1)
+    "vld1.8     {q0}, [%1]!                    \n"
+    MEMACCESS(2)
+    "vld1.8     {q1}, [%2]!                    \n"
+    "subs       %3, %3, #16                    \n"
+    "vrhadd.u8  q0, q1                         \n"
+    "vrhadd.u8  q0, q1                         \n"
+    MEMACCESS(0)
+    "vst1.8     {q0}, [%0]!                    \n"
+    "bgt        25b                            \n"
+    "b          99f                            \n"
+
+    // Blend 50 / 50.
+  "50:                                         \n"
+    MEMACCESS(1)
+    "vld1.8     {q0}, [%1]!                    \n"
+    MEMACCESS(2)
+    "vld1.8     {q1}, [%2]!                    \n"
+    "subs       %3, %3, #16                    \n"
+    "vrhadd.u8  q0, q1                         \n"
+    MEMACCESS(0)
+    "vst1.8     {q0}, [%0]!                    \n"
+    "bgt        50b                            \n"
+    "b          99f                            \n"
+
+    // Blend 75 / 25.
+  "75:                                         \n"
+    MEMACCESS(1)
+    "vld1.8     {q1}, [%1]!                    \n"
+    MEMACCESS(2)
+    "vld1.8     {q0}, [%2]!                    \n"
+    "subs       %3, %3, #16                    \n"
+    "vrhadd.u8  q0, q1                         \n"
+    "vrhadd.u8  q0, q1                         \n"
+    MEMACCESS(0)
+    "vst1.8     {q0}, [%0]!                    \n"
+    "bgt        75b                            \n"
+    "b          99f                            \n"
+
+    // Blend 100 / 0 - Copy row unchanged.
+  "100:                                        \n"
+    MEMACCESS(1)
+    "vld1.8     {q0}, [%1]!                    \n"
+    "subs       %3, %3, #16                    \n"
+    MEMACCESS(0)
+    "vst1.8     {q0}, [%0]!                    \n"
+    "bgt        100b                           \n"
+
+  "99:                                         \n"
+  : "+r"(dst_ptr),          // %0
+    "+r"(src_ptr),          // %1
+    "+r"(src_stride),       // %2
+    "+r"(dst_width),        // %3
+    "+r"(source_y_fraction) // %4
+  :
+  : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"
+  );
+}
+#endif  // HAS_INTERPOLATEROW_NEON
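
Outside the four special-cased fractions (0, 64, 128 and 192, handled by the
100/75/50/25 branches), the general path blends the two rows with 8-bit
weights and vrshrn rounding. A scalar sketch, assuming source_y_fraction is
in 0..256 and row1 = row0 + src_stride as in the NEON setup:

    /* dst = lerp(row0, row1, f / 256), rounded; f = source_y_fraction. */
    static void InterpolateRowC(unsigned char* dst, const unsigned char* row0,
                                const unsigned char* row1, int width, int f) {
      int i;
      for (i = 0; i < width; ++i) {
        dst[i] =
            (unsigned char)((row0[i] * (256 - f) + row1[i] * f + 128) >> 8);
      }
    }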
+
+// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
+#ifdef HAS_ARGBBLENDROW_NEON
+void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+                       uint8* dst_argb, int width) {
+  asm volatile (
+    "subs       %3, #8                         \n"
+    "blt        89f                            \n"
+    // Blend 8 pixels.
+  "8:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB0.
+    MEMACCESS(1)
+    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 pixels of ARGB1.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q10, d4, d3                    \n"  // db * a
+    "vmull.u8   q11, d5, d3                    \n"  // dg * a
+    "vmull.u8   q12, d6, d3                    \n"  // dr * a
+    "vqrshrn.u16 d20, q10, #8                  \n"  // db >>= 8
+    "vqrshrn.u16 d21, q11, #8                  \n"  // dg >>= 8
+    "vqrshrn.u16 d22, q12, #8                  \n"  // dr >>= 8
+    "vqsub.u8   q2, q2, q10                    \n"  // dbg - dbg * a / 256
+    "vqsub.u8   d6, d6, d22                    \n"  // dr - dr * a / 256
+    "vqadd.u8   q0, q0, q2                     \n"  // + sbg
+    "vqadd.u8   d2, d2, d6                     \n"  // + sr
+    "vmov.u8    d3, #255                       \n"  // a = 255
+    MEMACCESS(2)
+    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 pixels of ARGB.
+    "bge        8b                             \n"
+
+  "89:                                         \n"
+    "adds       %3, #8-1                       \n"
+    "blt        99f                            \n"
+
+    // Blend 1 pixel at a time.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n"  // load 1 pixel ARGB0.
+    MEMACCESS(1)
+    "vld4.8     {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n"  // load 1 pixel ARGB1.
+    "subs       %3, %3, #1                     \n"  // 1 processed per loop.
+    "vmull.u8   q10, d4, d3                    \n"  // db * a
+    "vmull.u8   q11, d5, d3                    \n"  // dg * a
+    "vmull.u8   q12, d6, d3                    \n"  // dr * a
+    "vqrshrn.u16 d20, q10, #8                  \n"  // db >>= 8
+    "vqrshrn.u16 d21, q11, #8                  \n"  // dg >>= 8
+    "vqrshrn.u16 d22, q12, #8                  \n"  // dr >>= 8
+    "vqsub.u8   q2, q2, q10                    \n"  // dbg - dbg * a / 256
+    "vqsub.u8   d6, d6, d22                    \n"  // dr - dr * a / 256
+    "vqadd.u8   q0, q0, q2                     \n"  // + sbg
+    "vqadd.u8   d2, d2, d6                     \n"  // + sr
+    "vmov.u8    d3, #255                       \n"  // a = 255
+    MEMACCESS(2)
+    "vst4.8     {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n"  // store 1 pixel.
+    "bge        1b                             \n"
+
+  "99:                                         \n"
+
+  : "+r"(src_argb0),    // %0
+    "+r"(src_argb1),    // %1
+    "+r"(dst_argb),     // %2
+    "+r"(width)         // %3
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12"
+  );
+}
+#endif  // HAS_ARGBBLENDROW_NEON
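
The blend implements the premultiplied "over" operator from the comment
above, dst' = src + dst - dst * sa / 256, with saturating arithmetic and a
forced opaque result. One channel in scalar form, as a sketch:

    /* src is premultiplied; sa is the source alpha byte. */
    static unsigned char BlendChannel(unsigned char sc, unsigned char sa,
                                      unsigned char dc) {
      int out = sc + dc - ((dc * sa + 128) >> 8);  /* vqrshrn rounds */
      return (unsigned char)(out > 255 ? 255 : out);
    }

The destination alpha is then set to 255 outright, as the final vmov.u8 d3
does before the store.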
+
+// Attenuate 8 pixels at a time.
+#ifdef HAS_ARGBATTENUATEROW_NEON
+void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+  asm volatile (
+    // Attenuate 8 pixels.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q10, d0, d3                    \n"  // b * a
+    "vmull.u8   q11, d1, d3                    \n"  // g * a
+    "vmull.u8   q12, d2, d3                    \n"  // r * a
+    "vqrshrn.u16 d0, q10, #8                   \n"  // b >>= 8
+    "vqrshrn.u16 d1, q11, #8                   \n"  // g >>= 8
+    "vqrshrn.u16 d2, q12, #8                   \n"  // r >>= 8
+    MEMACCESS(1)
+    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),   // %0
+    "+r"(dst_argb),   // %1
+    "+r"(width)       // %2
+  :
+  : "cc", "memory", "q0", "q1", "q10", "q11", "q12"
+  );
+}
+#endif  // HAS_ARGBATTENUATEROW_NEON
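
Attenuation premultiplies each color channel by its alpha, using the rounded
divide-by-256 that vqrshrn.u16 #8 provides; alpha itself is stored back
unchanged. A per-pixel sketch on a B,G,R,A byte quad:

    static void AttenuatePixel(unsigned char* p) {  /* p = {B, G, R, A} */
      unsigned int a = p[3];
      p[0] = (unsigned char)((p[0] * a + 128) >> 8);
      p[1] = (unsigned char)((p[1] * a + 128) >> 8);
      p[2] = (unsigned char)((p[2] * a + 128) >> 8);
    }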
+
+// Quantize 8 ARGB pixels (32 bytes).
+// dst = (dst * scale >> 16) * interval_size + interval_offset;
+#ifdef HAS_ARGBQUANTIZEROW_NEON
+void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
+                          int interval_offset, int width) {
+  asm volatile (
+    "vdup.u16   q8, %2                         \n"
+    "vshr.u16   q8, q8, #1                     \n"  // scale >>= 1
+    "vdup.u16   q9, %3                         \n"  // interval multiply.
+    "vdup.u16   q10, %4                        \n"  // interval add
+
+    // 8 pixel loop.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]         \n"  // load 8 pixels of ARGB.
+    "subs       %1, %1, #8                     \n"  // 8 processed per loop.
+    "vmovl.u8   q0, d0                         \n"  // b (0 .. 255)
+    "vmovl.u8   q1, d2                         \n"
+    "vmovl.u8   q2, d4                         \n"
+    "vqdmulh.s16 q0, q0, q8                    \n"  // b * scale
+    "vqdmulh.s16 q1, q1, q8                    \n"  // g
+    "vqdmulh.s16 q2, q2, q8                    \n"  // r
+    "vmul.u16   q0, q0, q9                     \n"  // b * interval_size
+    "vmul.u16   q1, q1, q9                     \n"  // g
+    "vmul.u16   q2, q2, q9                     \n"  // r
+    "vadd.u16   q0, q0, q10                    \n"  // b + interval_offset
+    "vadd.u16   q1, q1, q10                    \n"  // g
+    "vadd.u16   q2, q2, q10                    \n"  // r
+    "vqmovn.u16 d0, q0                         \n"
+    "vqmovn.u16 d2, q1                         \n"
+    "vqmovn.u16 d4, q2                         \n"
+    MEMACCESS(0)
+    "vst4.8     {d0, d2, d4, d6}, [%0]!        \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"
+  : "+r"(dst_argb),       // %0
+    "+r"(width)           // %1
+  : "r"(scale),           // %2
+    "r"(interval_size),   // %3
+    "r"(interval_offset)  // %4
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
+  );
+}
+#endif  // HAS_ARGBQUANTIZEROW_NEON
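
The quantizer applies the posterize formula from the comment above: scale the
channel down to an interval index, then rescale by interval_size and re-bias.
The NEON code gets the (v * scale) >> 16 term from vqdmulh by halving scale
up front (vqdmulh doubles the product). In scalar form, as a sketch:

    static unsigned char Quantize(unsigned char v, int scale,
                                  int interval_size, int interval_offset) {
      int out = ((v * scale) >> 16) * interval_size + interval_offset;
      return (unsigned char)(out > 255 ? 255 : out);
    }

Note that alpha (d6) is deliberately loaded and stored without being
requantized.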
+
+// Shade 8 pixels at a time by specified value.
+// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8.
+// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
+#ifdef HAS_ARGBSHADEROW_NEON
+void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
+                       uint32 value) {
+  asm volatile (
+    "vdup.u32   q0, %3                         \n"  // duplicate scale value.
+    "vzip.u8    d0, d1                         \n"  // d0 aarrggbb.
+    "vshr.u16   q0, q0, #1                     \n"  // scale / 2.
+
+    // 8 pixel loop.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d20, d22, d24, d26}, [%0]!    \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmovl.u8   q10, d20                       \n"  // b (0 .. 255)
+    "vmovl.u8   q11, d22                       \n"
+    "vmovl.u8   q12, d24                       \n"
+    "vmovl.u8   q13, d26                       \n"
+    "vqrdmulh.s16 q10, q10, d0[0]              \n"  // b * scale * 2
+    "vqrdmulh.s16 q11, q11, d0[1]              \n"  // g
+    "vqrdmulh.s16 q12, q12, d0[2]              \n"  // r
+    "vqrdmulh.s16 q13, q13, d0[3]              \n"  // a
+    "vqmovn.u16 d20, q10                       \n"
+    "vqmovn.u16 d22, q11                       \n"
+    "vqmovn.u16 d24, q12                       \n"
+    "vqmovn.u16 d26, q13                       \n"
+    MEMACCESS(1)
+    "vst4.8     {d20, d22, d24, d26}, [%1]!    \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),       // %0
+    "+r"(dst_argb),       // %1
+    "+r"(width)           // %2
+  : "r"(value)            // %3
+  : "cc", "memory", "q0", "q10", "q11", "q12", "q13"
+  );
+}
+#endif  // HAS_ARGBSHADEROW_NEON
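
Shade multiplies each channel by the matching byte of 'value', normalized so
255 acts as identity: vzip replicates each value byte into a halfword
(v * 257), the pre-shift by 1 cancels vqrdmulh's doubling, and the rounded
high half is kept. Approximately, as a scalar sketch:

    /* ~ c * v / 255; the exact NEON result can differ by vqrdmulh rounding. */
    static unsigned char Shade(unsigned char c, unsigned char v) {
      return (unsigned char)((c * v * 257 + 32768) >> 16);
    }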
+
+// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
+// Similar to ARGBToYJ but stores ARGB.
+// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
+#ifdef HAS_ARGBGRAYROW_NEON
+void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+  asm volatile (
+    "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient
+    "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient
+    "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q2, d0, d24                    \n"  // B
+    "vmlal.u8   q2, d1, d25                    \n"  // G
+    "vmlal.u8   q2, d2, d26                    \n"  // R
+    "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit B
+    "vmov       d1, d0                         \n"  // G
+    "vmov       d2, d0                         \n"  // R
+    MEMACCESS(1)
+    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 ARGB pixels.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)      // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+  );
+}
+#endif  // HAS_ARGBGRAYROW_NEON
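
Gray uses the weights quoted in the comment above (15, 75, 38 over 128) and
fans the single result out to B, G and R while alpha passes through. The
weights sum to 128, so no clamping is needed; as a scalar sketch:

    static unsigned char GrayLevel(unsigned char r, unsigned char g,
                                   unsigned char b) {
      return (unsigned char)((38 * r + 75 * g + 15 * b + 64) >> 7);
    }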
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+//    b = (r * 35 + g * 68 + b * 17) >> 7
+//    g = (r * 45 + g * 88 + b * 22) >> 7
+//    r = (r * 50 + g * 98 + b * 24) >> 7
+
+#ifdef HAS_ARGBSEPIAROW_NEON
+void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
+  asm volatile (
+    "vmov.u8    d20, #17                       \n"  // BB coefficient
+    "vmov.u8    d21, #68                       \n"  // BG coefficient
+    "vmov.u8    d22, #35                       \n"  // BR coefficient
+    "vmov.u8    d24, #22                       \n"  // GB coefficient
+    "vmov.u8    d25, #88                       \n"  // GG coefficient
+    "vmov.u8    d26, #45                       \n"  // GR coefficient
+    "vmov.u8    d28, #24                       \n"  // BB coefficient
+    "vmov.u8    d29, #98                       \n"  // BG coefficient
+    "vmov.u8    d30, #50                       \n"  // BR coefficient
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]         \n"  // load 8 ARGB pixels.
+    "subs       %1, %1, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q2, d0, d20                    \n"  // B to Sepia B
+    "vmlal.u8   q2, d1, d21                    \n"  // G
+    "vmlal.u8   q2, d2, d22                    \n"  // R
+    "vmull.u8   q3, d0, d24                    \n"  // B to Sepia G
+    "vmlal.u8   q3, d1, d25                    \n"  // G
+    "vmlal.u8   q3, d2, d26                    \n"  // R
+    "vmull.u8   q8, d0, d28                    \n"  // B to Sepia R
+    "vmlal.u8   q8, d1, d29                    \n"  // G
+    "vmlal.u8   q8, d2, d30                    \n"  // R
+    "vqshrn.u16 d0, q2, #7                     \n"  // 16 bit to 8 bit B
+    "vqshrn.u16 d1, q3, #7                     \n"  // 16 bit to 8 bit G
+    "vqshrn.u16 d2, q8, #7                     \n"  // 16 bit to 8 bit R
+    MEMACCESS(0)
+    "vst4.8     {d0, d1, d2, d3}, [%0]!        \n"  // store 8 ARGB pixels.
+    "bgt        1b                             \n"
+  : "+r"(dst_argb),  // %0
+    "+r"(width)      // %1
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3",
+    "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+#endif  // HAS_ARGBSEPIAROW_NEON
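
Sepia is the fixed 3x3 matrix quoted above, applied in place with unsigned
saturation (vqshrn clamps the >> 7 results; the red row sums to 172/128, so
it can exceed 255 for bright pixels). Per-pixel sketch on a B,G,R,A quad:

    static void SepiaPixel(unsigned char* p) {  /* p = {B, G, R, A} */
      int b = p[0], g = p[1], r = p[2];
      int nb = (r * 35 + g * 68 + b * 17) >> 7;
      int ng = (r * 45 + g * 88 + b * 22) >> 7;
      int nr = (r * 50 + g * 98 + b * 24) >> 7;
      p[0] = (unsigned char)(nb > 255 ? 255 : nb);
      p[1] = (unsigned char)(ng > 255 ? 255 : ng);
      p[2] = (unsigned char)(nr > 255 ? 255 : nr);
    }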
+
+// Transform 8 ARGB pixels (32 bytes) with color matrix.
+// TODO(fbarchard): Was same as Sepia except matrix is provided.  This function
+// needs to saturate.  Consider doing a non-saturating version.
+#ifdef HAS_ARGBCOLORMATRIXROW_NEON
+void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
+                             const int8* matrix_argb, int width) {
+  asm volatile (
+    MEMACCESS(3)
+    "vld1.8     {q2}, [%3]                     \n"  // load 3 ARGB vectors.
+    "vmovl.s8   q0, d4                         \n"  // B,G coefficients s16.
+    "vmovl.s8   q1, d5                         \n"  // R,A coefficients s16.
+
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d16, d18, d20, d22}, [%0]!    \n"  // load 8 ARGB pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmovl.u8   q8, d16                        \n"  // b (0 .. 255) 16 bit
+    "vmovl.u8   q9, d18                        \n"  // g
+    "vmovl.u8   q10, d20                       \n"  // r
+    "vmovl.u8   q15, d22                       \n"  // a
+    "vmul.s16   q12, q8, d0[0]                 \n"  // B = B * Matrix B
+    "vmul.s16   q13, q8, d1[0]                 \n"  // G = B * Matrix G
+    "vmul.s16   q14, q8, d2[0]                 \n"  // R = B * Matrix R
+    "vmul.s16   q15, q8, d3[0]                 \n"  // A = B * Matrix A
+    "vmul.s16   q4, q9, d0[1]                  \n"  // B += G * Matrix B
+    "vmul.s16   q5, q9, d1[1]                  \n"  // G += G * Matrix G
+    "vmul.s16   q6, q9, d2[1]                  \n"  // R += G * Matrix R
+    "vmul.s16   q7, q9, d3[1]                  \n"  // A += G * Matrix A
+    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
+    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
+    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
+    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
+    "vmul.s16   q4, q10, d0[2]                 \n"  // B += R * Matrix B
+    "vmul.s16   q5, q10, d1[2]                 \n"  // G += R * Matrix G
+    "vmul.s16   q6, q10, d2[2]                 \n"  // R += R * Matrix R
+    "vmul.s16   q7, q10, d3[2]                 \n"  // A += R * Matrix A
+    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
+    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
+    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
+    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
+    "vmul.s16   q4, q15, d0[3]                 \n"  // B += A * Matrix B
+    "vmul.s16   q5, q15, d1[3]                 \n"  // G += A * Matrix G
+    "vmul.s16   q6, q15, d2[3]                 \n"  // R += A * Matrix R
+    "vmul.s16   q7, q15, d3[3]                 \n"  // A += A * Matrix A
+    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
+    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
+    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
+    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
+    "vqshrun.s16 d16, q12, #6                  \n"  // 16 bit to 8 bit B
+    "vqshrun.s16 d18, q13, #6                  \n"  // 16 bit to 8 bit G
+    "vqshrun.s16 d20, q14, #6                  \n"  // 16 bit to 8 bit R
+    "vqshrun.s16 d22, q15, #6                  \n"  // 16 bit to 8 bit A
+    MEMACCESS(1)
+    "vst4.8     {d16, d18, d20, d22}, [%1]!    \n"  // store 8 ARGB pixels.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),   // %0
+    "+r"(dst_argb),   // %1
+    "+r"(width)       // %2
+  : "r"(matrix_argb)  // %3
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
+    "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+#endif  // HAS_ARGBCOLORMATRIXROW_NEON
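
matrix_argb is 16 signed bytes, one 4-coefficient row per output channel in
B, G, R, A order; each output is the dot product of (b, g, r, a) with its
row, scaled down by 64 and saturated (vqshrun.s16 #6). A scalar sketch (the
NEON version also saturates the 16-bit accumulators, which this omits):

    static void ColorMatrixPixel(unsigned char* p, const signed char* m) {
      int b = p[0], g = p[1], r = p[2], a = p[3], i;
      for (i = 0; i < 4; ++i) {
        int v = (b * m[i * 4] + g * m[i * 4 + 1] +
                 r * m[i * 4 + 2] + a * m[i * 4 + 3]) >> 6;
        p[i] = (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
      }
    }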
+
+// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+#ifdef HAS_ARGBMULTIPLYROW_NEON
+void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  asm volatile (
+    // 8 pixel loop.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(1)
+    "vld4.8     {d1, d3, d5, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q0, d0, d1                     \n"  // multiply B
+    "vmull.u8   q1, d2, d3                     \n"  // multiply G
+    "vmull.u8   q2, d4, d5                     \n"  // multiply R
+    "vmull.u8   q3, d6, d7                     \n"  // multiply A
+    "vrshrn.u16 d0, q0, #8                     \n"  // 16 bit to 8 bit B
+    "vrshrn.u16 d1, q1, #8                     \n"  // 16 bit to 8 bit G
+    "vrshrn.u16 d2, q2, #8                     \n"  // 16 bit to 8 bit R
+    "vrshrn.u16 d3, q3, #8                     \n"  // 16 bit to 8 bit A
+    MEMACCESS(2)
+    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+    "bgt        1b                             \n"
+
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3"
+  );
+}
+#endif  // HAS_ARGBMULTIPLYROW_NEON
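
Multiply is a straight per-channel modulate with vrshrn's rounded divide by
256, so 255 behaves as "almost one" (255 * 255 maps to 254, hence the TODO
above). One channel as a scalar sketch:

    static unsigned char MultiplyChannel(unsigned char s, unsigned char d) {
      return (unsigned char)((s * d + 128) >> 8);
    }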
+
+// Add 2 rows of ARGB pixels together, 8 pixels at a time.
+#ifdef HAS_ARGBADDROW_NEON
+void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+                     uint8* dst_argb, int width) {
+  asm volatile (
+    // 8 pixel loop.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(1)
+    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vqadd.u8   q0, q0, q2                     \n"  // add B, G
+    "vqadd.u8   q1, q1, q3                     \n"  // add R, A
+    MEMACCESS(2)
+    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+    "bgt        1b                             \n"
+
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3"
+  );
+}
+#endif  // HAS_ARGBADDROW_NEON
+
+// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
+#ifdef HAS_ARGBSUBTRACTROW_NEON
+void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  asm volatile (
+    // 8 pixel loop.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(1)
+    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vqsub.u8   q0, q0, q2                     \n"  // subtract B, G
+    "vqsub.u8   q1, q1, q3                     \n"  // subtract R, A
+    MEMACCESS(2)
+    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+    "bgt        1b                             \n"
+
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3"
+  );
+}
+#endif  // HAS_ARGBSUBTRACTROW_NEON
+
+// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+#ifdef HAS_SOBELROW_NEON
+void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                     uint8* dst_argb, int width) {
+  asm volatile (
+    "vmov.u8    d3, #255                       \n"  // alpha
+    // 8 pixel loop.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {d0}, [%0]!                    \n"  // load 8 sobelx.
+    MEMACCESS(1)
+    "vld1.8     {d1}, [%1]!                    \n"  // load 8 sobely.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vqadd.u8   d0, d0, d1                     \n"  // add
+    "vmov.u8    d1, d0                         \n"
+    "vmov.u8    d2, d0                         \n"
+    MEMACCESS(2)
+    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+    "bgt        1b                             \n"
+  : "+r"(src_sobelx),  // %0
+    "+r"(src_sobely),  // %1
+    "+r"(dst_argb),    // %2
+    "+r"(width)        // %3
+  :
+  : "cc", "memory", "q0", "q1"
+  );
+}
+#endif  // HAS_SOBELROW_NEON
+
+// Adds Sobel X and Sobel Y and stores Sobel into plane.
+#ifdef HAS_SOBELTOPLANEROW_NEON
+void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                          uint8* dst_y, int width) {
+  asm volatile (
+    // 16 pixel loop.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 16 sobelx.
+    MEMACCESS(1)
+    "vld1.8     {q1}, [%1]!                    \n"  // load 16 sobely.
+    "subs       %3, %3, #16                    \n"  // 16 processed per loop.
+    "vqadd.u8   q0, q0, q1                     \n"  // add
+    MEMACCESS(2)
+    "vst1.8     {q0}, [%2]!                    \n"  // store 16 pixels.
+    "bgt        1b                             \n"
+  : "+r"(src_sobelx),  // %0
+    "+r"(src_sobely),  // %1
+    "+r"(dst_y),       // %2
+    "+r"(width)        // %3
+  :
+  : "cc", "memory", "q0", "q1"
+  );
+}
+#endif  // HAS_SOBELTOPLANEROW_NEON
+
+// Mixes Sobel X, Sobel Y and Sobel into ARGB.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+#ifdef HAS_SOBELXYROW_NEON
+void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                     uint8* dst_argb, int width) {
+  asm volatile (
+    "vmov.u8    d3, #255                       \n"  // alpha
+    // 8 pixel loop.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {d2}, [%0]!                    \n"  // load 8 sobelx.
+    MEMACCESS(1)
+    "vld1.8     {d0}, [%1]!                    \n"  // load 8 sobely.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vqadd.u8   d1, d0, d2                     \n"  // add
+    MEMACCESS(2)
+    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+    "bgt        1b                             \n"
+  : "+r"(src_sobelx),  // %0
+    "+r"(src_sobely),  // %1
+    "+r"(dst_argb),    // %2
+    "+r"(width)        // %3
+  :
+  : "cc", "memory", "q0", "q1"
+  );
+}
+#endif  // HAS_SOBELXYROW_NEON
+
+// SobelX as a matrix is
+// -1  0  1
+// -2  0  2
+// -1  0  1
+#ifdef HAS_SOBELXROW_NEON
+void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
+                    const uint8* src_y2, uint8* dst_sobelx, int width) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {d0}, [%0],%5                  \n"  // top
+    MEMACCESS(0)
+    "vld1.8     {d1}, [%0],%6                  \n"
+    "vsubl.u8   q0, d0, d1                     \n"
+    MEMACCESS(1)
+    "vld1.8     {d2}, [%1],%5                  \n"  // center * 2
+    MEMACCESS(1)
+    "vld1.8     {d3}, [%1],%6                  \n"
+    "vsubl.u8   q1, d2, d3                     \n"
+    "vadd.s16   q0, q0, q1                     \n"
+    "vadd.s16   q0, q0, q1                     \n"
+    MEMACCESS(2)
+    "vld1.8     {d2}, [%2],%5                  \n"  // bottom
+    MEMACCESS(2)
+    "vld1.8     {d3}, [%2],%6                  \n"
+    "subs       %4, %4, #8                     \n"  // 8 pixels
+    "vsubl.u8   q1, d2, d3                     \n"
+    "vadd.s16   q0, q0, q1                     \n"
+    "vabs.s16   q0, q0                         \n"
+    "vqmovn.u16 d0, q0                         \n"
+    MEMACCESS(3)
+    "vst1.8     {d0}, [%3]!                    \n"  // store 8 sobelx
+    "bgt        1b                             \n"
+  : "+r"(src_y0),      // %0
+    "+r"(src_y1),      // %1
+    "+r"(src_y2),      // %2
+    "+r"(dst_sobelx),  // %3
+    "+r"(width)        // %4
+  : "r"(2),            // %5
+    "r"(6)             // %6
+  : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
+#endif  // HAS_SOBELXROW_NEON
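
Per output byte, the loop takes differences between columns two pixels apart
on three consecutive rows, weights them 1-2-1 (the center-row term is added
twice), then takes the absolute value with saturation. As a scalar sketch,
where y0/y1/y2 are the three input rows:

    static unsigned char SobelXPoint(const unsigned char* y0,
                                     const unsigned char* y1,
                                     const unsigned char* y2, int i) {
      int v = (y0[i] - y0[i + 2]) + 2 * (y1[i] - y1[i + 2]) +
              (y2[i] - y2[i + 2]);
      if (v < 0) v = -v;
      return (unsigned char)(v > 255 ? 255 : v);
    }

SobelYRow_NEON below is the transposed version of the same computation,
differencing two rows at column offsets 0, 1 and 2.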
+
+// SobelY as a matrix is
+// -1 -2 -1
+//  0  0  0
+//  1  2  1
+#ifdef HAS_SOBELYROW_NEON
+void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
+                    uint8* dst_sobely, int width) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {d0}, [%0],%4                  \n"  // left
+    MEMACCESS(1)
+    "vld1.8     {d1}, [%1],%4                  \n"
+    "vsubl.u8   q0, d0, d1                     \n"
+    MEMACCESS(0)
+    "vld1.8     {d2}, [%0],%4                  \n"  // center * 2
+    MEMACCESS(1)
+    "vld1.8     {d3}, [%1],%4                  \n"
+    "vsubl.u8   q1, d2, d3                     \n"
+    "vadd.s16   q0, q0, q1                     \n"
+    "vadd.s16   q0, q0, q1                     \n"
+    MEMACCESS(0)
+    "vld1.8     {d2}, [%0],%5                  \n"  // right
+    MEMACCESS(1)
+    "vld1.8     {d3}, [%1],%5                  \n"
+    "subs       %3, %3, #8                     \n"  // 8 pixels
+    "vsubl.u8   q1, d2, d3                     \n"
+    "vadd.s16   q0, q0, q1                     \n"
+    "vabs.s16   q0, q0                         \n"
+    "vqmovn.u16 d0, q0                         \n"
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 sobely
+    "bgt        1b                             \n"
+  : "+r"(src_y0),      // %0
+    "+r"(src_y1),      // %1
+    "+r"(dst_sobely),  // %2
+    "+r"(width)        // %3
+  : "r"(1),            // %4
+    "r"(6)             // %5
+  : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
+#endif  // HAS_SOBELYROW_NEON
+#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
--- a/third_party/libyuv/source/row_posix.cc
+++ b/third_party/libyuv/source/row_posix.cc
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "third_party/libyuv/include/libyuv/row.h"
+#include "libyuv/row.h"
 
 #ifdef __cplusplus
 namespace libyuv {
--- a/third_party/libyuv/source/row_win.cc
+++ b/third_party/libyuv/source/row_win.cc
@@ -8,16 +8,180 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "third_party/libyuv/include/libyuv/row.h"
+#include "libyuv/row.h"
 
+#if defined (_M_X64)
+#include <emmintrin.h>
+#include <tmmintrin.h>  // For _mm_maddubs_epi16
+#endif
+
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
 #endif
 
-// This module is for Visual C x86.
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+// This module is for Visual C.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
 
+#define YG 74  /* (int8)(1.164 * 64 + 0.5) */
+
+#define UB 127  /* min(127,(int8)(2.018 * 64)) */
+#define UG -25  /* (int8)(-0.391 * 64 - 0.5) */
+#define UR 0
+
+#define VB 0
+#define VG -52  /* (int8)(-0.813 * 64 - 0.5) */
+#define VR 102  /* (int8)(1.596 * 64 + 0.5) */
+
+// Bias
+#define BB UB * 128 + VB * 128
+#define BG UG * 128 + VG * 128
+#define BR UR * 128 + VR * 128
+
+static const vec8 kUVToB = {
+  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
+};
+
+static const vec8 kUVToR = {
+  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
+};
+
+static const vec8 kUVToG = {
+  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
+};
+
+static const vec8 kVUToB = {
+  VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
+};
+
+static const vec8 kVUToR = {
+  VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
+};
+
+static const vec8 kVUToG = {
+  VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+};
+
+static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
+static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
+static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
+static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
+static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
+
+// 64 bit
+#if defined(_M_X64)
+
+// Aligned destination version.
+__declspec(align(16))
+void I422ToARGBRow_SSSE3(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* dst_argb,
+                         int width) {
+
+  __m128i xmm0, xmm1, xmm2, xmm3;
+  const __m128i xmm5 = _mm_set1_epi8(-1);
+  const __m128i xmm4 = _mm_setzero_si128();
+  const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
+
+  while (width > 0) {
+    xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);
+    xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
+    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
+    xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
+    xmm1 = _mm_load_si128(&xmm0);
+    xmm2 = _mm_load_si128(&xmm0);
+    xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB);
+    xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG);
+    xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR);
+    xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB);
+    xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG);
+    xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR);
+    xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
+    xmm3 = _mm_unpacklo_epi8(xmm3, xmm4);
+    xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16);
+    xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb);
+    xmm0 = _mm_adds_epi16(xmm0, xmm3);
+    xmm1 = _mm_adds_epi16(xmm1, xmm3);
+    xmm2 = _mm_adds_epi16(xmm2, xmm3);
+    xmm0 = _mm_srai_epi16(xmm0, 6);
+    xmm1 = _mm_srai_epi16(xmm1, 6);
+    xmm2 = _mm_srai_epi16(xmm2, 6);
+    xmm0 = _mm_packus_epi16(xmm0, xmm0);
+    xmm1 = _mm_packus_epi16(xmm1, xmm1);
+    xmm2 = _mm_packus_epi16(xmm2, xmm2);
+    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
+    xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
+    xmm1 = _mm_load_si128(&xmm0);
+    xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
+    xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);
+
+    _mm_store_si128((__m128i *)dst_argb, xmm0);
+    _mm_store_si128((__m128i *)(dst_argb + 16), xmm1);
+
+    y_buf += 8;
+    u_buf += 4;
+    dst_argb += 32;
+    width -= 8;
+  }
+}
+
+// Unaligned destination version.
+void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+                                   const uint8* u_buf,
+                                   const uint8* v_buf,
+                                   uint8* dst_argb,
+                                   int width) {
+
+  __m128i xmm0, xmm1, xmm2, xmm3;
+  const __m128i xmm5 = _mm_set1_epi8(-1);
+  const __m128i xmm4 = _mm_setzero_si128();
+  const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
+
+  while (width > 0) {
+    xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);
+    xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
+    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
+    xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
+    xmm1 = _mm_load_si128(&xmm0);
+    xmm2 = _mm_load_si128(&xmm0);
+    xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB);
+    xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG);
+    xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR);
+    xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB);
+    xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG);
+    xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR);
+    xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
+    xmm3 = _mm_unpacklo_epi8(xmm3, xmm4);
+    xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16);
+    xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb);
+    xmm0 = _mm_adds_epi16(xmm0, xmm3);
+    xmm1 = _mm_adds_epi16(xmm1, xmm3);
+    xmm2 = _mm_adds_epi16(xmm2, xmm3);
+    xmm0 = _mm_srai_epi16(xmm0, 6);
+    xmm1 = _mm_srai_epi16(xmm1, 6);
+    xmm2 = _mm_srai_epi16(xmm2, 6);
+    xmm0 = _mm_packus_epi16(xmm0, xmm0);
+    xmm1 = _mm_packus_epi16(xmm1, xmm1);
+    xmm2 = _mm_packus_epi16(xmm2, xmm2);
+    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
+    xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
+    xmm1 = _mm_load_si128(&xmm0);
+    xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
+    xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);
+
+    _mm_storeu_si128((__m128i *)dst_argb, xmm0);
+    _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1);
+
+    y_buf += 8;
+    u_buf += 4;
+    dst_argb += 32;
+    width -= 8;
+  }
+}
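
Both intrinsic paths above compute the same 6-bit fixed-point conversion and
differ only in the final store (aligned vs. unaligned). Per pixel, using the
YG/UB/UG/UR/VB/VG/VR constants defined at the top of this module, the math
reduces to this scalar sketch:

    static unsigned char Clamp0To255(int v) {
      return (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
    }
    static void YuvToBgr(unsigned char y, unsigned char u, unsigned char v,
                         unsigned char* b, unsigned char* g,
                         unsigned char* r) {
      int y1 = (y - 16) * 74;                          /* YG */
      *b = Clamp0To255((127 * (u - 128) + y1) >> 6);   /* UB; VB = 0 */
      *g = Clamp0To255((-25 * (u - 128) - 52 * (v - 128) + y1) >> 6);
      *r = Clamp0To255((102 * (v - 128) + y1) >> 6);   /* VR; UR = 0 */
    }
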
+// 32 bit
+#else  // defined(_M_X64)
+
 #ifdef HAS_ARGBTOYROW_SSSE3
 
 // Constants for ARGB.
@@ -2030,21 +2194,6 @@
 }
 #endif  // HAS_ARGBTOYROW_SSSE3
 
-#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
-
-#define UB 127 /* min(63,(int8)(2.018 * 64)) */
-#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
-#define UR 0
-
-#define VB 0
-#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
-#define VR 102 /* (int8)(1.596 * 64 + 0.5) */
-
-// Bias
-#define BB UB * 128 + VB * 128
-#define BG UG * 128 + VG * 128
-#define BR UR * 128 + VR * 128
-
 #ifdef HAS_I422TOARGBROW_AVX2
 
 static const lvec8 kUVToB_AVX = {
@@ -2079,10 +2228,10 @@
 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
 __declspec(naked) __declspec(align(16))
 void I422ToARGBRow_AVX2(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* dst_argb,
-                         int width) {
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* dst_argb,
+                        int width) {
   __asm {
     push       esi
     push       edi
@@ -2150,36 +2299,6 @@
 
 #ifdef HAS_I422TOARGBROW_SSSE3
 
-static const vec8 kUVToB = {
-  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
-};
-
-static const vec8 kUVToR = {
-  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
-};
-
-static const vec8 kUVToG = {
-  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
-};
-
-static const vec8 kVUToB = {
-  VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
-};
-
-static const vec8 kVUToR = {
-  VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
-};
-
-static const vec8 kVUToG = {
-  VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
-};
-
-static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
-static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
-static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
-static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
-static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
-
 // TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
 
 // Read 8 UV from 444.
@@ -7276,7 +7395,8 @@
 }
 #endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
 
-#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+#endif  // defined(_M_X64)
+#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
 
 #ifdef __cplusplus
 }  // extern "C"
--- /dev/null
+++ b/third_party/libyuv/source/row_x86.asm
@@ -1,0 +1,146 @@
+;
+; Copyright 2012 The LibYuv Project Authors. All rights reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%ifdef __YASM_VERSION_ID__
+%if __YASM_VERSION_ID__ < 01020000h
+%error AVX2 is supported only by yasm 1.2.0 or later.
+%endif
+%endif
+%include "x86inc.asm"
+
+SECTION .text
+
+; cglobal numeric constants are parameters, gpr regs, mm regs
+
+; void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix)
+
+%macro YUY2TOYROW 2-3
+cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix
+%ifidn %1,YUY2
+    pcmpeqb    m2, m2, m2        ; generate mask 0x00ff00ff
+    psrlw      m2, m2, 8
+%endif
+
+    ALIGN      4
+.convertloop:
+    mov%2      m0, [src_yuy2q]
+    mov%2      m1, [src_yuy2q + mmsize]
+    lea        src_yuy2q, [src_yuy2q + mmsize * 2]
+%ifidn %1,YUY2
+    pand       m0, m0, m2   ; YUY2 even bytes are Y
+    pand       m1, m1, m2
+%else
+    psrlw      m0, m0, 8    ; UYVY odd bytes are Y
+    psrlw      m1, m1, 8
+%endif
+    packuswb   m0, m0, m1
+%if cpuflag(AVX2)
+    vpermq     m0, m0, 0xd8
+%endif
+    sub        pixd, mmsize
+    mov%2      [dst_yq], m0
+    lea        dst_yq, [dst_yq + mmsize]
+    jg         .convertloop
+    REP_RET
+%endmacro
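
The macro emits aligned/unaligned MMX, SSE2 and AVX2 variants of one
byte-picking loop: Y is the even byte of each YUY2 pair and the odd byte of
each UYVY pair. Scalar equivalent, as a sketch:

    void YUY2ToYRowC(const unsigned char* src_yuy2, unsigned char* dst_y,
                     int pix) {
      int i;
      for (i = 0; i < pix; ++i)
        dst_y[i] = src_yuy2[i * 2];  /* UYVY: src_uyvy[i * 2 + 1] instead */
    }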
+
+; TODO(fbarchard): Remove MMX.  Add SSSE3 pshufb version.
+INIT_MMX MMX
+YUY2TOYROW YUY2,a,
+YUY2TOYROW YUY2,u,_Unaligned
+YUY2TOYROW UYVY,a,
+YUY2TOYROW UYVY,u,_Unaligned
+INIT_XMM SSE2
+YUY2TOYROW YUY2,a,
+YUY2TOYROW YUY2,u,_Unaligned
+YUY2TOYROW UYVY,a,
+YUY2TOYROW UYVY,u,_Unaligned
+INIT_YMM AVX2
+YUY2TOYROW YUY2,a,
+YUY2TOYROW UYVY,a,
+
+; void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix)
+
+%macro SplitUVRow 1-2
+cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
+    pcmpeqb    m4, m4, m4        ; generate mask 0x00ff00ff
+    psrlw      m4, m4, 8
+    sub        dst_vq, dst_uq
+
+    ALIGN      4
+.convertloop:
+    mov%1      m0, [src_uvq]
+    mov%1      m1, [src_uvq + mmsize]
+    lea        src_uvq, [src_uvq + mmsize * 2]
+    psrlw      m2, m0, 8         ; odd bytes
+    psrlw      m3, m1, 8
+    pand       m0, m0, m4        ; even bytes
+    pand       m1, m1, m4
+    packuswb   m0, m0, m1
+    packuswb   m2, m2, m3
+%if cpuflag(AVX2)
+    vpermq     m0, m0, 0xd8
+    vpermq     m2, m2, 0xd8
+%endif
+    mov%1      [dst_uq], m0
+    mov%1      [dst_uq + dst_vq], m2
+    lea        dst_uq, [dst_uq + mmsize]
+    sub        pixd, mmsize
+    jg         .convertloop
+    REP_RET
+%endmacro
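
SplitUVRow deinterleaves packed UV (as in NV12) into separate planes; the
mask/shift pair extracts the even and odd bytes. Scalar sketch:

    void SplitUVRowC(const unsigned char* src_uv, unsigned char* dst_u,
                     unsigned char* dst_v, int pix) {
      int i;
      for (i = 0; i < pix; ++i) {
        dst_u[i] = src_uv[i * 2];
        dst_v[i] = src_uv[i * 2 + 1];
      }
    }

MergeUVRow_ below is the exact inverse, re-interleaving U and V with
punpcklbw/punpckhbw.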
+
+INIT_MMX MMX
+SplitUVRow a,
+SplitUVRow u,_Unaligned
+INIT_XMM SSE2
+SplitUVRow a,
+SplitUVRow u,_Unaligned
+INIT_YMM AVX2
+SplitUVRow a,
+
+; void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+;                      int width);
+
+%macro MergeUVRow_ 1-2
+cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix
+    sub        src_vq, src_uq
+
+    ALIGN      4
+.convertloop:
+    mov%1      m0, [src_uq]
+    mov%1      m1, [src_vq]
+    lea        src_uq, [src_uq + mmsize]
+    punpcklbw  m2, m0, m1       ; first 8 UV pairs
+    punpckhbw  m0, m0, m1       ; next 8 UV pairs
+%if cpuflag(AVX2)
+    vperm2i128 m1, m2, m0, 0x20  ; low 128 of ymm2 and low 128 of ymm0
+    vperm2i128 m2, m2, m0, 0x31  ; high 128 of ymm2 and high 128 of ymm0
+    mov%1      [dst_uvq], m1
+    mov%1      [dst_uvq + mmsize], m2
+%else
+    mov%1      [dst_uvq], m2
+    mov%1      [dst_uvq + mmsize], m0
+%endif
+    lea        dst_uvq, [dst_uvq + mmsize * 2]
+    sub        pixd, mmsize
+    jg         .convertloop
+    REP_RET
+%endmacro
+
+INIT_MMX MMX
+MergeUVRow_ a,
+MergeUVRow_ u,_Unaligned
+INIT_XMM SSE2
+MergeUVRow_ a,
+MergeUVRow_ u,_Unaligned
+INIT_YMM AVX2
+MergeUVRow_ a,
+
--- a/third_party/libyuv/source/scale.cc
+++ b/third_party/libyuv/source/scale.cc
@@ -8,15 +8,15 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "third_party/libyuv/include/libyuv/scale.h"
+#include "libyuv/scale.h"
 
 #include <assert.h>
 #include <string.h>
 
-#include "third_party/libyuv/include/libyuv/cpu_id.h"
-#include "third_party/libyuv/include/libyuv/planar_functions.h"  // CopyPlane
-#include "third_party/libyuv/include/libyuv/row.h"
-#include "third_party/libyuv/include/libyuv/scale_row.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"  // For CopyPlane
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
 
 #ifdef __cplusplus
 namespace libyuv {
--- /dev/null
+++ b/third_party/libyuv/source/scale_argb.cc
@@ -1,0 +1,809 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"  // For CopyARGB
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+static __inline int Abs(int v) {
+  return v >= 0 ? v : -v;
+}
+
+// ScaleARGB ARGB, 1/2
+// This is an optimized version for scaling down an ARGB image to 1/2 of
+// its original size.
+static void ScaleARGBDown2(int src_width, int src_height,
+                           int dst_width, int dst_height,
+                           int src_stride, int dst_stride,
+                           const uint8* src_argb, uint8* dst_argb,
+                           int x, int dx, int y, int dy,
+                           enum FilterMode filtering) {
+  int j;
+  int row_stride = src_stride * (dy >> 16);
+  void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
+                            uint8* dst_argb, int dst_width) =
+    filtering == kFilterNone ? ScaleARGBRowDown2_C :
+        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C :
+        ScaleARGBRowDown2Box_C);
+  assert(dx == 65536 * 2);  // Test scale factor of 2.
+  assert((dy & 0x1ffff) == 0);  // Test vertical scale is multiple of 2.
+  // Advance to odd row, even column.
+  if (filtering == kFilterBilinear) {
+    src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+  } else {
+    src_argb += (y >> 16) * src_stride + ((x >> 16) - 1) * 4;
+  }
+
+#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
+      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+    ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
+        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
+        ScaleARGBRowDown2Box_SSE2);
+  }
+#elif defined(HAS_SCALEARGBROWDOWN2_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) &&
+      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) {
+    ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Box_NEON :
+        ScaleARGBRowDown2_NEON;
+  }
+#endif
+
+  if (filtering == kFilterLinear) {
+    src_stride = 0;
+  }
+  for (j = 0; j < dst_height; ++j) {
+    ScaleARGBRowDown2(src_argb, src_stride, dst_argb, dst_width);
+    src_argb += row_stride;
+    dst_argb += dst_stride;
+  }
+}
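
For kFilterBox the row function averages a 2x2 block of source pixels per
output pixel (the other modes point-sample or average horizontally only). A
scalar sketch in the spirit of ScaleARGBRowDown2Box_C, with rounding to
match the SIMD versions:

    static void ARGBRowDown2Box(const unsigned char* src, int src_stride,
                                unsigned char* dst, int dst_width) {
      int x, c;
      for (x = 0; x < dst_width; ++x, src += 8, dst += 4) {
        for (c = 0; c < 4; ++c) {  /* B, G, R, A */
          dst[c] = (unsigned char)((src[c] + src[c + 4] +
                                    src[c + src_stride] +
                                    src[c + src_stride + 4] + 2) >> 2);
        }
      }
    }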
+
+// ScaleARGB ARGB, 1/4
+// This is an optimized version for scaling down an ARGB image to 1/4 of
+// its original size.
+static void ScaleARGBDown4Box(int src_width, int src_height,
+                              int dst_width, int dst_height,
+                              int src_stride, int dst_stride,
+                              const uint8* src_argb, uint8* dst_argb,
+                              int x, int dx, int y, int dy) {
+  int j;
+  // Allocate 2 rows of ARGB.
+  const int kRowSize = (dst_width * 2 * 4 + 15) & ~15;
+  align_buffer_64(row, kRowSize * 2);
+  int row_stride = src_stride * (dy >> 16);
+  void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
+    uint8* dst_argb, int dst_width) = ScaleARGBRowDown2Box_C;
+  // Advance to odd row, even column.
+  src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+  assert(dx == 65536 * 4);  // Test scale factor of 4.
+  assert((dy & 0x3ffff) == 0);  // Test vertical scale is multiple of 4.
+#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
+      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+    ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2;
+  }
+#elif defined(HAS_SCALEARGBROWDOWN2_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) &&
+      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) {
+    ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON;
+  }
+#endif
+  for (j = 0; j < dst_height; ++j) {
+    ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2);
+    ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride,
+                      row + kRowSize, dst_width * 2);
+    ScaleARGBRowDown2(row, kRowSize, dst_argb, dst_width);
+    src_argb += row_stride;
+    dst_argb += dst_stride;
+  }
+  free_aligned_buffer_64(row);
+}
+
+// ScaleARGB ARGB Even
+// This is an optimized version for scaling an ARGB image down by an even
+// factor of its original size.
+static void ScaleARGBDownEven(int src_width, int src_height,
+                              int dst_width, int dst_height,
+                              int src_stride, int dst_stride,
+                              const uint8* src_argb, uint8* dst_argb,
+                              int x, int dx, int y, int dy,
+                              enum FilterMode filtering) {
+  int j;
+  int col_step = dx >> 16;
+  int row_stride = (dy >> 16) * src_stride;
+  void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride,
+                               int src_step, uint8* dst_argb, int dst_width) =
+      filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C;
+  assert(IS_ALIGNED(src_width, 2));
+  assert(IS_ALIGNED(src_height, 2));
+  src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
+        ScaleARGBRowDownEven_SSE2;
+  }
+#elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 4) &&
+      IS_ALIGNED(src_argb, 4)) {
+    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
+        ScaleARGBRowDownEven_NEON;
+  }
+#endif
+
+  if (filtering == kFilterLinear) {
+    src_stride = 0;
+  }
+  for (j = 0; j < dst_height; ++j) {
+    ScaleARGBRowDownEven(src_argb, src_stride, col_step, dst_argb, dst_width);
+    src_argb += row_stride;
+    dst_argb += dst_stride;
+  }
+}
+
+// Scale ARGB down with bilinear interpolation.
+static void ScaleARGBBilinearDown(int src_width, int src_height,
+                                  int dst_width, int dst_height,
+                                  int src_stride, int dst_stride,
+                                  const uint8* src_argb, uint8* dst_argb,
+                                  int x, int dx, int y, int dy,
+                                  enum FilterMode filtering) {
+  int j;
+  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_C;
+  void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
+      int dst_width, int x, int dx) =
+      (src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C;
+  int64 xlast = x + (int64)(dst_width - 1) * dx;
+  int64 xl = (dx >= 0) ? x : xlast;
+  int64 xr = (dx >= 0) ? xlast : x;
+  int clip_src_width;
+  xl = (xl >> 16) & ~3;  // Left edge aligned.
+  xr = (xr >> 16) + 1;  // Rightmost pixel used.  Bilinear uses 2 pixels.
+  xr = (xr + 1 + 3) & ~3;  // 1 beyond the 4-pixel-aligned rightmost pixel.
+  if (xr > src_width) {
+    xr = src_width;
+  }
+  clip_src_width = (int)(xr - xl) * 4;  // Width aligned to 4.
+  src_argb += xl * 4;
+  x -= (int)(xl << 16);
+#if defined(HAS_INTERPOLATEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && clip_src_width >= 16) {
+    InterpolateRow = InterpolateRow_Any_SSE2;
+    if (IS_ALIGNED(clip_src_width, 16)) {
+      InterpolateRow = InterpolateRow_Unaligned_SSE2;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) {
+        InterpolateRow = InterpolateRow_SSE2;
+      }
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && clip_src_width >= 16) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(clip_src_width, 16)) {
+      InterpolateRow = InterpolateRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) {
+        InterpolateRow = InterpolateRow_SSSE3;
+      }
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && clip_src_width >= 32) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(clip_src_width, 32)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && clip_src_width >= 16) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(clip_src_width, 16)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && clip_src_width >= 4 &&
+      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) {
+    InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
+    if (IS_ALIGNED(clip_src_width, 4)) {
+      InterpolateRow = InterpolateRow_MIPS_DSPR2;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
+  }
+#endif
+  // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
+  // Allocate a row of ARGB.
+  {
+    align_buffer_64(row, clip_src_width * 4);
+
+    const int max_y = (src_height - 1) << 16;
+    if (y > max_y) {
+      y = max_y;
+    }
+    for (j = 0; j < dst_height; ++j) {
+      int yi = y >> 16;
+      const uint8* src = src_argb + yi * src_stride;
+      if (filtering == kFilterLinear) {
+        ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx);
+      } else {
+        int yf = (y >> 8) & 255;
+        InterpolateRow(row, src, src_stride, clip_src_width, yf);
+        ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx);
+      }
+      dst_argb += dst_stride;
+      y += dy;
+      if (y > max_y) {
+        y = max_y;
+      }
+    }
+    free_aligned_buffer_64(row);
+  }
+}
+
+// Scale ARGB up with bilinear interpolation.
+static void ScaleARGBBilinearUp(int src_width, int src_height,
+                                int dst_width, int dst_height,
+                                int src_stride, int dst_stride,
+                                const uint8* src_argb, uint8* dst_argb,
+                                int x, int dx, int y, int dy,
+                                enum FilterMode filtering) {
+  int j;
+  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_C;
+  void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
+      int dst_width, int x, int dx) =
+      filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
+  const int max_y = (src_height - 1) << 16;
+#if defined(HAS_INTERPOLATEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) {
+    InterpolateRow = InterpolateRow_Any_SSE2;
+    if (IS_ALIGNED(dst_width, 4)) {
+      InterpolateRow = InterpolateRow_Unaligned_SSE2;
+      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+        InterpolateRow = InterpolateRow_SSE2;
+      }
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(dst_width, 4)) {
+      InterpolateRow = InterpolateRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+        InterpolateRow = InterpolateRow_SSSE3;
+      }
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 8) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(dst_width, 8)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && dst_width >= 4) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 1 &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
+    InterpolateRow = InterpolateRow_MIPS_DSPR2;
+  }
+#endif
+  if (src_width >= 32768) {
+    ScaleARGBFilterCols = filtering ?
+        ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
+  }
+#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
+  if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_SSE2)
+  if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
+    ScaleARGBFilterCols = ScaleARGBCols_SSE2;
+  }
+#endif
+  if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+    ScaleARGBFilterCols = ScaleARGBColsUp2_C;
+#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
+        IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
+        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+      ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
+    }
+#endif
+  }
+
+  if (y > max_y) {
+    y = max_y;
+  }
+
+  {
+    int yi = y >> 16;
+    const uint8* src = src_argb + yi * src_stride;
+
+    // Allocate 2 rows of ARGB.
+    const int kRowSize = (dst_width * 4 + 15) & ~15;
+    align_buffer_64(row, kRowSize * 2);
+
+    uint8* rowptr = row;
+    int rowstride = kRowSize;
+    int lasty = yi;
+
+    ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
+    if (src_height > 1) {
+      src += src_stride;
+    }
+    ScaleARGBFilterCols(rowptr + rowstride, src, dst_width, x, dx);
+    src += src_stride;
+
+    for (j = 0; j < dst_height; ++j) {
+      yi = y >> 16;
+      if (yi != lasty) {
+        if (y > max_y) {
+          y = max_y;
+          yi = y >> 16;
+          src = src_argb + yi * src_stride;
+        }
+        if (yi != lasty) {
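+          // A new source row is needed: rescale it into the spare row
+          // buffer, then negate rowstride so rowptr ping-pongs between
+          // the two scaled rows on the next pass.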
+          ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
+          rowptr += rowstride;
+          rowstride = -rowstride;
+          lasty = yi;
+          src += src_stride;
+        }
+      }
+      if (filtering == kFilterLinear) {
+        InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
+      } else {
+        int yf = (y >> 8) & 255;
+        InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
+      }
+      dst_argb += dst_stride;
+      y += dy;
+    }
+    free_aligned_buffer_64(row);
+  }
+}
+
+#ifdef YUVSCALEUP
+// Scale YUV to ARGB up with bilinear interpolation.
+static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
+                                     int dst_width, int dst_height,
+                                     int src_stride_y,
+                                     int src_stride_u,
+                                     int src_stride_v,
+                                     int dst_stride_argb,
+                                     const uint8* src_y,
+                                     const uint8* src_u,
+                                     const uint8* src_v,
+                                     uint8* dst_argb,
+                                     int x, int dx, int y, int dy,
+                                     enum FilterMode filtering) {
+  int j;
+  void (*I422ToARGBRow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = I422ToARGBRow_C;
+#if defined(HAS_I422TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 8) {
+    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(src_width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+        I422ToARGBRow = I422ToARGBRow_SSSE3;
+      }
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && src_width >= 16) {
+    I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(src_width, 16)) {
+      I422ToARGBRow = I422ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && src_width >= 8) {
+    I422ToARGBRow = I422ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(src_width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_width, 4) &&
+      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+    I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
+  }
+#endif
+
+  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_C;
+#if defined(HAS_INTERPOLATEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) {
+    InterpolateRow = InterpolateRow_Any_SSE2;
+    if (IS_ALIGNED(dst_width, 4)) {
+      InterpolateRow = InterpolateRow_Unaligned_SSE2;
+      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+        InterpolateRow = InterpolateRow_SSE2;
+      }
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(dst_width, 4)) {
+      InterpolateRow = InterpolateRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+        InterpolateRow = InterpolateRow_SSSE3;
+      }
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 8) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(dst_width, 8)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && dst_width >= 4) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 1 &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+    InterpolateRow = InterpolateRow_MIPS_DSPR2;
+  }
+#endif
+
+  void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
+      int dst_width, int x, int dx) =
+      filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
+  if (src_width >= 32768) {
+    ScaleARGBFilterCols = filtering ?
+        ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
+  }
+#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
+  if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_SSE2)
+  if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
+    ScaleARGBFilterCols = ScaleARGBCols_SSE2;
+  }
+#endif
+  if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+    ScaleARGBFilterCols = ScaleARGBColsUp2_C;
+#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
+        IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
+        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+      ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
+    }
+#endif
+  }
+
+  const int max_y = (src_height - 1) << 16;
+  if (y > max_y) {
+    y = max_y;
+  }
+  const int kYShift = 1;  // Shift Y by 1 to convert Y plane to UV coordinate.
+  int yi = y >> 16;
+  int uv_yi = yi >> kYShift;
+  const uint8* src_row_y = src_y + yi * src_stride_y;
+  const uint8* src_row_u = src_u + uv_yi * src_stride_u;
+  const uint8* src_row_v = src_v + uv_yi * src_stride_v;
+
+  // Allocate 2 rows of ARGB.
+  const int kRowSize = (dst_width * 4 + 15) & ~15;
+  align_buffer_64(row, kRowSize * 2);
+
+  // Allocate 1 row of ARGB for source conversion.
+  align_buffer_64(argb_row, src_width * 4);
+
+  uint8* rowptr = row;
+  int rowstride = kRowSize;
+  int lasty = yi;
+
+  // TODO(fbarchard): Convert first 2 rows of YUV to ARGB.
+  ScaleARGBFilterCols(rowptr, src_row_y, dst_width, x, dx);
+  if (src_height > 1) {
+    src_row_y += src_stride_y;
+    if (yi & 1) {
+      src_row_u += src_stride_u;
+      src_row_v += src_stride_v;
+    }
+  }
+  ScaleARGBFilterCols(rowptr + rowstride, src_row_y, dst_width, x, dx);
+  if (src_height > 2) {
+    src_row_y += src_stride_y;
+    if (!(yi & 1)) {
+      src_row_u += src_stride_u;
+      src_row_v += src_stride_v;
+    }
+  }
+
+  for (j = 0; j < dst_height; ++j) {
+    yi = y >> 16;
+    if (yi != lasty) {
+      if (y > max_y) {
+        y = max_y;
+        yi = y >> 16;
+        uv_yi = yi >> kYShift;
+        src_row_y = src_y + yi * src_stride_y;
+        src_row_u = src_u + uv_yi * src_stride_u;
+        src_row_v = src_v + uv_yi * src_stride_v;
+      }
+      if (yi != lasty) {
+        // TODO(fbarchard): Convert the clipped region of row.
+        I422ToARGBRow(src_row_y, src_row_u, src_row_v, argb_row, src_width);
+        ScaleARGBFilterCols(rowptr, argb_row, dst_width, x, dx);
+        rowptr += rowstride;
+        rowstride = -rowstride;
+        lasty = yi;
+        src_row_y += src_stride_y;
+        if (yi & 1) {
+          src_row_u += src_stride_u;
+          src_row_v += src_stride_v;
+        }
+      }
+    }
+    if (filtering == kFilterLinear) {
+      InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
+    } else {
+      int yf = (y >> 8) & 255;
+      InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
+    }
+    dst_argb += dst_stride_argb;
+    y += dy;
+  }
+  free_aligned_buffer_64(row);
+  free_aligned_buffer_64(argb_row);
+}
+#endif
+
+// Scale ARGB to/from any dimensions, without interpolation.
+// Fixed point math is used for performance: the upper 16 bits
+// of x and dx are the integer part of the source position and
+// the lower 16 bits are the fractional part.
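+//
+// For example, mapping a 640 pixel source row to 256 destination pixels
+// is a step of 2.5 source pixels per destination pixel, i.e.
+// dx = (640 << 16) / 256 = 0x28000; (x >> 16) selects the source column
+// and the low 16 bits of x hold the fractional position within it.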
+
+static void ScaleARGBSimple(int src_width, int src_height,
+                            int dst_width, int dst_height,
+                            int src_stride, int dst_stride,
+                            const uint8* src_argb, uint8* dst_argb,
+                            int x, int dx, int y, int dy) {
+  int j;
+  void (*ScaleARGBCols)(uint8* dst_argb, const uint8* src_argb,
+      int dst_width, int x, int dx) =
+      (src_width >= 32768) ? ScaleARGBCols64_C : ScaleARGBCols_C;
+#if defined(HAS_SCALEARGBCOLS_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
+    ScaleARGBCols = ScaleARGBCols_SSE2;
+  }
+#endif
+  if (src_width * 2 == dst_width && x < 0x8000) {
+    ScaleARGBCols = ScaleARGBColsUp2_C;
+#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
+        IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
+        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+      ScaleARGBCols = ScaleARGBColsUp2_SSE2;
+    }
+#endif
+  }
+
+  for (j = 0; j < dst_height; ++j) {
+    ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride,
+                  dst_width, x, dx);
+    dst_argb += dst_stride;
+    y += dy;
+  }
+}
+
+// ScaleARGB, the general ARGB scaler.
+// This function in turn calls a scaling function
+// suitable for handling the desired resolutions.
+static void ScaleARGB(const uint8* src, int src_stride,
+                      int src_width, int src_height,
+                      uint8* dst, int dst_stride,
+                      int dst_width, int dst_height,
+                      int clip_x, int clip_y, int clip_width, int clip_height,
+                      enum FilterMode filtering) {
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  // ARGB does not support box filter yet, but allow the user to pass it.
+  // Simplify filtering when possible.
+  filtering = ScaleFilterReduce(src_width, src_height,
+                                dst_width, dst_height,
+                                filtering);
+
+  // Negative src_height means invert the image.
+  if (src_height < 0) {
+    src_height = -src_height;
+    src = src + (src_height - 1) * src_stride;
+    src_stride = -src_stride;
+  }
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+             &x, &y, &dx, &dy);
+  src_width = Abs(src_width);
+  if (clip_x) {
+    int64 clipf = (int64)(clip_x) * dx;
+    x += (clipf & 0xffff);
+    src += (clipf >> 16) * 4;
+    dst += clip_x * 4;
+  }
+  if (clip_y) {
+    int64 clipf = (int64)(clip_y) * dy;
+    y += (clipf & 0xffff);
+    src += (clipf >> 16) * src_stride;
+    dst += clip_y * dst_stride;
+  }
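+  // e.g. clip_x = 3 with dx = 0x28000 (a 2.5 pixel step) gives
+  // clipf = 0x78000: x gains the 0x8000 fraction and src advances
+  // 7 whole pixels (28 bytes).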
+
+  // Special case for integer step values.
+  if (((dx | dy) & 0xffff) == 0) {
+    if (!dx || !dy) {  // 1 pixel wide and/or tall.
+      filtering = kFilterNone;
+    } else {
+      // Optimized even scale down, i.e. 2, 4, 6, 8, 10x.
+      if (!(dx & 0x10000) && !(dy & 0x10000)) {
+        if (dx == 0x20000) {
+          // Optimized 1/2 downsample.
+          ScaleARGBDown2(src_width, src_height,
+                         clip_width, clip_height,
+                         src_stride, dst_stride, src, dst,
+                         x, dx, y, dy, filtering);
+          return;
+        }
+        if (dx == 0x40000 && filtering == kFilterBox) {
+          // Optimized 1/4 box downsample.
+          ScaleARGBDown4Box(src_width, src_height,
+                            clip_width, clip_height,
+                            src_stride, dst_stride, src, dst,
+                            x, dx, y, dy);
+          return;
+        }
+        ScaleARGBDownEven(src_width, src_height,
+                          clip_width, clip_height,
+                          src_stride, dst_stride, src, dst,
+                          x, dx, y, dy, filtering);
+        return;
+      }
+      // Optimized odd scale down, i.e. 3, 5, 7, 9x.
+      if ((dx & 0x10000) && (dy & 0x10000)) {
+        filtering = kFilterNone;
+        if (dx == 0x10000 && dy == 0x10000) {
+          // Straight copy.
+          ARGBCopy(src + (y >> 16) * src_stride + (x >> 16) * 4, src_stride,
+                   dst, dst_stride, clip_width, clip_height);
+          return;
+        }
+      }
+    }
+  }
+  if (dx == 0x10000 && (x & 0xffff) == 0) {
+    // Arbitrary scale vertically, but unscaled horizontally.
+    ScalePlaneVertical(src_height,
+                       clip_width, clip_height,
+                       src_stride, dst_stride, src, dst,
+                       x, y, dy, 4, filtering);
+    return;
+  }
+  if (filtering && dy < 65536) {
+    ScaleARGBBilinearUp(src_width, src_height,
+                        clip_width, clip_height,
+                        src_stride, dst_stride, src, dst,
+                        x, dx, y, dy, filtering);
+    return;
+  }
+  if (filtering) {
+    ScaleARGBBilinearDown(src_width, src_height,
+                          clip_width, clip_height,
+                          src_stride, dst_stride, src, dst,
+                          x, dx, y, dy, filtering);
+    return;
+  }
+  ScaleARGBSimple(src_width, src_height, clip_width, clip_height,
+                  src_stride, dst_stride, src, dst,
+                  x, dx, y, dy);
+}
+
+LIBYUV_API
+int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
+                  int src_width, int src_height,
+                  uint8* dst_argb, int dst_stride_argb,
+                  int dst_width, int dst_height,
+                  int clip_x, int clip_y, int clip_width, int clip_height,
+                  enum FilterMode filtering) {
+  if (!src_argb || src_width == 0 || src_height == 0 ||
+      !dst_argb || dst_width <= 0 || dst_height <= 0 ||
+      clip_x < 0 || clip_y < 0 ||
+      (clip_x + clip_width) > dst_width ||
+      (clip_y + clip_height) > dst_height) {
+    return -1;
+  }
+  ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
+            dst_argb, dst_stride_argb, dst_width, dst_height,
+            clip_x, clip_y, clip_width, clip_height, filtering);
+  return 0;
+}
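+
+// Usage sketch: scale into only the right half of a 320x240 ARGB
+// destination (src, src_stride, src_width and src_height are caller
+// supplied):
+//   ARGBScaleClip(src, src_stride, src_width, src_height,
+//                 dst, 320 * 4, 320, 240,
+//                 160, 0, 160, 240, kFilterBilinear);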
+
+// Scale an ARGB image.
+LIBYUV_API
+int ARGBScale(const uint8* src_argb, int src_stride_argb,
+              int src_width, int src_height,
+              uint8* dst_argb, int dst_stride_argb,
+              int dst_width, int dst_height,
+              enum FilterMode filtering) {
+  if (!src_argb || src_width == 0 || src_height == 0 ||
+      !dst_argb || dst_width <= 0 || dst_height <= 0) {
+    return -1;
+  }
+  ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
+            dst_argb, dst_stride_argb, dst_width, dst_height,
+            0, 0, dst_width, dst_height, filtering);
+  return 0;
+}
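+
+// Usage sketch: halve a 64x64 ARGB image (strides are in bytes):
+//   uint8 src[64 * 64 * 4], dst[32 * 32 * 4];
+//   ARGBScale(src, 64 * 4, 64, 64, dst, 32 * 4, 32, 32, kFilterBilinear);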
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
--- a/third_party/libyuv/source/scale_common.cc
+++ b/third_party/libyuv/source/scale_common.cc
@@ -8,15 +8,15 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "third_party/libyuv/include/libyuv/scale.h"
+#include "libyuv/scale.h"
 
 #include <assert.h>
 #include <string.h>
 
-#include "third_party/libyuv/include/libyuv/cpu_id.h"
-#include "third_party/libyuv/include/libyuv/planar_functions.h"  // CopyARGB
-#include "third_party/libyuv/include/libyuv/row.h"
-#include "third_party/libyuv/include/libyuv/scale_row.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"  // For CopyARGB
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
 
 #ifdef __cplusplus
 namespace libyuv {
--- a/third_party/libyuv/source/scale_mips.cc
+++ b/third_party/libyuv/source/scale_mips.cc
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "third_party/libyuv/include/libyuv/basic_types.h"
-#include "third_party/libyuv/include/libyuv/row.h"
+#include "libyuv/basic_types.h"
+#include "libyuv/row.h"
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -18,7 +18,8 @@
 
 // This module is for GCC MIPS DSPR2
 #if !defined(LIBYUV_DISABLE_MIPS) && \
-    defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+    defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
+    (_MIPS_SIM == _MIPS_SIM_ABI32)
 
 void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst, int dst_width) {
--- a/third_party/libyuv/source/scale_neon.cc
+++ b/third_party/libyuv/source/scale_neon.cc
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "third_party/libyuv/include/libyuv/row.h"
+#include "libyuv/row.h"
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -28,8 +28,10 @@
     ".p2align   2                              \n"
   "1:                                          \n"
     // load even pixels into q0, odd into q1
+    MEMACCESS(0)
     "vld2.8     {q0, q1}, [%0]!                \n"
     "subs       %2, %2, #16                    \n"  // 16 processed per loop
+    MEMACCESS(1)
     "vst1.8     {q1}, [%1]!                    \n"  // store odd pixels
     "bgt        1b                             \n"
   : "+r"(src_ptr),          // %0
@@ -48,7 +50,9 @@
     "add        %1, %0                         \n"
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {q0, q1}, [%0]!                \n"  // load row 1 and post inc
+    MEMACCESS(1)
     "vld1.8     {q2, q3}, [%1]!                \n"  // load row 2 and post inc
     "subs       %3, %3, #16                    \n"  // 16 processed per loop
     "vpaddl.u8  q0, q0                         \n"  // row 1 add adjacent
@@ -57,6 +61,7 @@
     "vpadal.u8  q1, q3                         \n"
     "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack
     "vrshrn.u16 d1, q1, #2                     \n"
+    MEMACCESS(2)
     "vst1.8     {q0}, [%2]!                    \n"
     "bgt        1b                             \n"
   : "+r"(src_ptr),          // %0
@@ -73,8 +78,10 @@
   asm volatile (
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n" // src line 0
     "subs       %2, %2, #8                     \n" // 8 processed per loop
+    MEMACCESS(1)
     "vst1.8     {d2}, [%1]!                    \n"
     "bgt        1b                             \n"
   : "+r"(src_ptr),          // %0
@@ -87,16 +94,20 @@
 
 void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "add        r4, %0, %3                     \n"
-    "add        r5, r4, %3                     \n"
-    "add        %3, r5, %3                     \n"
+  const uint8* src_ptr1 = src_ptr + src_stride;
+  const uint8* src_ptr2 = src_ptr + src_stride * 2;
+  const uint8* src_ptr3 = src_ptr + src_stride * 3;
+  asm volatile (
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {q0}, [%0]!                    \n"   // load up 16x4
-    "vld1.8     {q1}, [r4]!                    \n"
-    "vld1.8     {q2}, [r5]!                    \n"
-    "vld1.8     {q3}, [%3]!                    \n"
+    MEMACCESS(3)
+    "vld1.8     {q1}, [%3]!                    \n"
+    MEMACCESS(4)
+    "vld1.8     {q2}, [%4]!                    \n"
+    MEMACCESS(5)
+    "vld1.8     {q3}, [%5]!                    \n"
     "subs       %2, %2, #4                     \n"
     "vpaddl.u8  q0, q0                         \n"
     "vpadal.u8  q0, q1                         \n"
@@ -105,13 +116,17 @@
     "vpaddl.u16 q0, q0                         \n"
     "vrshrn.u32 d0, q0, #4                     \n"   // divide by 16 w/rounding
     "vmovn.u16  d0, q0                         \n"
+    MEMACCESS(1)
     "vst1.32    {d0[0]}, [%1]!                 \n"
     "bgt        1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width)         // %2
-  : "r"(src_stride)         // %3
-  : "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc"
+  : "+r"(src_ptr),   // %0
+    "+r"(dst_ptr),   // %1
+    "+r"(dst_width), // %2
+    "+r"(src_ptr1),  // %3
+    "+r"(src_ptr2),  // %4
+    "+r"(src_ptr3)   // %5
+  :
+  : "q0", "q1", "q2", "q3", "memory", "cc"
   );
 }
 
@@ -124,9 +139,11 @@
   asm volatile (
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d1, d2, d3}, [%0]!      \n" // src line 0
     "subs       %2, %2, #24                  \n"
     "vmov       d2, d3                       \n" // order d0, d1, d2
+    MEMACCESS(1)
     "vst3.8     {d0, d1, d2}, [%1]!          \n"
     "bgt        1b                           \n"
   : "+r"(src_ptr),          // %0
@@ -145,7 +162,9 @@
     "add        %3, %0                         \n"
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8       {d0, d1, d2, d3}, [%0]!      \n" // src line 0
+    MEMACCESS(3)
     "vld4.8       {d4, d5, d6, d7}, [%3]!      \n" // src line 1
     "subs         %2, %2, #24                  \n"
 
@@ -182,6 +201,7 @@
     "vmlal.u8     q8, d3, d24                  \n"
     "vqrshrn.u16  d2, q8, #2                   \n"
 
+    MEMACCESS(1)
     "vst3.8       {d0, d1, d2}, [%1]!          \n"
 
     "bgt          1b                           \n"
@@ -202,7 +222,9 @@
     "add        %3, %0                         \n"
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8       {d0, d1, d2, d3}, [%0]!      \n" // src line 0
+    MEMACCESS(3)
     "vld4.8       {d4, d5, d6, d7}, [%3]!      \n" // src line 1
     "subs         %2, %2, #24                  \n"
     // average src line 0 with src line 1
@@ -222,6 +244,7 @@
     "vmlal.u8     q3, d3, d24                  \n"
     "vqrshrn.u16  d2, q3, #2                   \n"
 
+    MEMACCESS(1)
     "vst3.8       {d0, d1, d2}, [%1]!          \n"
     "bgt          1b                           \n"
   : "+r"(src_ptr),          // %0
@@ -250,14 +273,18 @@
                          ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
   asm volatile (
+    MEMACCESS(3)
     "vld1.8     {q3}, [%3]                     \n"
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"
     "subs       %2, %2, #12                    \n"
     "vtbl.u8    d4, {d0, d1, d2, d3}, d6       \n"
     "vtbl.u8    d5, {d0, d1, d2, d3}, d7       \n"
+    MEMACCESS(1)
     "vst1.8     {d4}, [%1]!                    \n"
+    MEMACCESS(1)
     "vst1.32    {d5[0]}, [%1]!                 \n"
     "bgt        1b                             \n"
   : "+r"(src_ptr),          // %0
@@ -272,11 +299,15 @@
 void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
                                       ptrdiff_t src_stride,
                                       uint8* dst_ptr, int dst_width) {
+  const uint8* src_ptr1 = src_ptr + src_stride * 2;
+
   asm volatile (
-    "vld1.16    {q13}, [%4]                    \n"
-    "vld1.8     {q14}, [%5]                    \n"
-    "vld1.8     {q15}, [%6]                    \n"
-    "add        r4, %0, %3, lsl #1             \n"
+    MEMACCESS(5)
+    "vld1.16    {q13}, [%5]                    \n"
+    MEMACCESS(6)
+    "vld1.8     {q14}, [%6]                    \n"
+    MEMACCESS(7)
+    "vld1.8     {q15}, [%7]                    \n"
     "add        %3, %0                         \n"
     ".p2align   2                              \n"
   "1:                                          \n"
@@ -285,9 +316,12 @@
     // d1 = 10 50 11 51 12 52 13 53
     // d2 = 20 60 21 61 22 62 23 63
     // d3 = 30 70 31 71 32 72 33 73
+    MEMACCESS(0)
     "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"
+    MEMACCESS(3)
     "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"
-    "vld4.8       {d16, d17, d18, d19}, [r4]!  \n"
+    MEMACCESS(4)
+    "vld4.8       {d16, d17, d18, d19}, [%4]!  \n"
     "subs         %2, %2, #12                  \n"
 
     // Shuffle the input data around to get align the data
@@ -364,18 +398,20 @@
     "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
     "vtbl.u8      d4, {d0, d1, d2}, d29        \n"
 
+    MEMACCESS(1)
     "vst1.8       {d3}, [%1]!                  \n"
+    MEMACCESS(1)
     "vst1.32      {d4[0]}, [%1]!               \n"
     "bgt          1b                           \n"
   : "+r"(src_ptr),          // %0
     "+r"(dst_ptr),          // %1
     "+r"(dst_width),        // %2
-    "+r"(src_stride)        // %3
-  : "r"(&kMult38_Div6),     // %4
-    "r"(&kShuf38_2),        // %5
-    "r"(&kMult38_Div9)      // %6
-  : "r4", "q0", "q1", "q2", "q3", "q8", "q9",
-    "q13", "q14", "q15", "memory", "cc"
+    "+r"(src_stride),       // %3
+    "+r"(src_ptr1)          // %4
+  : "r"(&kMult38_Div6),     // %5
+    "r"(&kShuf38_2),        // %6
+    "r"(&kMult38_Div9)      // %7
+  : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc"
   );
 }
 
@@ -384,7 +420,9 @@
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
   asm volatile (
+    MEMACCESS(4)
     "vld1.16    {q13}, [%4]                    \n"
+    MEMACCESS(5)
     "vld1.8     {q14}, [%5]                    \n"
     "add        %3, %0                         \n"
     ".p2align   2                              \n"
@@ -394,7 +432,9 @@
     // d1 = 10 50 11 51 12 52 13 53
     // d2 = 20 60 21 61 22 62 23 63
     // d3 = 30 70 31 71 32 72 33 73
+    MEMACCESS(0)
     "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"
+    MEMACCESS(3)
     "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"
     "subs         %2, %2, #12                  \n"
 
@@ -461,7 +501,9 @@
     "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
     "vtbl.u8      d4, {d0, d1, d2}, d29        \n"
 
+    MEMACCESS(1)
     "vst1.8       {d3}, [%1]!                  \n"
+    MEMACCESS(1)
     "vst1.32      {d4[0]}, [%1]!               \n"
     "bgt          1b                           \n"
   : "+r"(src_ptr),       // %0
@@ -494,7 +536,9 @@
     "vdup.8       d4, %4                       \n"
     // General purpose row blend.
   "1:                                          \n"
+    MEMACCESS(1)
     "vld1.8       {q0}, [%1]!                  \n"
+    MEMACCESS(2)
     "vld1.8       {q1}, [%2]!                  \n"
     "subs         %3, %3, #16                  \n"
     "vmull.u8     q13, d0, d4                  \n"
@@ -503,6 +547,7 @@
     "vmlal.u8     q14, d3, d5                  \n"
     "vrshrn.u16   d0, q13, #8                  \n"
     "vrshrn.u16   d1, q14, #8                  \n"
+    MEMACCESS(0)
     "vst1.8       {q0}, [%0]!                  \n"
     "bgt          1b                           \n"
     "b            99f                          \n"
@@ -509,11 +554,14 @@
 
     // Blend 25 / 75.
   "25:                                         \n"
+    MEMACCESS(1)
     "vld1.8       {q0}, [%1]!                  \n"
+    MEMACCESS(2)
     "vld1.8       {q1}, [%2]!                  \n"
     "subs         %3, %3, #16                  \n"
     "vrhadd.u8    q0, q1                       \n"
     "vrhadd.u8    q0, q1                       \n"
+    MEMACCESS(0)
     "vst1.8       {q0}, [%0]!                  \n"
     "bgt          25b                          \n"
     "b            99f                          \n"
@@ -520,10 +568,13 @@
 
     // Blend 50 / 50.
   "50:                                         \n"
+    MEMACCESS(1)
     "vld1.8       {q0}, [%1]!                  \n"
+    MEMACCESS(2)
     "vld1.8       {q1}, [%2]!                  \n"
     "subs         %3, %3, #16                  \n"
     "vrhadd.u8    q0, q1                       \n"
+    MEMACCESS(0)
     "vst1.8       {q0}, [%0]!                  \n"
     "bgt          50b                          \n"
     "b            99f                          \n"
@@ -530,11 +581,14 @@
 
     // Blend 75 / 25.
   "75:                                         \n"
+    MEMACCESS(1)
     "vld1.8       {q1}, [%1]!                  \n"
+    MEMACCESS(2)
     "vld1.8       {q0}, [%2]!                  \n"
     "subs         %3, %3, #16                  \n"
     "vrhadd.u8    q0, q1                       \n"
     "vrhadd.u8    q0, q1                       \n"
+    MEMACCESS(0)
     "vst1.8       {q0}, [%0]!                  \n"
     "bgt          75b                          \n"
     "b            99f                          \n"
@@ -541,12 +595,15 @@
 
     // Blend 100 / 0 - Copy row unchanged.
   "100:                                        \n"
+    MEMACCESS(1)
     "vld1.8       {q0}, [%1]!                  \n"
     "subs         %3, %3, #16                  \n"
+    MEMACCESS(0)
     "vst1.8       {q0}, [%0]!                  \n"
     "bgt          100b                         \n"
 
   "99:                                         \n"
+    MEMACCESS(0)
     "vst1.8       {d1[7]}, [%0]                \n"
   : "+r"(dst_ptr),          // %0
     "+r"(src_ptr),          // %1
@@ -564,10 +621,14 @@
     ".p2align   2                              \n"
   "1:                                          \n"
     // load even pixels into q0, odd into q1
+    MEMACCESS(0)
     "vld2.32    {q0, q1}, [%0]!                \n"
+    MEMACCESS(0)
     "vld2.32    {q2, q3}, [%0]!                \n"
     "subs       %2, %2, #8                     \n"  // 8 processed per loop
+    MEMACCESS(1)
     "vst1.8     {q1}, [%1]!                    \n"  // store odd pixels
+    MEMACCESS(1)
     "vst1.8     {q3}, [%1]!                    \n"
     "bgt        1b                             \n"
   : "+r"(src_ptr),          // %0
@@ -585,7 +646,9 @@
     "add        %1, %1, %0                     \n"
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(0)
     "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
     "subs       %3, %3, #8                     \n"  // 8 processed per loop.
     "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
@@ -592,7 +655,9 @@
     "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
     "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
     "vpaddl.u8  q3, q3                         \n"  // A 16 bytes -> 8 shorts.
+    MEMACCESS(1)
     "vld4.8     {d16, d18, d20, d22}, [%1]!    \n"  // load 8 more ARGB pixels.
+    MEMACCESS(1)
     "vld4.8     {d17, d19, d21, d23}, [%1]!    \n"  // load last 8 ARGB pixels.
     "vpadal.u8  q0, q8                         \n"  // B 16 bytes -> 8 shorts.
     "vpadal.u8  q1, q9                         \n"  // G 16 bytes -> 8 shorts.
@@ -602,6 +667,7 @@
     "vrshrn.u16 d1, q1, #2                     \n"
     "vrshrn.u16 d2, q2, #2                     \n"
     "vrshrn.u16 d3, q3, #2                     \n"
+    MEMACCESS(2)
     "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"
     "bgt        1b                             \n"
   : "+r"(src_ptr),          // %0
@@ -621,11 +687,16 @@
     "mov        r12, %3, lsl #2                \n"
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.32    {d0[0]}, [%0], r12             \n"
+    MEMACCESS(0)
     "vld1.32    {d0[1]}, [%0], r12             \n"
+    MEMACCESS(0)
     "vld1.32    {d1[0]}, [%0], r12             \n"
+    MEMACCESS(0)
     "vld1.32    {d1[1]}, [%0], r12             \n"
     "subs       %2, %2, #4                     \n"  // 4 pixels per loop.
+    MEMACCESS(1)
     "vst1.8     {q0}, [%1]!                    \n"
     "bgt        1b                             \n"
   : "+r"(src_argb),    // %0
@@ -646,13 +717,21 @@
     "add        %1, %1, %0                     \n"
     ".p2align   2                              \n"
   "1:                                          \n"
+    MEMACCESS(0)
     "vld1.8     {d0}, [%0], r12                \n"  // Read 4 2x2 blocks -> 2x1
+    MEMACCESS(1)
     "vld1.8     {d1}, [%1], r12                \n"
+    MEMACCESS(0)
     "vld1.8     {d2}, [%0], r12                \n"
+    MEMACCESS(1)
     "vld1.8     {d3}, [%1], r12                \n"
+    MEMACCESS(0)
     "vld1.8     {d4}, [%0], r12                \n"
+    MEMACCESS(1)
     "vld1.8     {d5}, [%1], r12                \n"
+    MEMACCESS(0)
     "vld1.8     {d6}, [%0], r12                \n"
+    MEMACCESS(1)
     "vld1.8     {d7}, [%1], r12                \n"
     "vaddl.u8   q0, d0, d1                     \n"
     "vaddl.u8   q1, d2, d3                     \n"
@@ -665,6 +744,7 @@
     "vrshrn.u16 d0, q0, #2                     \n"  // first 2 pixels.
     "vrshrn.u16 d1, q2, #2                     \n"  // next 2 pixels.
     "subs       %3, %3, #4                     \n"  // 4 pixels per loop.
+    MEMACCESS(2)
     "vst1.8     {q0}, [%2]!                    \n"
     "bgt        1b                             \n"
   : "+r"(src_argb),    // %0
--- a/third_party/libyuv/source/scale_posix.cc
+++ b/third_party/libyuv/source/scale_posix.cc
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "third_party/libyuv/include/libyuv/row.h"
+#include "libyuv/row.h"
 
 #ifdef __cplusplus
 namespace libyuv {
--- a/third_party/libyuv/source/scale_win.cc
+++ b/third_party/libyuv/source/scale_win.cc
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "third_party/libyuv/include/libyuv/row.h"
+#include "libyuv/row.h"
 
 #ifdef __cplusplus
 namespace libyuv {
--- /dev/null
+++ b/third_party/libyuv/source/video_common.cc
@@ -1,0 +1,64 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define ARRAY_SIZE(x) (int)(sizeof(x) / sizeof(x[0]))
+
+struct FourCCAliasEntry {
+  uint32 alias;
+  uint32 canonical;
+};
+
+static const struct FourCCAliasEntry kFourCCAliases[] = {
+  {FOURCC_IYUV, FOURCC_I420},
+  {FOURCC_YU16, FOURCC_I422},
+  {FOURCC_YU24, FOURCC_I444},
+  {FOURCC_YUYV, FOURCC_YUY2},
+  {FOURCC_YUVS, FOURCC_YUY2},  // kCMPixelFormat_422YpCbCr8_yuvs
+  {FOURCC_HDYC, FOURCC_UYVY},
+  {FOURCC_2VUY, FOURCC_UYVY},  // kCMPixelFormat_422YpCbCr8
+  {FOURCC_JPEG, FOURCC_MJPG},  // Note: JPEG has DHT while MJPG does not.
+  {FOURCC_DMB1, FOURCC_MJPG},
+  {FOURCC_BA81, FOURCC_BGGR},
+  {FOURCC_RGB3, FOURCC_RAW },
+  {FOURCC_BGR3, FOURCC_24BG},
+  {FOURCC_CM32, FOURCC_BGRA},  // kCMPixelFormat_32ARGB
+  {FOURCC_CM24, FOURCC_RAW },  // kCMPixelFormat_24RGB
+  {FOURCC_L555, FOURCC_RGBO},  // kCMPixelFormat_16LE555
+  {FOURCC_L565, FOURCC_RGBP},  // kCMPixelFormat_16LE565
+  {FOURCC_5551, FOURCC_RGBO},  // kCMPixelFormat_16LE5551
+};
+// TODO(fbarchard): Consider mapping kCMPixelFormat_32BGRA to FOURCC_ARGB.
+//  {FOURCC_BGRA, FOURCC_ARGB},  // kCMPixelFormat_32BGRA
+
+LIBYUV_API
+uint32 CanonicalFourCC(uint32 fourcc) {
+  int i;
+  for (i = 0; i < ARRAY_SIZE(kFourCCAliases); ++i) {
+    if (kFourCCAliases[i].alias == fourcc) {
+      return kFourCCAliases[i].canonical;
+    }
+  }
+  // Not an alias, so return it as-is.
+  return fourcc;
+}
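+
+// e.g. CanonicalFourCC(FOURCC_YUYV) returns FOURCC_YUY2, while a fourcc
+// not present in the alias table, such as FOURCC_I420, is returned as-is.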
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
--- /dev/null
+++ b/third_party/libyuv/source/x86inc.asm
@@ -1,0 +1,1136 @@
+;*****************************************************************************
+;* x86inc.asm: x264asm abstraction layer
+;*****************************************************************************
+;* Copyright (C) 2005-2012 x264 project
+;*
+;* Authors: Loren Merritt <[email protected]>
+;*          Anton Mitrofanov <[email protected]>
+;*          Jason Garrett-Glaser <[email protected]>
+;*          Henrik Gramner <[email protected]>
+;*
+;* Permission to use, copy, modify, and/or distribute this software for any
+;* purpose with or without fee is hereby granted, provided that the above
+;* copyright notice and this permission notice appear in all copies.
+;*
+;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+;*****************************************************************************
+
+; This is a header file for the x264ASM assembly language, which uses
+; NASM/YASM syntax combined with a large number of macros to provide easy
+; abstraction between different calling conventions (x86_32, win64, linux64).
+; It also has various other useful features to simplify writing the kind of
+; DSP functions that are most often used in x264.
+
+; Unlike the rest of x264, this file is available under an ISC license, as it
+; has significant usefulness outside of x264 and we want it to be available
+; to the largest audience possible.  Of course, if you modify it for your own
+; purposes to add a new feature, we strongly encourage contributing a patch
+; as this feature might be useful for others as well.  Send patches or ideas
+; to [email protected] .
+
+; Local changes for libyuv:
+; remove %define program_name and references in labels
+; rename cpus to uppercase
+
+%define WIN64  0
+%define UNIX64 0
+%if ARCH_X86_64
+    %ifidn __OUTPUT_FORMAT__,win32
+        %define WIN64  1
+    %elifidn __OUTPUT_FORMAT__,win64
+        %define WIN64  1
+    %else
+        %define UNIX64 1
+    %endif
+%endif
+
+%ifdef PREFIX
+    %define mangle(x) _ %+ x
+%else
+    %define mangle(x) x
+%endif
+
+; Name of the .rodata section.
+; Kludge: Something on OS X fails to align .rodata even given an align attribute,
+; so use a different read-only section.
+%macro SECTION_RODATA 0-1 16
+    %ifidn __OUTPUT_FORMAT__,macho64
+        SECTION .text align=%1
+    %elifidn __OUTPUT_FORMAT__,macho
+        SECTION .text align=%1
+        fakegot:
+    %elifidn __OUTPUT_FORMAT__,aout
+        section .text
+    %else
+        SECTION .rodata align=%1
+    %endif
+%endmacro
+
+; aout does not support align=
+%macro SECTION_TEXT 0-1 16
+    %ifidn __OUTPUT_FORMAT__,aout
+        SECTION .text
+    %else
+        SECTION .text align=%1
+    %endif
+%endmacro
+
+%if WIN64
+    %define PIC
+%elif ARCH_X86_64 == 0
+; x86_32 doesn't require PIC.
+; Some distros prefer shared objects to be PIC, but nothing breaks if
+; the code contains a few textrels, so we'll skip that complexity.
+    %undef PIC
+%endif
+%ifdef PIC
+    default rel
+%endif
+
+; Always use long nops (reduces 0x90 spam in disassembly on x86_32)
+CPU amdnop
+
+; Macros to eliminate most code duplication between x86_32 and x86_64:
+; Currently this works only for leaf functions which load all their arguments
+; into registers at the start, and make no other use of the stack. Luckily that
+; covers most of x264's asm.
+
+; PROLOGUE:
+; %1 = number of arguments. loads them from stack if needed.
+; %2 = number of registers used. pushes callee-saved regs if needed.
+; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
+; %4 = list of names to define to registers
+; PROLOGUE can also be invoked by adding the same options to cglobal
+
+; e.g.
+; cglobal foo, 2,3,0, dst, src, tmp
+; declares a function (foo), taking two args (dst and src) and one local variable (tmp)
+
+; TODO Some functions can use some args directly from the stack. If they're the
+; last args then you can just not declare them, but if they're in the middle
+; we need a more flexible macro.
+
+; RET:
+; Pops anything that was pushed by PROLOGUE, and returns.
+
+; REP_RET:
+; Same, but if it doesn't pop anything it becomes a 2-byte ret, for Athlons,
+; which are slow when a normal ret follows a branch.
+
+; registers:
+; rN and rNq are the native-size register holding function argument N
+; rNd, rNw, rNb are dword, word, and byte size
+; rNh is the high 8 bits of the word size
+; rNm is the original location of arg N (a register or on the stack), dword
+; rNmp is native size
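+;
+; e.g. r0d is the dword view of argument 0: ecx on WIN64, edi on UNIX64
+; and eax on x86_32 (see the DECLARE_REG tables below).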
+
+%macro DECLARE_REG 2-3
+    %define r%1q %2
+    %define r%1d %2d
+    %define r%1w %2w
+    %define r%1b %2b
+    %define r%1h %2h
+    %if %0 == 2
+        %define r%1m  %2d
+        %define r%1mp %2
+    %elif ARCH_X86_64 ; memory
+        %define r%1m [rsp + stack_offset + %3]
+        %define r%1mp qword r %+ %1m
+    %else
+        %define r%1m [esp + stack_offset + %3]
+        %define r%1mp dword r %+ %1m
+    %endif
+    %define r%1  %2
+%endmacro
+
+%macro DECLARE_REG_SIZE 3
+    %define r%1q r%1
+    %define e%1q r%1
+    %define r%1d e%1
+    %define e%1d e%1
+    %define r%1w %1
+    %define e%1w %1
+    %define r%1h %3
+    %define e%1h %3
+    %define r%1b %2
+    %define e%1b %2
+%if ARCH_X86_64 == 0
+    %define r%1  e%1
+%endif
+%endmacro
+
+DECLARE_REG_SIZE ax, al, ah
+DECLARE_REG_SIZE bx, bl, bh
+DECLARE_REG_SIZE cx, cl, ch
+DECLARE_REG_SIZE dx, dl, dh
+DECLARE_REG_SIZE si, sil, null
+DECLARE_REG_SIZE di, dil, null
+DECLARE_REG_SIZE bp, bpl, null
+
+; t# defines for when per-arch register allocation is more complex than just function arguments
+
+%macro DECLARE_REG_TMP 1-*
+    %assign %%i 0
+    %rep %0
+        CAT_XDEFINE t, %%i, r%1
+        %assign %%i %%i+1
+        %rotate 1
+    %endrep
+%endmacro
+
+%macro DECLARE_REG_TMP_SIZE 0-*
+    %rep %0
+        %define t%1q t%1 %+ q
+        %define t%1d t%1 %+ d
+        %define t%1w t%1 %+ w
+        %define t%1h t%1 %+ h
+        %define t%1b t%1 %+ b
+        %rotate 1
+    %endrep
+%endmacro
+
+DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
+
+%if ARCH_X86_64
+    %define gprsize 8
+%else
+    %define gprsize 4
+%endif
+
+%macro PUSH 1
+    push %1
+    %assign stack_offset stack_offset+gprsize
+%endmacro
+
+%macro POP 1
+    pop %1
+    %assign stack_offset stack_offset-gprsize
+%endmacro
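+
+; PUSH and POP track stack_offset so that the rNm stack-argument
+; references defined by DECLARE_REG remain valid after registers are
+; saved.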
+
+%macro PUSH_IF_USED 1-*
+    %rep %0
+        %if %1 < regs_used
+            PUSH r%1
+        %endif
+        %rotate 1
+    %endrep
+%endmacro
+
+%macro POP_IF_USED 1-*
+    %rep %0
+        %if %1 < regs_used
+            pop r%1
+        %endif
+        %rotate 1
+    %endrep
+%endmacro
+
+%macro LOAD_IF_USED 1-*
+    %rep %0
+        %if %1 < num_args
+            mov r%1, r %+ %1 %+ mp
+        %endif
+        %rotate 1
+    %endrep
+%endmacro
+
+%macro SUB 2
+    sub %1, %2
+    %ifidn %1, rsp
+        %assign stack_offset stack_offset+(%2)
+    %endif
+%endmacro
+
+%macro ADD 2
+    add %1, %2
+    %ifidn %1, rsp
+        %assign stack_offset stack_offset-(%2)
+    %endif
+%endmacro
+
+%macro movifnidn 2
+    %ifnidn %1, %2
+        mov %1, %2
+    %endif
+%endmacro
+
+%macro movsxdifnidn 2
+    %ifnidn %1, %2
+        movsxd %1, %2
+    %endif
+%endmacro
+
+%macro ASSERT 1
+    %if (%1) == 0
+        %error assert failed
+    %endif
+%endmacro
+
+%macro DEFINE_ARGS 0-*
+    %ifdef n_arg_names
+        %assign %%i 0
+        %rep n_arg_names
+            CAT_UNDEF arg_name %+ %%i, q
+            CAT_UNDEF arg_name %+ %%i, d
+            CAT_UNDEF arg_name %+ %%i, w
+            CAT_UNDEF arg_name %+ %%i, h
+            CAT_UNDEF arg_name %+ %%i, b
+            CAT_UNDEF arg_name %+ %%i, m
+            CAT_UNDEF arg_name %+ %%i, mp
+            CAT_UNDEF arg_name, %%i
+            %assign %%i %%i+1
+        %endrep
+    %endif
+
+    %xdefine %%stack_offset stack_offset
+    %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
+    %assign %%i 0
+    %rep %0
+        %xdefine %1q r %+ %%i %+ q
+        %xdefine %1d r %+ %%i %+ d
+        %xdefine %1w r %+ %%i %+ w
+        %xdefine %1h r %+ %%i %+ h
+        %xdefine %1b r %+ %%i %+ b
+        %xdefine %1m r %+ %%i %+ m
+        %xdefine %1mp r %+ %%i %+ mp
+        CAT_XDEFINE arg_name, %%i, %1
+        %assign %%i %%i+1
+        %rotate 1
+    %endrep
+    %xdefine stack_offset %%stack_offset
+    %assign n_arg_names %0
+%endmacro
+
+%if WIN64 ; Windows x64 ;=================================================
+
+DECLARE_REG 0,  rcx
+DECLARE_REG 1,  rdx
+DECLARE_REG 2,  R8
+DECLARE_REG 3,  R9
+DECLARE_REG 4,  R10, 40
+DECLARE_REG 5,  R11, 48
+DECLARE_REG 6,  rax, 56
+DECLARE_REG 7,  rdi, 64
+DECLARE_REG 8,  rsi, 72
+DECLARE_REG 9,  rbx, 80
+DECLARE_REG 10, rbp, 88
+DECLARE_REG 11, R12, 96
+DECLARE_REG 12, R13, 104
+DECLARE_REG 13, R14, 112
+DECLARE_REG 14, R15, 120
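+
+; The third operand is each argument's stack offset: the 8 byte return
+; address plus 32 bytes of shadow space put argument 4 at [rsp + 40],
+; argument 5 at [rsp + 48], and so on.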
+
+%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
+    %assign num_args %1
+    %assign regs_used %2
+    ASSERT regs_used >= num_args
+    ASSERT regs_used <= 15
+    PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
+    %if mmsize == 8
+        %assign xmm_regs_used 0
+    %else
+        WIN64_SPILL_XMM %3
+    %endif
+    LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
+    DEFINE_ARGS %4
+%endmacro
+
+%macro WIN64_SPILL_XMM 1
+    %assign xmm_regs_used %1
+    ASSERT xmm_regs_used <= 16
+    %if xmm_regs_used > 6
+        SUB rsp, (xmm_regs_used-6)*16+16
+        %assign %%i xmm_regs_used
+        %rep (xmm_regs_used-6)
+            %assign %%i %%i-1
+            movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i
+        %endrep
+    %endif
+%endmacro
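+
+; Win64 treats xmm6-xmm15 as callee-saved, so functions declaring more than 6
+; xmm registers must preserve the extras; e.g. WIN64_SPILL_XMM 8 stores xmm6
+; and xmm7 to a 16-byte-aligned scratch area allocated below rsp.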
+
+%macro WIN64_RESTORE_XMM_INTERNAL 1
+    %if xmm_regs_used > 6
+        %assign %%i xmm_regs_used
+        %rep (xmm_regs_used-6)
+            %assign %%i %%i-1
+            movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)]
+        %endrep
+        add %1, (xmm_regs_used-6)*16+16
+    %endif
+%endmacro
+
+%macro WIN64_RESTORE_XMM 1
+    WIN64_RESTORE_XMM_INTERNAL %1
+    %assign stack_offset stack_offset-((xmm_regs_used-6)*16+16) ; undo WIN64_SPILL_XMM's SUB
+    %assign xmm_regs_used 0
+%endmacro
+
+%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32
+
+%macro RET 0
+    WIN64_RESTORE_XMM_INTERNAL rsp
+    POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
+%if mmsize == 32
+    vzeroupper
+%endif
+    ret
+%endmacro
+
+%elif ARCH_X86_64 ; *nix x64 ;=============================================
+
+DECLARE_REG 0,  rdi
+DECLARE_REG 1,  rsi
+DECLARE_REG 2,  rdx
+DECLARE_REG 3,  rcx
+DECLARE_REG 4,  R8
+DECLARE_REG 5,  R9
+DECLARE_REG 6,  rax, 8
+DECLARE_REG 7,  R10, 16
+DECLARE_REG 8,  R11, 24
+DECLARE_REG 9,  rbx, 32
+DECLARE_REG 10, rbp, 40
+DECLARE_REG 11, R12, 48
+DECLARE_REG 12, R13, 56
+DECLARE_REG 13, R14, 64
+DECLARE_REG 14, R15, 72
+
+%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
+    %assign num_args %1
+    %assign regs_used %2
+    ASSERT regs_used >= num_args
+    ASSERT regs_used <= 15
+    PUSH_IF_USED 9, 10, 11, 12, 13, 14
+    LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
+    DEFINE_ARGS %4
+%endmacro
+
+%define has_epilogue regs_used > 9 || mmsize == 32
+
+%macro RET 0
+    POP_IF_USED 14, 13, 12, 11, 10, 9
+%if mmsize == 32
+    vzeroupper
+%endif
+    ret
+%endmacro
+
+%else ; X86_32 ;==============================================================
+
+DECLARE_REG 0, eax, 4
+DECLARE_REG 1, ecx, 8
+DECLARE_REG 2, edx, 12
+DECLARE_REG 3, ebx, 16
+DECLARE_REG 4, esi, 20
+DECLARE_REG 5, edi, 24
+DECLARE_REG 6, ebp, 28
+%define rsp esp
+
+%macro DECLARE_ARG 1-*
+    %rep %0
+        %define r%1m [esp + stack_offset + 4*%1 + 4]
+        %define r%1mp dword r%1m
+        %rotate 1
+    %endrep
+%endmacro
+
+DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
+
+%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
+    %assign num_args %1
+    %assign regs_used %2
+    %if regs_used > 7
+        %assign regs_used 7
+    %endif
+    ASSERT regs_used >= num_args
+    PUSH_IF_USED 3, 4, 5, 6
+    LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
+    DEFINE_ARGS %4
+%endmacro
+
+%define has_epilogue regs_used > 3 || mmsize == 32
+
+%macro RET 0
+    POP_IF_USED 6, 5, 4, 3
+%if mmsize == 32
+    vzeroupper
+%endif
+    ret
+%endmacro
+
+%endif ;======================================================================
+
+%if WIN64 == 0
+%macro WIN64_SPILL_XMM 1
+%endmacro
+%macro WIN64_RESTORE_XMM 1
+%endmacro
+%endif
+
+%macro REP_RET 0
+    %if has_epilogue
+        RET
+    %else
+        rep ret
+    %endif
+%endmacro
+
+%macro TAIL_CALL 2 ; callee, is_nonadjacent
+    %if has_epilogue
+        call %1
+        RET
+    %elif %2
+        jmp %1
+    %endif
+%endmacro
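+
+; Notes: "rep ret" sidesteps the branch-misprediction penalty some AMD cores
+; incur on a single-byte ret that is itself a branch target; REP_RET falls
+; back to a full RET whenever an epilogue is required. TAIL_CALL with
+; is_nonadjacent=0 and no epilogue emits nothing, falling through into the
+; callee, which must then be placed immediately after the caller.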
+
+;=============================================================================
+; arch-independent part
+;=============================================================================
+
+%assign function_align 16
+
+; Begin a function.
+; Applies any symbol mangling needed for C linkage, and sets up a define such that
+; subsequent uses of the function name automatically refer to the mangled version.
+; Appends cpuflags to the function name if cpuflags has been specified.
+%macro cglobal 1-2+ ; name, [PROLOGUE args]
+%if %0 == 1
+    cglobal_internal %1 %+ SUFFIX
+%else
+    cglobal_internal %1 %+ SUFFIX, %2
+%endif
+%endmacro
+%macro cglobal_internal 1-2+
+    %ifndef cglobaled_%1
+        %xdefine %1 mangle(%1)
+        %xdefine %1.skip_prologue %1 %+ .skip_prologue
+        CAT_XDEFINE cglobaled_, %1, 1
+    %endif
+    %xdefine current_function %1
+    %ifidn __OUTPUT_FORMAT__,elf
+        global %1:function hidden
+    %else
+        global %1
+    %endif
+    align function_align
+    %1:
+    RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
+    %assign stack_offset 0
+    %if %0 > 1
+        PROLOGUE %2
+    %endif
+%endmacro
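+
+; Typical use (hypothetical function name):
+;   INIT_XMM SSE2
+;   cglobal my_func, 3, 5, 8, dst, src, n
+; defines the mangled symbol my_func_SSE2 (hidden on ELF), runs PROLOGUE for
+; 3 args in 5 GPRs and 8 xmm registers, and names the arguments dst, src, n.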
+
+%macro cextern 1
+    %xdefine %1 mangle(%1)
+    CAT_XDEFINE cglobaled_, %1, 1
+    extern %1
+%endmacro
+
+; like cextern, but without the prefix
+%macro cextern_naked 1
+    %xdefine %1 mangle(%1)
+    CAT_XDEFINE cglobaled_, %1, 1
+    extern %1
+%endmacro
+
+%macro const 2+
+    %xdefine %1 mangle(%1)
+    global %1
+    %1: %2
+%endmacro
+
+; This is needed for ELF; otherwise the GNU linker assumes the stack is
+; executable by default.
+%ifidn __OUTPUT_FORMAT__,elf
+SECTION .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf32
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf64
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+
+; cpuflags
+
+%assign cpuflags_MMX      (1<<0)
+%assign cpuflags_MMX2     (1<<1) | cpuflags_MMX
+%assign cpuflags_3dnow    (1<<2) | cpuflags_MMX
+%assign cpuflags_3dnow2   (1<<3) | cpuflags_3dnow
+%assign cpuflags_SSE      (1<<4) | cpuflags_MMX2
+%assign cpuflags_SSE2     (1<<5) | cpuflags_SSE
+%assign cpuflags_SSE2slow (1<<6) | cpuflags_SSE2
+%assign cpuflags_SSE3     (1<<7) | cpuflags_SSE2
+%assign cpuflags_SSSE3    (1<<8) | cpuflags_SSE3
+%assign cpuflags_SSE4     (1<<9) | cpuflags_SSSE3
+%assign cpuflags_SSE42    (1<<10)| cpuflags_SSE4
+%assign cpuflags_AVX      (1<<11)| cpuflags_SSE42
+%assign cpuflags_xop      (1<<12)| cpuflags_AVX
+%assign cpuflags_fma4     (1<<13)| cpuflags_AVX
+%assign cpuflags_AVX2     (1<<14)| cpuflags_AVX
+%assign cpuflags_fma3     (1<<15)| cpuflags_AVX
+
+%assign cpuflags_cache32  (1<<16)
+%assign cpuflags_cache64  (1<<17)
+%assign cpuflags_slowctz  (1<<18)
+%assign cpuflags_lzcnt    (1<<19)
+%assign cpuflags_misalign (1<<20)
+%assign cpuflags_aligned  (1<<21) ; not a cpu feature, but a function variant
+%assign cpuflags_atom     (1<<22)
+%assign cpuflags_bmi1     (1<<23)
+%assign cpuflags_bmi2     (1<<24)|cpuflags_bmi1
+%assign cpuflags_tbm      (1<<25)|cpuflags_bmi1
+
+%define    cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
+%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
+
+; Takes up to 2 cpuflags from the above list.
+; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
+; You shouldn't need to invoke this macro directly; it's a subroutine for INIT_MMX & co.
+%macro INIT_CPUFLAGS 0-2
+    %if %0 >= 1
+        %xdefine cpuname %1
+        %assign cpuflags cpuflags_%1
+        %if %0 >= 2
+            %xdefine cpuname %1_%2
+            %assign cpuflags cpuflags | cpuflags_%2
+        %endif
+        %xdefine SUFFIX _ %+ cpuname
+        %if cpuflag(AVX)
+            %assign AVX_enabled 1
+        %endif
+        %if mmsize == 16 && notcpuflag(SSE2)
+            %define mova movaps
+            %define movu movups
+            %define movnta movntps
+        %endif
+        %if cpuflag(aligned)
+            %define movu mova
+        %elifidn %1, SSE3
+            %define movu lddqu
+        %endif
+    %else
+        %xdefine SUFFIX
+        %undef cpuname
+        %undef cpuflags
+    %endif
+%endmacro
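+
+; e.g. "INIT_XMM SSSE3" sets SUFFIX to _SSSE3, so a subsequent
+; "cglobal foo" (hypothetical name) assembles as foo_SSSE3, and cpuflag(SSE2)
+; etc. are true for everything SSSE3 implies.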
+
+; merge MMX and SSE*
+
+%macro CAT_XDEFINE 3
+    %xdefine %1%2 %3
+%endmacro
+
+%macro CAT_UNDEF 2
+    %undef %1%2
+%endmacro
+
+%macro INIT_MMX 0-1+
+    %assign AVX_enabled 0
+    %define RESET_MM_PERMUTATION INIT_MMX %1
+    %define mmsize 8
+    %define num_mmregs 8
+    %define mova movq
+    %define movu movq
+    %define movh movd
+    %define movnta movntq
+    %assign %%i 0
+    %rep 8
+    CAT_XDEFINE m, %%i, mm %+ %%i
+    CAT_XDEFINE nmm, %%i, %%i
+    %assign %%i %%i+1
+    %endrep
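+    ; clear any m8-m15 aliases left over from a previous INIT_XMM/INIT_YMM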
+    %rep 8
+    CAT_UNDEF m, %%i
+    CAT_UNDEF nmm, %%i
+    %assign %%i %%i+1
+    %endrep
+    INIT_CPUFLAGS %1
+%endmacro
+
+%macro INIT_XMM 0-1+
+    %assign AVX_enabled 0
+    %define RESET_MM_PERMUTATION INIT_XMM %1
+    %define mmsize 16
+    %define num_mmregs 8
+    %if ARCH_X86_64
+    %define num_mmregs 16
+    %endif
+    %define mova movdqa
+    %define movu movdqu
+    %define movh movq
+    %define movnta movntdq
+    %assign %%i 0
+    %rep num_mmregs
+    CAT_XDEFINE m, %%i, xmm %+ %%i
+    CAT_XDEFINE nxmm, %%i, %%i
+    %assign %%i %%i+1
+    %endrep
+    INIT_CPUFLAGS %1
+%endmacro
+
+%macro INIT_YMM 0-1+
+    %assign AVX_enabled 1
+    %define RESET_MM_PERMUTATION INIT_YMM %1
+    %define mmsize 32
+    %define num_mmregs 8
+    %if ARCH_X86_64
+    %define num_mmregs 16
+    %endif
+    %define mova vmovaps
+    %define movu vmovups
+    %undef movh
+    %define movnta vmovntps
+    %assign %%i 0
+    %rep num_mmregs
+    CAT_XDEFINE m, %%i, ymm %+ %%i
+    CAT_XDEFINE nymm, %%i, %%i
+    %assign %%i %%i+1
+    %endrep
+    INIT_CPUFLAGS %1
+%endmacro
+
+INIT_XMM
+
+; I often want to use macros that permute their arguments, e.g. there's no
+; efficient way to implement butterfly or transpose or dct without swapping some
+; arguments.
+;
+; I would like to not have to manually keep track of the permutations:
+; If I insert a permutation in the middle of a function, it should automatically
+; change everything that follows. For more complex macros I may also have multiple
+; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
+;
+; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
+; permutes its arguments. It's equivalent to exchanging the contents of the
+; registers, except that this way you exchange the register names instead, so it
+; doesn't cost any cycles.
+
+%macro PERMUTE 2-* ; takes a list of pairs to swap
+%rep %0/2
+    %xdefine tmp%2 m%2
+    %xdefine ntmp%2 nm%2
+    %rotate 2
+%endrep
+%rep %0/2
+    %xdefine m%1 tmp%2
+    %xdefine nm%1 ntmp%2
+    %undef tmp%2
+    %undef ntmp%2
+    %rotate 2
+%endrep
+%endmacro
+
+%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
+%rep %0-1
+%ifdef m%1
+    %xdefine tmp m%1
+    %xdefine m%1 m%2
+    %xdefine m%2 tmp
+    CAT_XDEFINE n, m%1, %1
+    CAT_XDEFINE n, m%2, %2
+%else
+    ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1", infer the original numbers here.
+    ; Be careful using this mode in nested macros though, as in some cases there may be
+    ; other copies of m# that have already been dereferenced and don't get updated correctly.
+    %xdefine %%n1 n %+ %1
+    %xdefine %%n2 n %+ %2
+    %xdefine tmp m %+ %%n1
+    CAT_XDEFINE m, %%n1, m %+ %%n2
+    CAT_XDEFINE m, %%n2, tmp
+    CAT_XDEFINE n, m %+ %%n1, %%n1
+    CAT_XDEFINE n, m %+ %%n2, %%n2
+%endif
+    %undef tmp
+    %rotate 1
+%endrep
+%endmacro
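+
+; For illustration: after "SWAP 0, 1", subsequent uses of m0 assemble to the
+; register previously named m1 and vice versa; only the names are exchanged,
+; at zero run-time cost.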
+
+; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
+; calls to that function will automatically load the permutation, so values can
+; be returned in mmregs.
+%macro SAVE_MM_PERMUTATION 0-1
+    %if %0
+        %xdefine %%f %1_m
+    %else
+        %xdefine %%f current_function %+ _m
+    %endif
+    %assign %%i 0
+    %rep num_mmregs
+        CAT_XDEFINE %%f, %%i, m %+ %%i
+    %assign %%i %%i+1
+    %endrep
+%endmacro
+
+%macro LOAD_MM_PERMUTATION 1 ; name to load from
+    %ifdef %1_m0
+        %assign %%i 0
+        %rep num_mmregs
+            CAT_XDEFINE m, %%i, %1_m %+ %%i
+            CAT_XDEFINE n, m %+ %%i, %%i
+        %assign %%i %%i+1
+        %endrep
+    %endif
+%endmacro
+
+; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
+%macro call 1
+    call_internal %1, %1 %+ SUFFIX
+%endmacro
+%macro call_internal 2
+    %xdefine %%i %1
+    %ifndef cglobaled_%1
+        %ifdef cglobaled_%2
+            %xdefine %%i %2
+        %endif
+    %endif
+    call %%i
+    LOAD_MM_PERMUTATION %%i
+%endmacro
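+
+; For illustration (hypothetical name): if a helper ends with "SWAP 0, 1"
+; followed by SAVE_MM_PERMUTATION, a later "call helper" through the macro
+; above reloads that permutation, so the caller's m0 names the register in
+; which the helper left its result.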
+
+; Substitutions that reduce instruction size but are functionally equivalent
+%macro add 2
+    %ifnum %2
+        %if %2==128
+            sub %1, -128
+        %else
+            add %1, %2
+        %endif
+    %else
+        add %1, %2
+    %endif
+%endmacro
+
+%macro sub 2
+    %ifnum %2
+        %if %2==128
+            add %1, -128
+        %else
+            sub %1, %2
+        %endif
+    %else
+        sub %1, %2
+    %endif
+%endmacro
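+
+; Rationale: signed 8-bit immediates cover [-128,127], so +128 needs a 4-byte
+; immediate while -128 fits in one byte; rewriting "add r, 128" as
+; "sub r, -128" (and vice versa) saves 3 bytes.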
+
+;=============================================================================
+; AVX abstraction layer
+;=============================================================================
+
+%assign i 0
+%rep 16
+    %if i < 8
+        CAT_XDEFINE sizeofmm, i, 8
+    %endif
+    CAT_XDEFINE sizeofxmm, i, 16
+    CAT_XDEFINE sizeofymm, i, 32
+%assign i i+1
+%endrep
+%undef i
+
+%macro CHECK_AVX_INSTR_EMU 3-*
+    %xdefine %%opcode %1
+    %xdefine %%dst %2
+    %rep %0-2
+        %ifidn %%dst, %3
+            %error non-AVX emulation of ``%%opcode'' is not supported
+        %endif
+        %rotate 1
+    %endrep
+%endmacro
+
+;%1 == instruction
+;%2 == 1 if float, 0 if int
+;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
+;%4 == number of operands given
+;%5+: operands
+%macro RUN_AVX_INSTR 6-7+
+    %ifid %6
+        %define %%sizeofreg sizeof%6
+    %elifid %5
+        %define %%sizeofreg sizeof%5
+    %else
+        %define %%sizeofreg mmsize
+    %endif
+    %if %%sizeofreg==32
+        %if %4>=3
+            v%1 %5, %6, %7
+        %else
+            v%1 %5, %6
+        %endif
+    %else
+        %if %%sizeofreg==8
+            %define %%regmov movq
+        %elif %2
+            %define %%regmov movaps
+        %else
+            %define %%regmov movdqa
+        %endif
+
+        %if %4>=3+%3
+            %ifnidn %5, %6
+                %if AVX_enabled && %%sizeofreg==16
+                    v%1 %5, %6, %7
+                %else
+                    CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7
+                    %%regmov %5, %6
+                    %1 %5, %7
+                %endif
+            %else
+                %1 %5, %7
+            %endif
+        %elif %4>=3
+            %1 %5, %6, %7
+        %else
+            %1 %5, %6
+        %endif
+    %endif
+%endmacro
+
+; 3arg AVX ops with a memory arg can only have it in src2,
+; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov).
+; So, if the op is symmetric and the wrong one is memory, swap them.
+%macro RUN_AVX_INSTR1 8
+    %assign %%swap 0
+    %if AVX_enabled
+        %ifnid %6
+            %assign %%swap 1
+        %endif
+    %elifnidn %5, %6
+        %ifnid %7
+            %assign %%swap 1
+        %endif
+    %endif
+    %if %%swap && %3 == 0 && %8 == 1
+        RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6
+    %else
+        RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7
+    %endif
+%endmacro
+
+;%1 == instruction
+;%2 == 1 if float, 0 if int
+;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
+;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not
+%macro AVX_INSTR 4
+    %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4
+        %ifidn %3, fnord
+            RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
+        %elifidn %4, fnord
+            RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9
+        %elifidn %5, fnord
+            RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
+        %else
+            RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5
+        %endif
+    %endmacro
+%endmacro
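+
+; e.g. with the definitions below, "paddw m0, m1, m2" assembles as
+; "vpaddw xmm0, xmm1, xmm2" when AVX is enabled and is otherwise emulated as
+; "movdqa xmm0, xmm1" followed by "paddw xmm0, xmm2" (register numbers per
+; the current permutation).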
+
+AVX_INSTR addpd, 1, 0, 1
+AVX_INSTR addps, 1, 0, 1
+AVX_INSTR addsd, 1, 0, 1
+AVX_INSTR addss, 1, 0, 1
+AVX_INSTR addsubpd, 1, 0, 0
+AVX_INSTR addsubps, 1, 0, 0
+AVX_INSTR andpd, 1, 0, 1
+AVX_INSTR andps, 1, 0, 1
+AVX_INSTR andnpd, 1, 0, 0
+AVX_INSTR andnps, 1, 0, 0
+AVX_INSTR blendpd, 1, 0, 0
+AVX_INSTR blendps, 1, 0, 0
+AVX_INSTR blendvpd, 1, 0, 0
+AVX_INSTR blendvps, 1, 0, 0
+AVX_INSTR cmppd, 1, 0, 0
+AVX_INSTR cmpps, 1, 0, 0
+AVX_INSTR cmpsd, 1, 0, 0
+AVX_INSTR cmpss, 1, 0, 0
+AVX_INSTR cvtdq2ps, 1, 0, 0
+AVX_INSTR cvtps2dq, 1, 0, 0
+AVX_INSTR divpd, 1, 0, 0
+AVX_INSTR divps, 1, 0, 0
+AVX_INSTR divsd, 1, 0, 0
+AVX_INSTR divss, 1, 0, 0
+AVX_INSTR dppd, 1, 1, 0
+AVX_INSTR dpps, 1, 1, 0
+AVX_INSTR haddpd, 1, 0, 0
+AVX_INSTR haddps, 1, 0, 0
+AVX_INSTR hsubpd, 1, 0, 0
+AVX_INSTR hsubps, 1, 0, 0
+AVX_INSTR maxpd, 1, 0, 1
+AVX_INSTR maxps, 1, 0, 1
+AVX_INSTR maxsd, 1, 0, 1
+AVX_INSTR maxss, 1, 0, 1
+AVX_INSTR minpd, 1, 0, 1
+AVX_INSTR minps, 1, 0, 1
+AVX_INSTR minsd, 1, 0, 1
+AVX_INSTR minss, 1, 0, 1
+AVX_INSTR movhlps, 1, 0, 0
+AVX_INSTR movlhps, 1, 0, 0
+AVX_INSTR movsd, 1, 0, 0
+AVX_INSTR movss, 1, 0, 0
+AVX_INSTR mpsadbw, 0, 1, 0
+AVX_INSTR mulpd, 1, 0, 1
+AVX_INSTR mulps, 1, 0, 1
+AVX_INSTR mulsd, 1, 0, 1
+AVX_INSTR mulss, 1, 0, 1
+AVX_INSTR orpd, 1, 0, 1
+AVX_INSTR orps, 1, 0, 1
+AVX_INSTR pabsb, 0, 0, 0
+AVX_INSTR pabsw, 0, 0, 0
+AVX_INSTR pabsd, 0, 0, 0
+AVX_INSTR packsswb, 0, 0, 0
+AVX_INSTR packssdw, 0, 0, 0
+AVX_INSTR packuswb, 0, 0, 0
+AVX_INSTR packusdw, 0, 0, 0
+AVX_INSTR paddb, 0, 0, 1
+AVX_INSTR paddw, 0, 0, 1
+AVX_INSTR paddd, 0, 0, 1
+AVX_INSTR paddq, 0, 0, 1
+AVX_INSTR paddsb, 0, 0, 1
+AVX_INSTR paddsw, 0, 0, 1
+AVX_INSTR paddusb, 0, 0, 1
+AVX_INSTR paddusw, 0, 0, 1
+AVX_INSTR palignr, 0, 1, 0
+AVX_INSTR pand, 0, 0, 1
+AVX_INSTR pandn, 0, 0, 0
+AVX_INSTR pavgb, 0, 0, 1
+AVX_INSTR pavgw, 0, 0, 1
+AVX_INSTR pblendvb, 0, 0, 0
+AVX_INSTR pblendw, 0, 1, 0
+AVX_INSTR pcmpestri, 0, 0, 0
+AVX_INSTR pcmpestrm, 0, 0, 0
+AVX_INSTR pcmpistri, 0, 0, 0
+AVX_INSTR pcmpistrm, 0, 0, 0
+AVX_INSTR pcmpeqb, 0, 0, 1
+AVX_INSTR pcmpeqw, 0, 0, 1
+AVX_INSTR pcmpeqd, 0, 0, 1
+AVX_INSTR pcmpeqq, 0, 0, 1
+AVX_INSTR pcmpgtb, 0, 0, 0
+AVX_INSTR pcmpgtw, 0, 0, 0
+AVX_INSTR pcmpgtd, 0, 0, 0
+AVX_INSTR pcmpgtq, 0, 0, 0
+AVX_INSTR phaddw, 0, 0, 0
+AVX_INSTR phaddd, 0, 0, 0
+AVX_INSTR phaddsw, 0, 0, 0
+AVX_INSTR phsubw, 0, 0, 0
+AVX_INSTR phsubd, 0, 0, 0
+AVX_INSTR phsubsw, 0, 0, 0
+AVX_INSTR pmaddwd, 0, 0, 1
+AVX_INSTR pmaddubsw, 0, 0, 0
+AVX_INSTR pmaxsb, 0, 0, 1
+AVX_INSTR pmaxsw, 0, 0, 1
+AVX_INSTR pmaxsd, 0, 0, 1
+AVX_INSTR pmaxub, 0, 0, 1
+AVX_INSTR pmaxuw, 0, 0, 1
+AVX_INSTR pmaxud, 0, 0, 1
+AVX_INSTR pminsb, 0, 0, 1
+AVX_INSTR pminsw, 0, 0, 1
+AVX_INSTR pminsd, 0, 0, 1
+AVX_INSTR pminub, 0, 0, 1
+AVX_INSTR pminuw, 0, 0, 1
+AVX_INSTR pminud, 0, 0, 1
+AVX_INSTR pmovmskb, 0, 0, 0
+AVX_INSTR pmulhuw, 0, 0, 1
+AVX_INSTR pmulhrsw, 0, 0, 1
+AVX_INSTR pmulhw, 0, 0, 1
+AVX_INSTR pmullw, 0, 0, 1
+AVX_INSTR pmulld, 0, 0, 1
+AVX_INSTR pmuludq, 0, 0, 1
+AVX_INSTR pmuldq, 0, 0, 1
+AVX_INSTR por, 0, 0, 1
+AVX_INSTR psadbw, 0, 0, 1
+AVX_INSTR pshufb, 0, 0, 0
+AVX_INSTR pshufd, 0, 1, 0
+AVX_INSTR pshufhw, 0, 1, 0
+AVX_INSTR pshuflw, 0, 1, 0
+AVX_INSTR psignb, 0, 0, 0
+AVX_INSTR psignw, 0, 0, 0
+AVX_INSTR psignd, 0, 0, 0
+AVX_INSTR psllw, 0, 0, 0
+AVX_INSTR pslld, 0, 0, 0
+AVX_INSTR psllq, 0, 0, 0
+AVX_INSTR pslldq, 0, 0, 0
+AVX_INSTR psraw, 0, 0, 0
+AVX_INSTR psrad, 0, 0, 0
+AVX_INSTR psrlw, 0, 0, 0
+AVX_INSTR psrld, 0, 0, 0
+AVX_INSTR psrlq, 0, 0, 0
+AVX_INSTR psrldq, 0, 0, 0
+AVX_INSTR psubb, 0, 0, 0
+AVX_INSTR psubw, 0, 0, 0
+AVX_INSTR psubd, 0, 0, 0
+AVX_INSTR psubq, 0, 0, 0
+AVX_INSTR psubsb, 0, 0, 0
+AVX_INSTR psubsw, 0, 0, 0
+AVX_INSTR psubusb, 0, 0, 0
+AVX_INSTR psubusw, 0, 0, 0
+AVX_INSTR ptest, 0, 0, 0
+AVX_INSTR punpckhbw, 0, 0, 0
+AVX_INSTR punpckhwd, 0, 0, 0
+AVX_INSTR punpckhdq, 0, 0, 0
+AVX_INSTR punpckhqdq, 0, 0, 0
+AVX_INSTR punpcklbw, 0, 0, 0
+AVX_INSTR punpcklwd, 0, 0, 0
+AVX_INSTR punpckldq, 0, 0, 0
+AVX_INSTR punpcklqdq, 0, 0, 0
+AVX_INSTR pxor, 0, 0, 1
+AVX_INSTR shufps, 1, 1, 0
+AVX_INSTR subpd, 1, 0, 0
+AVX_INSTR subps, 1, 0, 0
+AVX_INSTR subsd, 1, 0, 0
+AVX_INSTR subss, 1, 0, 0
+AVX_INSTR unpckhpd, 1, 0, 0
+AVX_INSTR unpckhps, 1, 0, 0
+AVX_INSTR unpcklpd, 1, 0, 0
+AVX_INSTR unpcklps, 1, 0, 0
+AVX_INSTR xorpd, 1, 0, 1
+AVX_INSTR xorps, 1, 0, 1
+
+; 3DNow instructions, for sharing code between AVX, SSE and 3DNow
+AVX_INSTR pfadd, 1, 0, 1
+AVX_INSTR pfsub, 1, 0, 0
+AVX_INSTR pfmul, 1, 0, 1
+
+; base-4 constants for shuffles
+%assign i 0
+%rep 256
+    %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
+    %if j < 10
+        CAT_XDEFINE q000, j, i
+    %elif j < 100
+        CAT_XDEFINE q00, j, i
+    %elif j < 1000
+        CAT_XDEFINE q0, j, i
+    %else
+        CAT_XDEFINE q, j, i
+    %endif
+%assign i i+1
+%endrep
+%undef i
+%undef j
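+
+; e.g. q3120 expands to 0xD8 (fields 3,1,2,0 packed into bits 7..0), so
+; "pshufd m0, m1, q3120" places source dwords 0, 2, 1, 3 into lanes 0-3.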
+
+%macro FMA_INSTR 3
+    %macro %1 4-7 %1, %2, %3
+        %if cpuflag(xop)
+            v%5 %1, %2, %3, %4
+        %else
+            %6 %1, %2, %3
+            %7 %1, %4
+        %endif
+    %endmacro
+%endmacro
+
+FMA_INSTR  pmacsdd,  pmulld, paddd
+FMA_INSTR  pmacsww,  pmullw, paddw
+FMA_INSTR pmadcswd, pmaddwd, paddd
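+
+; e.g. "pmacsdd m0, m1, m2, m3" emits the XOP instruction vpmacsdd when
+; available, and otherwise "pmulld m0, m1, m2" followed by "paddd m0, m3"
+; (both of which go through the AVX/SSE emulation above).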
+
+; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf.
+; This lets us use tzcnt without bumping the yasm version requirement yet.
+%define tzcnt rep bsf
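+
+; e.g. "tzcnt eax, ecx" assembles as "rep bsf eax, ecx": BMI1 cores execute a
+; true tzcnt, while older cores ignore the rep prefix and execute plain bsf;
+; the two differ only when the source is zero.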