shithub: openh264

Download patch

ref: d48a90da0da99b87d4f4db38a380217b34b28a0b
parent: 02e824d1253cdf8800c51fb39a359a062cbb0f45
parent: ea4bb892aa149695b43e0b4a90bdb638b4b8842d
author: guangwei <[email protected]>
date: Wed Apr 12 06:16:41 EDT 2017

Merge pull request #2721 from mstorsjo/fix-arm-rounding

Fix ARM downsampler to average horizontally adjacent pixels first, then vertically

--- a/codec/processing/src/arm/down_sample_neon.S
+++ b/codec/processing/src/arm/down_sample_neon.S
@@ -57,10 +57,11 @@
 
     vld1.8 {q0,q1}, [r2]!
     vld1.8 {q2,q3}, [r7]!
-    vrhadd.u8 q0, q0, q2
-    vrhadd.u8 q1, q1, q3
     vuzp.8 q0, q1
+    vuzp.8 q2, q3
     vrhadd.u8 q0, q0, q1
+    vrhadd.u8 q2, q2, q3
+    vrhadd.u8 q0, q0, q2
     vst1.32 {q0},   [r0]!
     add lr, #32
 
@@ -188,10 +189,11 @@
 
     vld1.8 {q0,q1}, [r2]!
     vld1.8 {q2,q3}, [r7]!
-    vrhadd.u8 q0, q0, q2
-    vrhadd.u8 q1, q1, q3
     vuzp.8 q0, q1
+    vuzp.8 q2, q3
     vrhadd.u8 q0, q0, q1
+    vrhadd.u8 q2, q2, q3
+    vrhadd.u8 q0, q0, q2
     vst1.32 {q0},   [r0]!
     subs r6, #1
     bne comp_ds_bilinear_w_x32_loop1
--- a/codec/processing/src/arm64/down_sample_aarch64_neon.S
+++ b/codec/processing/src/arm64/down_sample_aarch64_neon.S
@@ -51,11 +51,13 @@
 
     ld1     {v0.16b, v1.16b}, [x2], #32
     ld1     {v2.16b, v3.16b}, [x7], #32
-    urhadd  v0.16b, v0.16b, v2.16b
-    urhadd  v1.16b, v1.16b, v3.16b
-    uzp1    v2.16b, v0.16b, v1.16b
-    uzp2    v3.16b, v0.16b, v1.16b
-    urhadd  v2.16b, v2.16b, v3.16b
+    uzp1    v4.16b, v0.16b, v1.16b
+    uzp2    v5.16b, v0.16b, v1.16b
+    uzp1    v6.16b, v2.16b, v3.16b
+    uzp2    v7.16b, v2.16b, v3.16b
+    urhadd  v0.16b, v4.16b, v5.16b
+    urhadd  v1.16b, v6.16b, v7.16b
+    urhadd  v2.16b, v0.16b, v1.16b
     st1     {v2.16b}, [x0], #16
     add     w9, w9, #32
 
@@ -92,11 +94,13 @@
 
     ld1     {v0.16b, v1.16b}, [x2], #32
     ld1     {v2.16b, v3.16b}, [x7], #32
-    urhadd  v0.16b, v0.16b, v2.16b
-    urhadd  v1.16b, v1.16b, v3.16b
-    uzp1    v2.16b, v0.16b, v1.16b
-    uzp2    v3.16b, v0.16b, v1.16b
-    urhadd  v2.16b, v2.16b, v3.16b
+    uzp1    v4.16b, v0.16b, v1.16b
+    uzp2    v5.16b, v0.16b, v1.16b
+    uzp1    v6.16b, v2.16b, v3.16b
+    uzp2    v7.16b, v2.16b, v3.16b
+    urhadd  v0.16b, v4.16b, v5.16b
+    urhadd  v1.16b, v6.16b, v7.16b
+    urhadd  v2.16b, v0.16b, v1.16b
     st1     {v2.16b}, [x0], #16
 
     sub     w6, w6, #1