shithub: openh264

Download patch

ref: 2011a7407e64a0a9300337831c71f8ce6bd2db1d
parent: 0da712380b2474fcb8dcbde5307ff916ce2daace
parent: 856f186b8eeb361a9be46b3706d87ea9f2ad6190
author: Ethan Hugg <[email protected]>
date: Fri Jan 17 09:13:24 EST 2014

Merge pull request #147 from volvet/illegal_assembly_fix

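Fix illegal instruction use: rename the x86 deblocking routines from _sse2 to _ssse3 and select them only when the CPU reports WELS_CPU_SSSE3 (previously WELS_CPU_SSE2).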

--- a/codec/common/deblock.asm
+++ b/codec/common/deblock.asm
@@ -60,9 +60,9 @@
 %ifdef  WIN64
 
 
-WELS_EXTERN   DeblockLumaLt4V_sse2
+WELS_EXTERN   DeblockLumaLt4V_ssse3
 
-DeblockLumaLt4V_sse2:
+DeblockLumaLt4V_ssse3:
   push        rbp
   mov         r11,[rsp + 16 + 20h]  ; pTC
   sub         rsp,1B0h
@@ -317,10 +317,10 @@
   ret
 
 
-WELS_EXTERN   DeblockLumaEq4V_sse2
+WELS_EXTERN   DeblockLumaEq4V_ssse3
 
 ALIGN  16
-DeblockLumaEq4V_sse2:
+DeblockLumaEq4V_ssse3:
   mov         rax,rsp
   push        rbx
   push        rbp
@@ -780,10 +780,10 @@
   ret
 
 
-WELS_EXTERN  DeblockChromaLt4V_sse2
+WELS_EXTERN  DeblockChromaLt4V_ssse3
 
 ALIGN  16
-DeblockChromaLt4V_sse2:
+DeblockChromaLt4V_ssse3:
   mov         rax,rsp
   push        rbx
   push        rdi
@@ -942,9 +942,9 @@
   ret
 
 
-WELS_EXTERN   DeblockChromaEq4V_sse2
+WELS_EXTERN   DeblockChromaEq4V_ssse3
 ALIGN 16
-DeblockChromaEq4V_sse2:
+DeblockChromaEq4V_ssse3:
   mov         rax,rsp
   push        rbx
   sub         rsp,90h
@@ -1096,9 +1096,9 @@
 
 
 
-WELS_EXTERN   DeblockChromaEq4H_sse2
+WELS_EXTERN   DeblockChromaEq4H_ssse3
 ALIGN  16
-DeblockChromaEq4H_sse2:
+DeblockChromaEq4H_ssse3:
   mov         rax,rsp
   mov         [rax+20h],rbx
   push        rdi
@@ -1360,9 +1360,9 @@
 
 
 
-WELS_EXTERN DeblockChromaLt4H_sse2
+WELS_EXTERN DeblockChromaLt4H_ssse3
 ALIGN  16
-DeblockChromaLt4H_sse2:
+DeblockChromaLt4H_ssse3:
   mov         rax,rsp
   push        rbx
   push        rbp
@@ -1646,9 +1646,9 @@
 %elifdef  UNIX64
 
 
-WELS_EXTERN   DeblockLumaLt4V_sse2
+WELS_EXTERN   DeblockLumaLt4V_ssse3
 
-DeblockLumaLt4V_sse2:
+DeblockLumaLt4V_ssse3:
   push        rbp
   mov         r11,r8  ; pTC
   sub         rsp,1B0h
@@ -1903,10 +1903,10 @@
   ret
 
 
-WELS_EXTERN DeblockLumaEq4V_sse2
+WELS_EXTERN DeblockLumaEq4V_ssse3
 
 ALIGN  16
-DeblockLumaEq4V_sse2:
+DeblockLumaEq4V_ssse3:
   mov         rax,rsp
   push        rbx
   push        rbp
@@ -2365,9 +2365,9 @@
   pop         rbx
   ret
 
-WELS_EXTERN  DeblockChromaLt4V_sse2
+WELS_EXTERN  DeblockChromaLt4V_ssse3
 ALIGN  16
-DeblockChromaLt4V_sse2:
+DeblockChromaLt4V_ssse3:
   mov         rax,rsp
   push        rbx
   push        rbp
@@ -2533,9 +2533,9 @@
   pop         rbx
   ret
 
-WELS_EXTERN   DeblockChromaEq4V_sse2
+WELS_EXTERN DeblockChromaEq4V_ssse3
 ALIGN 16
-DeblockChromaEq4V_sse2:
+DeblockChromaEq4V_ssse3:
   mov         rax,rsp
   push        rbx
   push        rbp
@@ -2684,10 +2684,10 @@
   pop         rbx
   ret
 
+WELS_EXTERN DeblockChromaEq4H_ssse3
 
-WELS_EXTERN   DeblockChromaEq4H_sse2
 ALIGN  16
-DeblockChromaEq4H_sse2:
+DeblockChromaEq4H_ssse3:
   mov         rax,rsp
   push        rbx
   push        rbp
@@ -2959,9 +2959,9 @@
   ret
 
 
-WELS_EXTERN DeblockChromaLt4H_sse2
+WELS_EXTERN DeblockChromaLt4H_ssse3
 ALIGN  16
-DeblockChromaLt4H_sse2:
+DeblockChromaLt4H_ssse3:
   mov         rax,rsp
   push        rbx
   push        rbp
@@ -3252,13 +3252,13 @@
 %elifdef  X86_32
 
 ;********************************************************************************
-;  void DeblockChromaEq4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+;  void DeblockChromaEq4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
 ;                             int32_t iAlpha, int32_t iBeta)
 ;********************************************************************************
-WELS_EXTERN   DeblockChromaEq4V_sse2
+WELS_EXTERN   DeblockChromaEq4V_ssse3
 
 ALIGN  16
-DeblockChromaEq4V_sse2:
+DeblockChromaEq4V_ssse3:
   push        ebp
   mov         ebp,esp
   and         esp,0FFFFFFF0h
@@ -3421,13 +3421,13 @@
   ret
 
 ;******************************************************************************
-; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+; void DeblockChromaLt4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
 ;                           int32_t iAlpha, int32_t iBeta, int8_t * pTC);
 ;*******************************************************************************
 
-WELS_EXTERN  DeblockChromaLt4V_sse2
+WELS_EXTERN  DeblockChromaLt4V_ssse3
 
-DeblockChromaLt4V_sse2:
+DeblockChromaLt4V_ssse3:
   push        ebp
   mov         ebp,esp
   and         esp,0FFFFFFF0h
@@ -3624,15 +3624,15 @@
   ret
 
 ;***************************************************************************
-;  void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+;  void DeblockChromaEq4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
 ;          int32_t iAlpha, int32_t iBeta)
 ;***************************************************************************
 
-WELS_EXTERN     DeblockChromaEq4H_sse2
+WELS_EXTERN     DeblockChromaEq4H_ssse3
 
 ALIGN  16
 
-DeblockChromaEq4H_sse2:
+DeblockChromaEq4H_ssse3:
   push        ebp
   mov         ebp,esp
   and         esp,0FFFFFFF0h
@@ -3909,15 +3909,15 @@
   ret
 
 ;*******************************************************************************
-;    void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+;    void DeblockChromaLt4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
 ;                                int32_t iAlpha, int32_t iBeta, int8_t * pTC);
 ;*******************************************************************************
 
-WELS_EXTERN  DeblockChromaLt4H_sse2
+WELS_EXTERN  DeblockChromaLt4H_ssse3
 
 ALIGN  16
 
-DeblockChromaLt4H_sse2:
+DeblockChromaLt4H_ssse3:
   push        ebp
   mov         ebp,esp
   and         esp,0FFFFFFF0h
@@ -4224,16 +4224,16 @@
 
 
 ;*******************************************************************************
-;    void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
+;    void DeblockLumaLt4V_ssse3(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
 ;                                 int32_t iBeta, int8_t * pTC)
 ;*******************************************************************************
 
 
-WELS_EXTERN  DeblockLumaLt4V_sse2
+WELS_EXTERN  DeblockLumaLt4V_ssse3
 
 ALIGN  16
 
-DeblockLumaLt4V_sse2:
+DeblockLumaLt4V_ssse3:
     push	ebp
 	mov	ebp, esp
 	and	esp, -16				; fffffff0H
@@ -4616,15 +4616,15 @@
 
 
 ;*******************************************************************************
-;    void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
+;    void DeblockLumaEq4V_ssse3(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
 ;                                 int32_t iBeta)
 ;*******************************************************************************
 
-WELS_EXTERN  DeblockLumaEq4V_sse2
+WELS_EXTERN  DeblockLumaEq4V_ssse3
 
 ALIGN  16
 
-DeblockLumaEq4V_sse2:
+DeblockLumaEq4V_ssse3:
 
 	push	ebp
 	mov	ebp, esp
--- a/codec/common/deblocking_common.cpp
+++ b/codec/common/deblocking_common.cpp
@@ -183,19 +183,19 @@
 
 #ifdef X86_ASM
 extern "C" {
-  void DeblockLumaLt4H_sse2 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc) {
+  void DeblockLumaLt4H_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc) {
     FORCE_STACK_ALIGN_1D (uint8_t,  uiBuf,   16 * 8, 16);
 
     DeblockLumaTransposeH2V_sse2 (pPixY - 4, iStride, &uiBuf[0]);
-    DeblockLumaLt4V_sse2 (&uiBuf[4 * 16], 16, iAlpha, iBeta, pTc);
+    DeblockLumaLt4V_ssse3 (&uiBuf[4 * 16], 16, iAlpha, iBeta, pTc);
     DeblockLumaTransposeV2H_sse2 (pPixY - 4, iStride, &uiBuf[0]);
   }
 
-  void DeblockLumaEq4H_sse2 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
+  void DeblockLumaEq4H_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
     FORCE_STACK_ALIGN_1D (uint8_t,  uiBuf,   16 * 8, 16);
 
     DeblockLumaTransposeH2V_sse2 (pPixY - 4, iStride, &uiBuf[0]);
-    DeblockLumaEq4V_sse2 (&uiBuf[4 * 16], 16, iAlpha, iBeta);
+    DeblockLumaEq4V_ssse3 (&uiBuf[4 * 16], 16, iAlpha, iBeta);
     DeblockLumaTransposeV2H_sse2 (pPixY - 4, iStride, &uiBuf[0]);
   }
 
--- a/codec/common/deblocking_common.h
+++ b/codec/common/deblocking_common.h
@@ -20,17 +20,17 @@
 #endif//__cplusplus
 
 #ifdef  X86_ASM
-void DeblockLumaLt4V_sse2 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
-void DeblockLumaEq4V_sse2 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockLumaLt4V_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
+void DeblockLumaEq4V_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
 void DeblockLumaTransposeH2V_sse2 (uint8_t* pPixY, int32_t iStride, uint8_t* pDst);
 void DeblockLumaTransposeV2H_sse2 (uint8_t* pPixY, int32_t iStride, uint8_t* pSrc);
-void DeblockLumaLt4H_sse2 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
-void DeblockLumaEq4H_sse2 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
-void DeblockChromaEq4V_sse2 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
-void DeblockChromaLt4V_sse2 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
+void DeblockLumaLt4H_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
+void DeblockLumaEq4H_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockChromaEq4V_ssse3 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockChromaLt4V_ssse3 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
                              int8_t* pTC);
-void DeblockChromaEq4H_sse2 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
-void DeblockChromaLt4H_sse2 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
+void DeblockChromaEq4H_ssse3 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockChromaLt4H_ssse3 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
                              int8_t* pTC);
 #endif
 #if defined(__cplusplus)
--- a/codec/decoder/core/src/deblocking.cpp
+++ b/codec/decoder/core/src/deblocking.cpp
@@ -708,15 +708,15 @@
   pFunc->pfChromaDeblockinEQ4Hor	    = DeblockChromaEq4H_c;
 
 #ifdef X86_ASM
-  if (iCpu & WELS_CPU_SSE2) {
-    pFunc->pfLumaDeblockingLT4Ver	= DeblockLumaLt4V_sse2;
-    pFunc->pfLumaDeblockingEQ4Ver	= DeblockLumaEq4V_sse2;
-    pFunc->pfLumaDeblockingLT4Hor   = DeblockLumaLt4H_sse2;
-    pFunc->pfLumaDeblockingEQ4Hor   = DeblockLumaEq4H_sse2;
-    pFunc->pfChromaDeblockingLT4Ver	= DeblockChromaLt4V_sse2;
-    pFunc->pfChromaDeblockingEQ4Ver	= DeblockChromaEq4V_sse2;
-    pFunc->pfChromaDeblockingLT4Hor	= DeblockChromaLt4H_sse2;
-    pFunc->pfChromaDeblockinEQ4Hor	= DeblockChromaEq4H_sse2;
+  if (iCpu & WELS_CPU_SSSE3) {
+    pFunc->pfLumaDeblockingLT4Ver	= DeblockLumaLt4V_ssse3;
+    pFunc->pfLumaDeblockingEQ4Ver	= DeblockLumaEq4V_ssse3;
+    pFunc->pfLumaDeblockingLT4Hor   = DeblockLumaLt4H_ssse3;
+    pFunc->pfLumaDeblockingEQ4Hor   = DeblockLumaEq4H_ssse3;
+    pFunc->pfChromaDeblockingLT4Ver	= DeblockChromaLt4V_ssse3;
+    pFunc->pfChromaDeblockingEQ4Ver	= DeblockChromaEq4V_ssse3;
+    pFunc->pfChromaDeblockingLT4Hor	= DeblockChromaLt4H_ssse3;
+    pFunc->pfChromaDeblockinEQ4Hor	= DeblockChromaEq4H_ssse3;
   }
 #endif
 
--- a/codec/encoder/core/src/deblocking.cpp
+++ b/codec/encoder/core/src/deblocking.cpp
@@ -787,15 +787,15 @@
 
 
 #ifdef X86_ASM
-  if (iCpu & WELS_CPU_SSE2) {
-    pFunc->pfLumaDeblockingLT4Ver	= DeblockLumaLt4V_sse2;
-    pFunc->pfLumaDeblockingEQ4Ver	= DeblockLumaEq4V_sse2;
-    pFunc->pfLumaDeblockingLT4Hor       = DeblockLumaLt4H_sse2;
-    pFunc->pfLumaDeblockingEQ4Hor       = DeblockLumaEq4H_sse2;
-    pFunc->pfChromaDeblockingLT4Ver	= DeblockChromaLt4V_sse2;
-    pFunc->pfChromaDeblockingEQ4Ver	= DeblockChromaEq4V_sse2;
-    pFunc->pfChromaDeblockingLT4Hor	= DeblockChromaLt4H_sse2;
-    pFunc->pfChromaDeblockinEQ4Hor	= DeblockChromaEq4H_sse2;
+  if (iCpu & WELS_CPU_SSSE3) {
+    pFunc->pfLumaDeblockingLT4Ver	= DeblockLumaLt4V_ssse3;
+    pFunc->pfLumaDeblockingEQ4Ver	= DeblockLumaEq4V_ssse3;
+    pFunc->pfLumaDeblockingLT4Hor       = DeblockLumaLt4H_ssse3;
+    pFunc->pfLumaDeblockingEQ4Hor       = DeblockLumaEq4H_ssse3;
+    pFunc->pfChromaDeblockingLT4Ver	= DeblockChromaLt4V_ssse3;
+    pFunc->pfChromaDeblockingEQ4Ver	= DeblockChromaEq4V_ssse3;
+    pFunc->pfChromaDeblockingLT4Hor	= DeblockChromaLt4H_ssse3;
+    pFunc->pfChromaDeblockinEQ4Hor	= DeblockChromaEq4H_ssse3;
   }
 #endif
 }