ref: 2011a7407e64a0a9300337831c71f8ce6bd2db1d
parent: 0da712380b2474fcb8dcbde5307ff916ce2daace
parent: 856f186b8eeb361a9be46b3706d87ea9f2ad6190
author: Ethan Hugg <[email protected]>
date: Fri Jan 17 09:13:24 EST 2014
Merge pull request #147 from volvet/illegal_assembly_fix fix illegal instruction use
--- a/codec/common/deblock.asm
+++ b/codec/common/deblock.asm
@@ -60,9 +60,9 @@
%ifdef WIN64
-WELS_EXTERN DeblockLumaLt4V_sse2
+WELS_EXTERN DeblockLumaLt4V_ssse3
-DeblockLumaLt4V_sse2:
+DeblockLumaLt4V_ssse3:
push rbp
mov r11,[rsp + 16 + 20h] ; pTC
sub rsp,1B0h
@@ -317,10 +317,10 @@
ret
-WELS_EXTERN DeblockLumaEq4V_sse2
+WELS_EXTERN DeblockLumaEq4V_ssse3
ALIGN 16
-DeblockLumaEq4V_sse2:
+DeblockLumaEq4V_ssse3:
mov rax,rsp
push rbx
push rbp
@@ -780,10 +780,10 @@
ret
-WELS_EXTERN DeblockChromaLt4V_sse2
+WELS_EXTERN DeblockChromaLt4V_ssse3
ALIGN 16
-DeblockChromaLt4V_sse2:
+DeblockChromaLt4V_ssse3:
mov rax,rsp
push rbx
push rdi
@@ -942,9 +942,9 @@
ret
-WELS_EXTERN DeblockChromaEq4V_sse2
+WELS_EXTERN DeblockChromaEq4V_ssse3
ALIGN 16
-DeblockChromaEq4V_sse2:
+DeblockChromaEq4V_ssse3:
mov rax,rsp
push rbx
sub rsp,90h
@@ -1096,9 +1096,9 @@
-WELS_EXTERN DeblockChromaEq4H_sse2
+WELS_EXTERN DeblockChromaEq4H_ssse3
ALIGN 16
-DeblockChromaEq4H_sse2:
+DeblockChromaEq4H_ssse3:
mov rax,rsp
mov [rax+20h],rbx
push rdi
@@ -1360,9 +1360,9 @@
-WELS_EXTERN DeblockChromaLt4H_sse2
+WELS_EXTERN DeblockChromaLt4H_ssse3
ALIGN 16
-DeblockChromaLt4H_sse2:
+DeblockChromaLt4H_ssse3:
mov rax,rsp
push rbx
push rbp
@@ -1646,9 +1646,9 @@
%elifdef UNIX64
-WELS_EXTERN DeblockLumaLt4V_sse2
+WELS_EXTERN DeblockLumaLt4V_ssse3
-DeblockLumaLt4V_sse2:
+DeblockLumaLt4V_ssse3:
push rbp
mov r11,r8 ; pTC
sub rsp,1B0h
@@ -1903,10 +1903,10 @@
ret
-WELS_EXTERN DeblockLumaEq4V_sse2
+WELS_EXTERN DeblockLumaEq4V_ssse3
ALIGN 16
-DeblockLumaEq4V_sse2:
+DeblockLumaEq4V_ssse3:
mov rax,rsp
push rbx
push rbp
@@ -2365,9 +2365,9 @@
pop rbx
ret
-WELS_EXTERN DeblockChromaLt4V_sse2
+WELS_EXTERN DeblockChromaLt4V_ssse3
ALIGN 16
-DeblockChromaLt4V_sse2:
+DeblockChromaLt4V_ssse3:
mov rax,rsp
push rbx
push rbp
@@ -2533,9 +2533,9 @@
pop rbx
ret
-WELS_EXTERN DeblockChromaEq4V_sse2
-ALIGN 16
-DeblockChromaEq4V_sse2:
+WELS_EXTERN DeblockChromaEq4V_ssse3
+ALIGN 16
+DeblockChromaEq4V_ssse3:
mov rax,rsp
push rbx
push rbp
@@ -2684,10 +2684,10 @@
pop rbx
ret
+WELS_EXTERN DeblockChromaEq4H_ssse3
-WELS_EXTERN DeblockChromaEq4H_sse2
ALIGN 16
-DeblockChromaEq4H_sse2:
+DeblockChromaEq4H_ssse3:
mov rax,rsp
push rbx
push rbp
@@ -2959,9 +2959,9 @@
ret
-WELS_EXTERN DeblockChromaLt4H_sse2
+WELS_EXTERN DeblockChromaLt4H_ssse3
ALIGN 16
-DeblockChromaLt4H_sse2:
+DeblockChromaLt4H_ssse3:
mov rax,rsp
push rbx
push rbp
@@ -3252,13 +3252,13 @@
%elifdef X86_32
;********************************************************************************
-; void DeblockChromaEq4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+; void DeblockChromaEq4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
; int32_t iAlpha, int32_t iBeta)
;********************************************************************************
-WELS_EXTERN DeblockChromaEq4V_sse2
+WELS_EXTERN DeblockChromaEq4V_ssse3
ALIGN 16
-DeblockChromaEq4V_sse2:
+DeblockChromaEq4V_ssse3:
push ebp
mov ebp,esp
and esp,0FFFFFFF0h
@@ -3421,13 +3421,13 @@
ret
;******************************************************************************
-; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+; void DeblockChromaLt4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
; int32_t iAlpha, int32_t iBeta, int8_t * pTC);
;*******************************************************************************
-WELS_EXTERN DeblockChromaLt4V_sse2
+WELS_EXTERN DeblockChromaLt4V_ssse3
-DeblockChromaLt4V_sse2:
+DeblockChromaLt4V_ssse3:
push ebp
mov ebp,esp
and esp,0FFFFFFF0h
@@ -3624,15 +3624,15 @@
ret
;***************************************************************************
-; void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+; void DeblockChromaEq4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
; int32_t iAlpha, int32_t iBeta)
;***************************************************************************
-WELS_EXTERN DeblockChromaEq4H_sse2
+WELS_EXTERN DeblockChromaEq4H_ssse3
ALIGN 16
-DeblockChromaEq4H_sse2:
+DeblockChromaEq4H_ssse3:
push ebp
mov ebp,esp
and esp,0FFFFFFF0h
@@ -3909,15 +3909,15 @@
ret
;*******************************************************************************
-; void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+; void DeblockChromaLt4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
; int32_t iAlpha, int32_t iBeta, int8_t * pTC);
;*******************************************************************************
-WELS_EXTERN DeblockChromaLt4H_sse2
+WELS_EXTERN DeblockChromaLt4H_ssse3
ALIGN 16
-DeblockChromaLt4H_sse2:
+DeblockChromaLt4H_ssse3:
push ebp
mov ebp,esp
and esp,0FFFFFFF0h
@@ -4224,16 +4224,16 @@
;*******************************************************************************
-; void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
+; void DeblockLumaLt4V_ssse3(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
; int32_t iBeta, int8_t * pTC)
;*******************************************************************************
-WELS_EXTERN DeblockLumaLt4V_sse2
+WELS_EXTERN DeblockLumaLt4V_ssse3
ALIGN 16
-DeblockLumaLt4V_sse2:
+DeblockLumaLt4V_ssse3:
push ebp
mov ebp, esp
and esp, -16 ; fffffff0H
@@ -4616,15 +4616,15 @@
;*******************************************************************************
-; void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
+; void DeblockLumaEq4V_ssse3(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
; int32_t iBeta)
;*******************************************************************************
-WELS_EXTERN DeblockLumaEq4V_sse2
+WELS_EXTERN DeblockLumaEq4V_ssse3
ALIGN 16
-DeblockLumaEq4V_sse2:
+DeblockLumaEq4V_ssse3:
push ebp
mov ebp, esp
--- a/codec/common/deblocking_common.cpp
+++ b/codec/common/deblocking_common.cpp
@@ -183,19 +183,19 @@
#ifdef X86_ASM
extern "C" {
- void DeblockLumaLt4H_sse2 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc) {
+ void DeblockLumaLt4H_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc) {
FORCE_STACK_ALIGN_1D (uint8_t, uiBuf, 16 * 8, 16);
DeblockLumaTransposeH2V_sse2 (pPixY - 4, iStride, &uiBuf[0]);
- DeblockLumaLt4V_sse2 (&uiBuf[4 * 16], 16, iAlpha, iBeta, pTc);
+ DeblockLumaLt4V_ssse3 (&uiBuf[4 * 16], 16, iAlpha, iBeta, pTc);
DeblockLumaTransposeV2H_sse2 (pPixY - 4, iStride, &uiBuf[0]);
}
- void DeblockLumaEq4H_sse2 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
+ void DeblockLumaEq4H_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
FORCE_STACK_ALIGN_1D (uint8_t, uiBuf, 16 * 8, 16);
DeblockLumaTransposeH2V_sse2 (pPixY - 4, iStride, &uiBuf[0]);
- DeblockLumaEq4V_sse2 (&uiBuf[4 * 16], 16, iAlpha, iBeta);
+ DeblockLumaEq4V_ssse3 (&uiBuf[4 * 16], 16, iAlpha, iBeta);
DeblockLumaTransposeV2H_sse2 (pPixY - 4, iStride, &uiBuf[0]);
}
--- a/codec/common/deblocking_common.h
+++ b/codec/common/deblocking_common.h
@@ -20,17 +20,17 @@
#endif//__cplusplus
#ifdef X86_ASM
-void DeblockLumaLt4V_sse2 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
-void DeblockLumaEq4V_sse2 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockLumaLt4V_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
+void DeblockLumaEq4V_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
void DeblockLumaTransposeH2V_sse2 (uint8_t* pPixY, int32_t iStride, uint8_t* pDst);
void DeblockLumaTransposeV2H_sse2 (uint8_t* pPixY, int32_t iStride, uint8_t* pSrc);
-void DeblockLumaLt4H_sse2 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
-void DeblockLumaEq4H_sse2 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
-void DeblockChromaEq4V_sse2 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
-void DeblockChromaLt4V_sse2 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
+void DeblockLumaLt4H_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
+void DeblockLumaEq4H_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockChromaEq4V_ssse3 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockChromaLt4V_ssse3 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
int8_t* pTC);
-void DeblockChromaEq4H_sse2 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
-void DeblockChromaLt4H_sse2 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
+void DeblockChromaEq4H_ssse3 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockChromaLt4H_ssse3 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
int8_t* pTC);
#endif
#if defined(__cplusplus)
--- a/codec/decoder/core/src/deblocking.cpp
+++ b/codec/decoder/core/src/deblocking.cpp
@@ -708,15 +708,15 @@
pFunc->pfChromaDeblockinEQ4Hor = DeblockChromaEq4H_c;
#ifdef X86_ASM
- if (iCpu & WELS_CPU_SSE2) {
- pFunc->pfLumaDeblockingLT4Ver = DeblockLumaLt4V_sse2;
- pFunc->pfLumaDeblockingEQ4Ver = DeblockLumaEq4V_sse2;
- pFunc->pfLumaDeblockingLT4Hor = DeblockLumaLt4H_sse2;
- pFunc->pfLumaDeblockingEQ4Hor = DeblockLumaEq4H_sse2;
- pFunc->pfChromaDeblockingLT4Ver = DeblockChromaLt4V_sse2;
- pFunc->pfChromaDeblockingEQ4Ver = DeblockChromaEq4V_sse2;
- pFunc->pfChromaDeblockingLT4Hor = DeblockChromaLt4H_sse2;
- pFunc->pfChromaDeblockinEQ4Hor = DeblockChromaEq4H_sse2;
+ if (iCpu & WELS_CPU_SSSE3) {
+ pFunc->pfLumaDeblockingLT4Ver = DeblockLumaLt4V_ssse3;
+ pFunc->pfLumaDeblockingEQ4Ver = DeblockLumaEq4V_ssse3;
+ pFunc->pfLumaDeblockingLT4Hor = DeblockLumaLt4H_ssse3;
+ pFunc->pfLumaDeblockingEQ4Hor = DeblockLumaEq4H_ssse3;
+ pFunc->pfChromaDeblockingLT4Ver = DeblockChromaLt4V_ssse3;
+ pFunc->pfChromaDeblockingEQ4Ver = DeblockChromaEq4V_ssse3;
+ pFunc->pfChromaDeblockingLT4Hor = DeblockChromaLt4H_ssse3;
+ pFunc->pfChromaDeblockinEQ4Hor = DeblockChromaEq4H_ssse3;
}
#endif
--- a/codec/encoder/core/src/deblocking.cpp
+++ b/codec/encoder/core/src/deblocking.cpp
@@ -787,15 +787,15 @@
#ifdef X86_ASM
- if (iCpu & WELS_CPU_SSE2) {
- pFunc->pfLumaDeblockingLT4Ver = DeblockLumaLt4V_sse2;
- pFunc->pfLumaDeblockingEQ4Ver = DeblockLumaEq4V_sse2;
- pFunc->pfLumaDeblockingLT4Hor = DeblockLumaLt4H_sse2;
- pFunc->pfLumaDeblockingEQ4Hor = DeblockLumaEq4H_sse2;
- pFunc->pfChromaDeblockingLT4Ver = DeblockChromaLt4V_sse2;
- pFunc->pfChromaDeblockingEQ4Ver = DeblockChromaEq4V_sse2;
- pFunc->pfChromaDeblockingLT4Hor = DeblockChromaLt4H_sse2;
- pFunc->pfChromaDeblockinEQ4Hor = DeblockChromaEq4H_sse2;
+ if (iCpu & WELS_CPU_SSSE3) {
+ pFunc->pfLumaDeblockingLT4Ver = DeblockLumaLt4V_ssse3;
+ pFunc->pfLumaDeblockingEQ4Ver = DeblockLumaEq4V_ssse3;
+ pFunc->pfLumaDeblockingLT4Hor = DeblockLumaLt4H_ssse3;
+ pFunc->pfLumaDeblockingEQ4Hor = DeblockLumaEq4H_ssse3;
+ pFunc->pfChromaDeblockingLT4Ver = DeblockChromaLt4V_ssse3;
+ pFunc->pfChromaDeblockingEQ4Ver = DeblockChromaEq4V_ssse3;
+ pFunc->pfChromaDeblockingLT4Hor = DeblockChromaLt4H_ssse3;
+ pFunc->pfChromaDeblockinEQ4Hor = DeblockChromaEq4H_ssse3;
}
#endif
}