shithub: libvpx

--- a/vp9/encoder/x86/vp9_subpel_variance.asm

+++ b/vp9/encoder/x86/vp9_subpel_variance.asm

@@ -118,6 +118,14 @@

RET

 %endmacro

+%macro INC_SRC_BY_SRC_STRIDE  0  ; srcq += src_stride (takes no arguments)

+%if ARCH_X86=1 && CONFIG_PIC=1   ; same guard as the x86-32 PIC paths below

+  add                srcq, src_stridemp  ; stride read via its memory operand

+%else

+  add                srcq, src_strideq   ; stride read from its register

+%endif

+%endmacro

 %macro SUBPEL_VARIANCE 1-2 0 ; W

 %if cpuflag(ssse3)

 %define bilin_filter_m bilin_filter_m_ssse3

@@ -129,41 +137,85 @@

 ; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses

 ; 11, not 13, if the registers are ordered correctly. May make a minor speed

 ; difference on Win64

-%ifdef PIC

-%if %2 == 1 ; avg

-cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \

-                                              x_offset, y_offset, \

-                                              dst, dst_stride, \

-                                              sec, sec_stride, height, sse

-%define sec_str sec_strideq

+%ifdef PIC    ; 64bit PIC

+  %if %2 == 1 ; avg

+    cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \

+                                      x_offset, y_offset, \

+                                      dst, dst_stride, \

+                                      sec, sec_stride, height, sse

+    %define sec_str sec_strideq

+  %else

+    cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \

+                                  y_offset, dst, dst_stride, height, sse

+  %endif

+  %define h heightd

+  %define bilin_filter sseq

 %else

-cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, y_offset, \

-                                          dst, dst_stride, height, sse

+  %if ARCH_X86=1 && CONFIG_PIC=1

+    %if %2 == 1 ; avg

+      cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \

+                                  x_offset, y_offset, \

+                                  dst, dst_stride, \

+                                  sec, sec_stride, \

+                                  height, sse, g_bilin_filter, g_pw_8

+      %define h dword heightm

+      %define sec_str sec_stridemp

+      ; Store the bilin_filter and pw_8 addresses on the stack

+      GET_GOT eax

+      add esp, 4                ; restore esp

+      lea ecx, [GLOBAL(bilin_filter_m)]

+      mov g_bilin_filterm, ecx

+      lea ecx, [GLOBAL(pw_8)]

+      mov g_pw_8m, ecx

+      LOAD_IF_USED 0, 1         ; load eax, ecx back

+    %else

+      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \

+                                y_offset, dst, dst_stride, height, sse, \

+                                g_bilin_filter, g_pw_8

+      %define h heightd

+      ; Store the bilin_filter and pw_8 addresses on the stack

+      GET_GOT eax

+      add esp, 4                ; restore esp

+      lea ecx, [GLOBAL(bilin_filter_m)]

+      mov g_bilin_filterm, ecx

+      lea ecx, [GLOBAL(pw_8)]

+      mov g_pw_8m, ecx

+      LOAD_IF_USED 0, 1         ; load eax, ecx back

+    %endif

+  %else

+    %if %2 == 1 ; avg

+      cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \

+                        7 + 2 * ARCH_X86_64, 13, src, src_stride, \

+                                             x_offset, y_offset, \

+                                             dst, dst_stride, \

+                                             sec, sec_stride, \

+                                             height, sse

+      %if ARCH_X86_64

+      %define h heightd

+      %define sec_str sec_strideq

+      %else

+      %define h dword heightm

+      %define sec_str sec_stridemp

+      %endif

+    %else

+      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \

+                              y_offset, dst, dst_stride, height, sse

+      %define h heightd

+    %endif

+    %define bilin_filter bilin_filter_m

+  %endif

 %endif

-%define h heightd

-%define bilin_filter sseq

-%else

-%if %2 == 1 ; avg

-cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \

-                                    7 + 2 * ARCH_X86_64, 13, src, src_stride, \

-                                                         x_offset, y_offset, \

-                                                         dst, dst_stride, \

-                                                         sec, sec_stride, \

-                                                         height, sse

-%if ARCH_X86_64

-%define h heightd

-%define sec_str sec_strideq

-%else

-%define h dword heightm

-%define sec_str sec_stridemp

-%endif

-%else

-cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \

-                                          dst, dst_stride, height, sse

-%define h heightd

-%endif

-%define bilin_filter bilin_filter_m

-%endif

   ASSERT               %1 <= 16         ; m6 overflows if w > 16

   pxor                 m6, m6           ; sum

   pxor                 m7, m7           ; sse

@@ -329,11 +381,22 @@

 %define filter_y_b m9

 %define filter_rnd m10

 %else ; x86-32 or mmx

+%if ARCH_X86=1 && CONFIG_PIC=1

+; x_offset == 0, reuse x_offset reg

+%define tempq x_offsetq

+  add y_offsetq, g_bilin_filterm

+%define filter_y_a [y_offsetq]

+%define filter_y_b [y_offsetq+16]

+  mov tempq, g_pw_8m

+%define filter_rnd [tempq]

+%else

   add           y_offsetq, bilin_filter

 %define filter_y_a [y_offsetq]

 %define filter_y_b [y_offsetq+16]

 %define filter_rnd [pw_8]

 %endif

+%endif

 .x_zero_y_other_loop:

 %if %1 == 16

   movu                 m0, [srcq]

@@ -615,6 +678,15 @@

 %define filter_y_a m8

 %define filter_y_b m9

 %define filter_rnd m10

+%else  ;x86_32

+%if ARCH_X86=1 && CONFIG_PIC=1

+; x_offset == 0.5. We can reuse x_offset reg

+%define tempq x_offsetq

+  add y_offsetq, g_bilin_filterm

+%define filter_y_a [y_offsetq]

+%define filter_y_b [y_offsetq+16]

+  mov tempq, g_pw_8m

+%define filter_rnd [tempq]

 %else

   add           y_offsetq, bilin_filter

 %define filter_y_a [y_offsetq]

@@ -621,6 +693,8 @@

 %define filter_y_b [y_offsetq+16]

 %define filter_rnd [pw_8]

 %endif

+%endif

 %if %1 == 16

   movu                 m0, [srcq]

   movu                 m3, [srcq+1]

@@ -752,6 +826,15 @@

 %define filter_x_a m8

 %define filter_x_b m9

 %define filter_rnd m10

+%else    ; x86-32

+%if ARCH_X86=1 && CONFIG_PIC=1

+;y_offset == 0. We can reuse y_offset reg.

+%define tempq y_offsetq

+  add x_offsetq, g_bilin_filterm

+%define filter_x_a [x_offsetq]

+%define filter_x_b [x_offsetq+16]

+  mov tempq, g_pw_8m

+%define filter_rnd [tempq]

 %else

   add           x_offsetq, bilin_filter

 %define filter_x_a [x_offsetq]

@@ -758,6 +841,8 @@

 %define filter_x_b [x_offsetq+16]

 %define filter_rnd [pw_8]

 %endif

+%endif

 .x_other_y_zero_loop:

 %if %1 == 16

   movu                 m0, [srcq]

@@ -873,6 +958,15 @@

 %define filter_x_a m8

 %define filter_x_b m9

 %define filter_rnd m10

+%else    ; x86-32

+%if ARCH_X86=1 && CONFIG_PIC=1

+; y_offset == 0.5. We can reuse y_offset reg.

+%define tempq y_offsetq

+  add x_offsetq, g_bilin_filterm

+%define filter_x_a [x_offsetq]

+%define filter_x_b [x_offsetq+16]

+  mov tempq, g_pw_8m

+%define filter_rnd [tempq]

 %else

   add           x_offsetq, bilin_filter

 %define filter_x_a [x_offsetq]

@@ -879,6 +973,8 @@

 %define filter_x_b [x_offsetq+16]

 %define filter_rnd [pw_8]

 %endif

+%endif

 %if %1 == 16

   movu                 m0, [srcq]

   movu                 m1, [srcq+1]

@@ -1057,6 +1153,21 @@

 %define filter_y_a m10

 %define filter_y_b m11

 %define filter_rnd m12

+%else   ; x86-32

+%if ARCH_X86=1 && CONFIG_PIC=1

+; In this case there is NO unused register, so src_stride's register is used

+; as a temporary. src_stride must then be loaded from the stack when needed.

+%define tempq src_strideq

+  mov tempq, g_bilin_filterm

+  add           x_offsetq, tempq

+  add           y_offsetq, tempq

+%define filter_x_a [x_offsetq]

+%define filter_x_b [x_offsetq+16]

+%define filter_y_a [y_offsetq]

+%define filter_y_b [y_offsetq+16]

+  mov tempq, g_pw_8m

+%define filter_rnd [tempq]

 %else

   add           x_offsetq, bilin_filter

   add           y_offsetq, bilin_filter

@@ -1066,6 +1177,8 @@

 %define filter_y_b [y_offsetq+16]

 %define filter_rnd [pw_8]

 %endif

+%endif

   ; x_offset == bilin interpolation && y_offset == bilin interpolation

 %if %1 == 16

   movu                 m0, [srcq]

@@ -1093,7 +1206,9 @@

 %endif

   psraw                m0, 4

   psraw                m2, 4

-  add                srcq, src_strideq

+  INC_SRC_BY_SRC_STRIDE

   packuswb             m0, m2

 .x_other_y_other_loop:

 %if cpuflag(ssse3)

@@ -1163,7 +1278,7 @@

   SUM_SSE              m0, m1, m2, m3, m6, m7

   mova                 m0, m4

-  add                srcq, src_strideq

+  INC_SRC_BY_SRC_STRIDE

   add                dstq, dst_strideq

 %else ; %1 < 16

   movh                 m0, [srcq]

@@ -1184,12 +1299,17 @@

 %if cpuflag(ssse3)

   packuswb             m0, m0

 %endif

-  add                srcq, src_strideq

+  INC_SRC_BY_SRC_STRIDE

 .x_other_y_other_loop:

   movh                 m2, [srcq]

   movh                 m1, [srcq+1]

-  movh                 m4, [srcq+src_strideq]

-  movh                 m3, [srcq+src_strideq+1]

+  INC_SRC_BY_SRC_STRIDE

+  movh                 m4, [srcq]

+  movh                 m3, [srcq+1]

 %if cpuflag(ssse3)

   punpcklbw            m2, m1

   punpcklbw            m4, m3

@@ -1253,7 +1373,7 @@

   SUM_SSE              m0, m1, m2, m3, m6, m7

   mova                 m0, m4

-  lea                srcq, [srcq+src_strideq*2]

+  INC_SRC_BY_SRC_STRIDE

   lea                dstq, [dstq+dst_strideq*2]

 %endif

 %if %2 == 1 ; avg