123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316 |
- From 7361ef732b432e153496c30da66081d7e530c7f6 Mon Sep 17 00:00:00 2001
- From: Peter de Rivaz <peter.derivaz@argondesign.com>
- Date: Mon, 14 Dec 2015 16:35:29 +0000
- Subject: [PATCH] Fix for issue 1114 compile error
- In a 32-bit build with --enable-shared, there is a lot of
- register pressure and the src_strideq register is reused.
- The code needs to use the stack-based version of src_stride
- (src_stridemp), but this doesn't compile when used in an lea instruction.
- This patch also fixes a related segmentation fault caused by the
- implementation using src_strideq even though that register has
- already been reused.
- This patch also fixes the HBD subpel variance tests that fail
- when compiled without --disable-optimizations.
- These failures were caused by local variables in the assembler
- routines colliding with the caller's stack frame.
- Change-Id: Ice9d4dafdcbdc6038ad5ee7c1c09a8f06deca362
- ---
- vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm | 18 +++----
- vpx_dsp/x86/highbd_variance_sse2.c | 64 ++++++++++++++----------
- 2 files changed, 44 insertions(+), 38 deletions(-)
- diff --git a/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
- index 22d52a2..30ee81b 100644
- --- a/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
- +++ b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
- @@ -79,20 +79,13 @@ SECTION .text
-
- %macro INC_SRC_BY_SRC_STRIDE 0
- %if ARCH_X86=1 && CONFIG_PIC=1
- - lea srcq, [srcq + src_stridemp*2]
- + add srcq, src_stridemp
- + add srcq, src_stridemp
- %else
- lea srcq, [srcq + src_strideq*2]
- %endif
- %endmacro
-
- -%macro INC_SRC_BY_SRC_2STRIDE 0
- -%if ARCH_X86=1 && CONFIG_PIC=1
- - lea srcq, [srcq + src_stridemp*4]
- -%else
- - lea srcq, [srcq + src_strideq*4]
- -%endif
- -%endmacro
- -
- %macro SUBPEL_VARIANCE 1-2 0 ; W
- %define bilin_filter_m bilin_filter_m_sse2
- %define filter_idx_shift 5
- @@ -984,8 +977,9 @@ SECTION .text
- .x_other_y_other_loop:
- movu m2, [srcq]
- movu m4, [srcq+2]
- - movu m3, [srcq+src_strideq*2]
- - movu m5, [srcq+src_strideq*2+2]
- + INC_SRC_BY_SRC_STRIDE
- + movu m3, [srcq]
- + movu m5, [srcq+2]
- pmullw m2, filter_x_a
- pmullw m4, filter_x_b
- paddw m2, filter_rnd
- @@ -1018,7 +1012,7 @@ SECTION .text
- SUM_SSE m0, m2, m4, m3, m6, m7
- mova m0, m5
-
- - INC_SRC_BY_SRC_2STRIDE
- + INC_SRC_BY_SRC_STRIDE
- lea dstq, [dstq + dst_strideq * 4]
- %if %2 == 1 ; avg
- add secq, sec_str
- diff --git a/vpx_dsp/x86/highbd_variance_sse2.c b/vpx_dsp/x86/highbd_variance_sse2.c
- index b45331c..81ec5db 100644
- --- a/vpx_dsp/x86/highbd_variance_sse2.c
- +++ b/vpx_dsp/x86/highbd_variance_sse2.c
- @@ -243,13 +243,18 @@ unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
- }
-
- #if CONFIG_USE_X86INC
- +// The 2 unused parameters are place holders for PIC enabled build.
- +// These definitions are for functions defined in
- +// highbd_subpel_variance_impl_sse2.asm
- #define DECL(w, opt) \
- int vpx_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \
- ptrdiff_t src_stride, \
- int x_offset, int y_offset, \
- const uint16_t *dst, \
- ptrdiff_t dst_stride, \
- - int height, unsigned int *sse);
- + int height, \
- + unsigned int *sse, \
- + void *unused0, void *unused);
- #define DECLS(opt1, opt2) \
- DECL(8, opt1); \
- DECL(16, opt1)
- @@ -274,7 +279,7 @@ uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \
- int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
- x_offset, y_offset, \
- dst, dst_stride, h, \
- - &sse); \
- + &sse, NULL, NULL); \
- if (w > wf) { \
- unsigned int sse2; \
- int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
- @@ -282,19 +287,20 @@ uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \
- x_offset, y_offset, \
- dst + 16, \
- dst_stride, \
- - h, &sse2); \
- + h, &sse2, \
- + NULL, NULL); \
- se += se2; \
- sse += sse2; \
- if (w > wf * 2) { \
- se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
- x_offset, y_offset, \
- dst + 32, dst_stride, \
- - h, &sse2); \
- + h, &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
- se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + 48, src_stride, x_offset, y_offset, \
- - dst + 48, dst_stride, h, &sse2); \
- + dst + 48, dst_stride, h, &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
- } \
- @@ -312,7 +318,7 @@ uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
- int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
- x_offset, y_offset, \
- dst, dst_stride, \
- - h, &sse); \
- + h, &sse, NULL, NULL); \
- if (w > wf) { \
- uint32_t sse2; \
- int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
- @@ -320,20 +326,21 @@ uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
- x_offset, y_offset, \
- dst + 16, \
- dst_stride, \
- - h, &sse2); \
- + h, &sse2, \
- + NULL, NULL); \
- se += se2; \
- sse += sse2; \
- if (w > wf * 2) { \
- se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
- x_offset, y_offset, \
- dst + 32, dst_stride, \
- - h, &sse2); \
- + h, &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
- se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
- x_offset, y_offset, \
- dst + 48, dst_stride, \
- - h, &sse2); \
- + h, &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
- } \
- @@ -359,27 +366,27 @@ uint32_t vpx_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
- int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + (start_row * src_stride), src_stride, \
- x_offset, y_offset, dst + (start_row * dst_stride), \
- - dst_stride, height, &sse2); \
- + dst_stride, height, &sse2, NULL, NULL); \
- se += se2; \
- long_sse += sse2; \
- if (w > wf) { \
- se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + 16 + (start_row * src_stride), src_stride, \
- x_offset, y_offset, dst + 16 + (start_row * dst_stride), \
- - dst_stride, height, &sse2); \
- + dst_stride, height, &sse2, NULL, NULL); \
- se += se2; \
- long_sse += sse2; \
- if (w > wf * 2) { \
- se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + 32 + (start_row * src_stride), src_stride, \
- x_offset, y_offset, dst + 32 + (start_row * dst_stride), \
- - dst_stride, height, &sse2); \
- + dst_stride, height, &sse2, NULL, NULL); \
- se += se2; \
- long_sse += sse2; \
- se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + 48 + (start_row * src_stride), src_stride, \
- x_offset, y_offset, dst + 48 + (start_row * dst_stride), \
- - dst_stride, height, &sse2); \
- + dst_stride, height, &sse2, NULL, NULL); \
- se += se2; \
- long_sse += sse2; \
- }\
- @@ -410,6 +417,7 @@ FNS(sse2, sse);
- #undef FNS
- #undef FN
-
- +// The 2 unused parameters are place holders for PIC enabled build.
- #define DECL(w, opt) \
- int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \
- ptrdiff_t src_stride, \
- @@ -419,7 +427,8 @@ int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \
- const uint16_t *sec, \
- ptrdiff_t sec_stride, \
- int height, \
- - unsigned int *sse);
- + unsigned int *sse, \
- + void *unused0, void *unused);
- #define DECLS(opt1) \
- DECL(16, opt1) \
- DECL(8, opt1)
- @@ -439,23 +448,23 @@ uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \
- uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
- int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src, src_stride, x_offset, \
- - y_offset, dst, dst_stride, sec, w, h, &sse); \
- + y_offset, dst, dst_stride, sec, w, h, &sse, NULL, NULL); \
- if (w > wf) { \
- uint32_t sse2; \
- int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 16, src_stride, x_offset, y_offset, \
- - dst + 16, dst_stride, sec + 16, w, h, &sse2); \
- + dst + 16, dst_stride, sec + 16, w, h, &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
- if (w > wf * 2) { \
- se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 32, src_stride, x_offset, y_offset, \
- - dst + 32, dst_stride, sec + 32, w, h, &sse2); \
- + dst + 32, dst_stride, sec + 32, w, h, &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
- se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 48, src_stride, x_offset, y_offset, \
- - dst + 48, dst_stride, sec + 48, w, h, &sse2); \
- + dst + 48, dst_stride, sec + 48, w, h, &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
- } \
- @@ -475,14 +484,15 @@ uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
- int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src, src_stride, x_offset, \
- y_offset, dst, dst_stride, \
- - sec, w, h, &sse); \
- + sec, w, h, &sse, NULL, NULL); \
- if (w > wf) { \
- uint32_t sse2; \
- int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 16, src_stride, \
- x_offset, y_offset, \
- dst + 16, dst_stride, \
- - sec + 16, w, h, &sse2); \
- + sec + 16, w, h, &sse2, \
- + NULL, NULL); \
- se += se2; \
- sse += sse2; \
- if (w > wf * 2) { \
- @@ -490,14 +500,16 @@ uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
- src + 32, src_stride, \
- x_offset, y_offset, \
- dst + 32, dst_stride, \
- - sec + 32, w, h, &sse2); \
- + sec + 32, w, h, &sse2, \
- + NULL, NULL); \
- se += se2; \
- sse += sse2; \
- se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 48, src_stride, \
- x_offset, y_offset, \
- dst + 48, dst_stride, \
- - sec + 48, w, h, &sse2); \
- + sec + 48, w, h, &sse2, \
- + NULL, NULL); \
- se += se2; \
- sse += sse2; \
- } \
- @@ -525,7 +537,7 @@ uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
- int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + (start_row * src_stride), src_stride, x_offset, \
- y_offset, dst + (start_row * dst_stride), dst_stride, \
- - sec + (start_row * w), w, height, &sse2); \
- + sec + (start_row * w), w, height, &sse2, NULL, NULL); \
- se += se2; \
- long_sse += sse2; \
- if (w > wf) { \
- @@ -533,7 +545,7 @@ uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
- src + 16 + (start_row * src_stride), src_stride, \
- x_offset, y_offset, \
- dst + 16 + (start_row * dst_stride), dst_stride, \
- - sec + 16 + (start_row * w), w, height, &sse2); \
- + sec + 16 + (start_row * w), w, height, &sse2, NULL, NULL); \
- se += se2; \
- long_sse += sse2; \
- if (w > wf * 2) { \
- @@ -541,14 +553,14 @@ uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
- src + 32 + (start_row * src_stride), src_stride, \
- x_offset, y_offset, \
- dst + 32 + (start_row * dst_stride), dst_stride, \
- - sec + 32 + (start_row * w), w, height, &sse2); \
- + sec + 32 + (start_row * w), w, height, &sse2, NULL, NULL); \
- se += se2; \
- long_sse += sse2; \
- se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 48 + (start_row * src_stride), src_stride, \
- x_offset, y_offset, \
- dst + 48 + (start_row * dst_stride), dst_stride, \
- - sec + 48 + (start_row * w), w, height, &sse2); \
- + sec + 48 + (start_row * w), w, height, &sse2, NULL, NULL); \
- se += se2; \
- long_sse += sse2; \
- } \
- --
- 2.7.0
|