123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502 |
- /********************************************************************
- * *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
- * *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
- * *
- ********************************************************************
- function:
- last mod: $Id: dsp_mmx.c 14579 2008-03-12 06:42:40Z xiphmont $
- ********************************************************************/
- #include <stddef.h>
- #include "x86enc.h"
- #include "sse2trans.h"
- #if defined(OC_X86_ASM)
- /*Load a 4x8 array of pixel values (4 rows of 8 pixels) from %[src] and %[ref]
- and compute their 16-bit differences (src minus ref).
- On output, the four rows of differences are stored in _m0, xmm1, xmm2, and
- xmm3, in that order.
- xmm4 and xmm5 are clobbered.
- %[src] and %[ref] are each advanced by 4*%[ystride] (the two lea instructions
- below), so the asm statement that uses this macro modifies both pointers.
- %[ystride3] is used as the offset of the fourth row.
- _m0: A string literal naming the register (e.g., "%%xmm0") that receives the
- first row of differences.
- No zero register is needed for the widening: after "punpcklbw b,a" each word
- of a is a+256*b, after "punpcklbw b,b" each word of b is b+256*b, and their
- 16-bit difference is a-b.*/
- #define OC_LOAD_SUB_4x8(_m0) \
- "#OC_LOAD_SUB_4x8\n\t" \
- /*Load the first three rows.*/ \
- "movq (%[src]),"_m0"\n\t" \
- "movq (%[ref]),%%xmm4\n\t" \
- "movq (%[src],%[ystride]),%%xmm1\n\t" \
- "movq (%[ref],%[ystride]),%%xmm3\n\t" \
- "movq (%[src],%[ystride],2),%%xmm2\n\t" \
- "movq (%[ref],%[ystride],2),%%xmm5\n\t" \
- /*Unpack and subtract the first two rows.*/ \
- "punpcklbw %%xmm4,"_m0"\n\t" \
- "punpcklbw %%xmm4,%%xmm4\n\t" \
- "punpcklbw %%xmm3,%%xmm1\n\t" \
- "punpcklbw %%xmm3,%%xmm3\n\t" \
- "psubw %%xmm4,"_m0"\n\t" \
- "psubw %%xmm3,%%xmm1\n\t" \
- /*Load the last row (xmm3 and xmm4 are free again at this point).*/ \
- "movq (%[src],%[ystride3]),%%xmm3\n\t" \
- "movq (%[ref],%[ystride3]),%%xmm4\n\t" \
- /*Unpack, subtract, and advance the pointers.*/ \
- "punpcklbw %%xmm5,%%xmm2\n\t" \
- "punpcklbw %%xmm5,%%xmm5\n\t" \
- "lea (%[src],%[ystride],4),%[src]\n\t" \
- "psubw %%xmm5,%%xmm2\n\t" \
- "punpcklbw %%xmm4,%%xmm3\n\t" \
- "punpcklbw %%xmm4,%%xmm4\n\t" \
- "lea (%[ref],%[ystride],4),%[ref]\n\t" \
- "psubw %%xmm4,%%xmm3\n\t" \
- /*Square and accumulate four rows of differences in _m0, xmm1, xmm2, and xmm3.
- Each pmaddwd squares the eight words of a row and adds adjacent pairs,
- leaving four 32-bit partial sums per register.
- On output, _m0 contains the sum of two of the rows (those from _m0 and xmm1),
- and the other two (xmm2 and xmm3) are added to xmm7.
- xmm1, xmm2, and xmm3 are clobbered.
- When _m0 is itself xmm7, the old contents of xmm7 are overwritten, so on
- output xmm7 holds the sum for just these four rows.*/
- #define OC_SSD_4x8(_m0) \
- "pmaddwd "_m0","_m0"\n\t" \
- "pmaddwd %%xmm1,%%xmm1\n\t" \
- "pmaddwd %%xmm2,%%xmm2\n\t" \
- "pmaddwd %%xmm3,%%xmm3\n\t" \
- "paddd %%xmm1,"_m0"\n\t" \
- "paddd %%xmm3,%%xmm2\n\t" \
- "paddd %%xmm2,%%xmm7\n\t" \
- /*Computes the sum of squared differences (SSD) between an 8x8 block of _src
- and an 8x8 block of _ref.
- _src: The top-left pixel of the source block.
- _ref: The top-left pixel of the reference block.
- _ystride: The offset between rows, in bytes; it is used for both blocks.
- Return: The sum, over all 64 pixels, of the squared difference.
- The block is processed as two 4x8 halves: the first initializes the
- accumulator in xmm7, and the second is added to it.
- NOTE(review): xmm0...xmm7 are overwritten but are not listed as clobbers.*/
- unsigned oc_enc_frag_ssd_sse2(const unsigned char *_src,
- const unsigned char *_ref,int _ystride){
- unsigned ret;
- __asm__ __volatile__(
- OC_LOAD_SUB_4x8("%%xmm7")
- OC_SSD_4x8("%%xmm7")
- OC_LOAD_SUB_4x8("%%xmm0")
- OC_SSD_4x8("%%xmm0")
- "paddd %%xmm0,%%xmm7\n\t"
- /*Horizontal add of the four 32-bit partial sums in xmm7.*/
- "movdqa %%xmm7,%%xmm6\n\t"
- "punpckhqdq %%xmm7,%%xmm7\n\t"
- "paddd %%xmm6,%%xmm7\n\t"
- "pshufd $1,%%xmm7,%%xmm6\n\t"
- "paddd %%xmm6,%%xmm7\n\t"
- "movd %%xmm7,%[ret]\n\t"
- /*OC_LOAD_SUB_4x8 advances %[src] and %[ref] with lea, so they have to be
- read-write operands: gcc assumes that input-only operands still hold their
- original values after the asm statement.*/
- :[ret]"=a"(ret),[src]"+r"(_src),[ref]"+r"(_ref)
- :[ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)_ystride*3)
- );
- return ret;
- }
- static const unsigned char __attribute__((aligned(16))) OC_MASK_CONSTS[8]={
- 0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80 /*Entry i is 1<<i: a single-bit mask
- for byte lane i, loaded into xmm6 by oc_enc_frag_border_ssd_sse2().*/
- };
- /*Load a 2x8 array of pixel values from %[src] and %[ref], zero the pixels
- that are not selected by the mask, and compute the 16-bit differences
- (src minus ref) of the two rows in %%xmm0 and %%xmm1.
- %%xmm6 must contain OC_MASK_CONSTS[0...7] in its low 8 bytes; it is used both
- to isolate one mask bit per byte lane (pand) and as the value to compare
- against (pcmpeqb), so no zero register is required.
- %[m] holds the mask: bits 0...7 select the pixels of the first row and bits
- 8...15 those of the second, a set bit meaning the pixel is kept.
- %[m] must live in a register with an addressable second byte (%h[m]); its
- low byte is overwritten with a copy of bits 8...15.
- %%xmm2, %%xmm3, and %%xmm4 are clobbered; %[src] and %[ref] are not advanced.
- NOTE(review): an earlier revision of this comment named %%xmm5 as the
- constants and %%xmm6 as zero, and mentioned horizontal sums; neither matches
- the instructions below.*/
- #define OC_LOAD_SUB_MASK_2x8 \
- "#OC_LOAD_SUB_MASK_2x8\n\t" \
- /*Start the loads and expand the next 8 bits of the mask.*/ \
- "shl $8,%[m]\n\t" \
- "movq (%[src]),%%xmm0\n\t" \
- "mov %h[m],%b[m]\n\t" \
- "movq (%[ref]),%%xmm2\n\t" \
- "movd %[m],%%xmm4\n\t" \
- "shr $8,%[m]\n\t" \
- "pshuflw $0x00,%%xmm4,%%xmm4\n\t" \
- "mov %h[m],%b[m]\n\t" \
- "pand %%xmm6,%%xmm4\n\t" \
- "pcmpeqb %%xmm6,%%xmm4\n\t" \
- /*Perform the masking: each byte of xmm4 is 0xFF if its pixel is selected.*/ \
- "pand %%xmm4,%%xmm0\n\t" \
- "pand %%xmm4,%%xmm2\n\t" \
- /*Finish the loads while unpacking the first set of rows, and expand the next
- 8 bits of the mask.*/ \
- "movd %[m],%%xmm4\n\t" \
- "movq (%[src],%[ystride]),%%xmm1\n\t" \
- "pshuflw $0x00,%%xmm4,%%xmm4\n\t" \
- "movq (%[ref],%[ystride]),%%xmm3\n\t" \
- "pand %%xmm6,%%xmm4\n\t" \
- "punpcklbw %%xmm2,%%xmm0\n\t" \
- "pcmpeqb %%xmm6,%%xmm4\n\t" \
- "punpcklbw %%xmm2,%%xmm2\n\t" \
- /*Mask and unpack the second set of rows.*/ \
- "pand %%xmm4,%%xmm1\n\t" \
- "pand %%xmm4,%%xmm3\n\t" \
- "punpcklbw %%xmm3,%%xmm1\n\t" \
- "punpcklbw %%xmm3,%%xmm3\n\t" \
- "psubw %%xmm2,%%xmm0\n\t" \
- "psubw %%xmm3,%%xmm1\n\t" \
- unsigned oc_enc_frag_border_ssd_sse2(const unsigned char *_src,
- const unsigned char *_ref,int _ystride,ogg_int64_t _mask){ /*
- Computes the sum of squared differences between two 8x8 blocks, counting
- only the pixels selected by _mask.
- _src: The top-left pixel of the source block.
- _ref: The top-left pixel of the reference block.
- _ystride: The offset between rows, in bytes; it is used for both blocks.
- _mask: 64 selection bits, consumed 16 at a time starting from the low end.
- Each group of 16 covers a pair of rows: bit i of its low byte selects pixel i
- of the first row and bit i of its high byte pixel i of the second.
- A pair of rows whose 16 bits are all zero is skipped without being loaded.
- Return: The sum of the squared differences of the selected pixels.
- NOTE(review): xmm6 (the mask constants) and xmm7 (the accumulator) are set
- up in one asm statement and read by the later ones, with no operands or
- clobbers describing them; this relies on the compiler leaving those
- registers alone in between.*/
- ptrdiff_t ystride;
- unsigned ret;
- int i;
- ystride=_ystride;
- __asm__ __volatile__(
- "pxor %%xmm7,%%xmm7\n\t" /*xmm7: running sum of squared differences.*/
- "movq %[c],%%xmm6\n\t" /*xmm6: the 8 bytes of OC_MASK_CONSTS.*/
- :
- :[c]"m"(OC_CONST_ARRAY_OPERAND(unsigned char,OC_MASK_CONSTS,8))
- );
- for(i=0;i<4;i++){
- unsigned m;
- m=_mask&0xFFFF; /*The selection bits for the next two rows.*/
- _mask>>=16;
- if(m){
- __asm__ __volatile__(
- OC_LOAD_SUB_MASK_2x8
- "pmaddwd %%xmm0,%%xmm0\n\t"
- "pmaddwd %%xmm1,%%xmm1\n\t"
- "paddd %%xmm0,%%xmm7\n\t"
- "paddd %%xmm1,%%xmm7\n\t"
- :[src]"+r"(_src),[ref]"+r"(_ref),[ystride]"+r"(ystride),[m]"+Q"(m)
- );
- }
- _src+=2*ystride; /*The macro does not advance the pointers itself.*/
- _ref+=2*ystride;
- }
- __asm__ __volatile__(
- "movdqa %%xmm7,%%xmm6\n\t" /*Horizontal add of the four sums in xmm7.*/
- "punpckhqdq %%xmm7,%%xmm7\n\t"
- "paddd %%xmm6,%%xmm7\n\t"
- "pshufd $1,%%xmm7,%%xmm6\n\t"
- "paddd %%xmm6,%%xmm7\n\t"
- "movd %%xmm7,%[ret]\n\t"
- :[ret]"=a"(ret)
- );
- return ret;
- }
- /*Load an 8x8 array of pixel values from %[src] and %[ref] and compute their
- 16-bit difference (src minus ref) in %%xmm0...%%xmm7, one row per register.
- The source rows are %[src_ystride] bytes apart and the reference rows
- %[ref_ystride] bytes apart.
- All four operands are modified: %[src] and %[ref] are advanced by 8 rows,
- and %[src_ystride] and %[ref_ystride] are negated, because the last row is
- loaded at a negative offset from the already-advanced pointer.
- The first 16 bytes of %[buf] are used to hold row 0 while %%xmm0 serves as a
- temporary for the reference rows; it is reloaded by the final movdqa.*/
- #define OC_LOAD_SUB_8x8 \
- "#OC_LOAD_SUB_8x8\n\t" \
- "movq (%[src]),%%xmm0\n\t" \
- "movq (%[ref]),%%xmm4\n\t" \
- "movq (%[src],%[src_ystride]),%%xmm1\n\t" \
- "lea (%[src],%[src_ystride],2),%[src]\n\t" \
- "movq (%[ref],%[ref_ystride]),%%xmm5\n\t" \
- "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
- "movq (%[src]),%%xmm2\n\t" \
- "movq (%[ref]),%%xmm7\n\t" \
- "movq (%[src],%[src_ystride]),%%xmm3\n\t" \
- "movq (%[ref],%[ref_ystride]),%%xmm6\n\t" \
- "punpcklbw %%xmm4,%%xmm0\n\t" \
- "lea (%[src],%[src_ystride],2),%[src]\n\t" \
- "punpcklbw %%xmm4,%%xmm4\n\t" \
- "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
- "psubw %%xmm4,%%xmm0\n\t" \
- "movq (%[src]),%%xmm4\n\t" \
- "movdqa %%xmm0,"OC_MEM_OFFS(0x00,buf)"\n\t" \
- "movq (%[ref]),%%xmm0\n\t" \
- "punpcklbw %%xmm5,%%xmm1\n\t" \
- "punpcklbw %%xmm5,%%xmm5\n\t" \
- "psubw %%xmm5,%%xmm1\n\t" \
- "movq (%[src],%[src_ystride]),%%xmm5\n\t" \
- "punpcklbw %%xmm7,%%xmm2\n\t" \
- "punpcklbw %%xmm7,%%xmm7\n\t" \
- "psubw %%xmm7,%%xmm2\n\t" \
- "movq (%[ref],%[ref_ystride]),%%xmm7\n\t" \
- "punpcklbw %%xmm6,%%xmm3\n\t" \
- "lea (%[src],%[src_ystride],2),%[src]\n\t" \
- "punpcklbw %%xmm6,%%xmm6\n\t" \
- "psubw %%xmm6,%%xmm3\n\t" \
- "movq (%[src]),%%xmm6\n\t" \
- "punpcklbw %%xmm0,%%xmm4\n\t" \
- "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
- "punpcklbw %%xmm0,%%xmm0\n\t" \
- "lea (%[src],%[src_ystride],2),%[src]\n\t" \
- "psubw %%xmm0,%%xmm4\n\t" \
- "movq (%[ref]),%%xmm0\n\t" \
- "punpcklbw %%xmm7,%%xmm5\n\t" \
- "neg %[src_ystride]\n\t" \
- "punpcklbw %%xmm7,%%xmm7\n\t" \
- "psubw %%xmm7,%%xmm5\n\t" \
- "movq (%[src],%[src_ystride]),%%xmm7\n\t" \
- "punpcklbw %%xmm0,%%xmm6\n\t" \
- "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
- "punpcklbw %%xmm0,%%xmm0\n\t" \
- "neg %[ref_ystride]\n\t" \
- "psubw %%xmm0,%%xmm6\n\t" \
- "movq (%[ref],%[ref_ystride]),%%xmm0\n\t" \
- "punpcklbw %%xmm0,%%xmm7\n\t" \
- "punpcklbw %%xmm0,%%xmm0\n\t" \
- "psubw %%xmm0,%%xmm7\n\t" \
- "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm0\n\t" \
- /*Load an 8x8 array of pixel values from %[src] into %%xmm0...%%xmm7, one row
- per register, widening each byte to an unsigned 16-bit word.
- Rows 0...3 are addressed relative to %[src] and rows 4...7 relative to
- %[src4]; %[ystride3] is used as the offset of rows 3 and 7.
- None of the operands are modified.
- Rows 0...3 are unpacked against a zeroed %%xmm7.
- Since %%xmm7 then receives row 7, rows 4...7 are instead unpacked against
- themselves (duplicating each byte) and shifted right by 8.*/
- #define OC_LOAD_8x8 \
- "#OC_LOAD_8x8\n\t" \
- "movq (%[src]),%%xmm0\n\t" \
- "movq (%[src],%[ystride]),%%xmm1\n\t" \
- "movq (%[src],%[ystride],2),%%xmm2\n\t" \
- "pxor %%xmm7,%%xmm7\n\t" \
- "movq (%[src],%[ystride3]),%%xmm3\n\t" \
- "punpcklbw %%xmm7,%%xmm0\n\t" \
- "movq (%[src4]),%%xmm4\n\t" \
- "punpcklbw %%xmm7,%%xmm1\n\t" \
- "movq (%[src4],%[ystride]),%%xmm5\n\t" \
- "punpcklbw %%xmm7,%%xmm2\n\t" \
- "movq (%[src4],%[ystride],2),%%xmm6\n\t" \
- "punpcklbw %%xmm7,%%xmm3\n\t" \
- "movq (%[src4],%[ystride3]),%%xmm7\n\t" \
- "punpcklbw %%xmm4,%%xmm4\n\t" \
- "punpcklbw %%xmm5,%%xmm5\n\t" \
- "psrlw $8,%%xmm4\n\t" \
- "psrlw $8,%%xmm5\n\t" \
- "punpcklbw %%xmm6,%%xmm6\n\t" \
- "punpcklbw %%xmm7,%%xmm7\n\t" \
- "psrlw $8,%%xmm6\n\t" \
- "psrlw $8,%%xmm7\n\t" \
- /*Performs the first two stages of an 8-point 1-D Hadamard transform in place.
- Outputs 1, 3, 4, and 5 from the second stage are negated (which allows us to
- perform this stage in place with no temporary registers).
- The transform runs across %%xmm0...%%xmm7, independently in each 16-bit lane.
- Every butterfly on a pair of registers (a,b) is the three-instruction
- sequence a+=b; b+=b; b-=a, which leaves a+b in a and b-a in b.
- Stage A pairs register i with register i+4, and stage B pairs register i
- with register i+2.*/
- #define OC_HADAMARD_AB_8x8 \
- "#OC_HADAMARD_AB_8x8\n\t" \
- /*Stage A:*/ \
- "paddw %%xmm5,%%xmm1\n\t" \
- "paddw %%xmm6,%%xmm2\n\t" \
- "paddw %%xmm5,%%xmm5\n\t" \
- "paddw %%xmm6,%%xmm6\n\t" \
- "psubw %%xmm1,%%xmm5\n\t" \
- "psubw %%xmm2,%%xmm6\n\t" \
- "paddw %%xmm7,%%xmm3\n\t" \
- "paddw %%xmm4,%%xmm0\n\t" \
- "paddw %%xmm7,%%xmm7\n\t" \
- "paddw %%xmm4,%%xmm4\n\t" \
- "psubw %%xmm3,%%xmm7\n\t" \
- "psubw %%xmm0,%%xmm4\n\t" \
- /*Stage B:*/ \
- "paddw %%xmm2,%%xmm0\n\t" \
- "paddw %%xmm3,%%xmm1\n\t" \
- "paddw %%xmm6,%%xmm4\n\t" \
- "paddw %%xmm7,%%xmm5\n\t" \
- "paddw %%xmm2,%%xmm2\n\t" \
- "paddw %%xmm3,%%xmm3\n\t" \
- "paddw %%xmm6,%%xmm6\n\t" \
- "paddw %%xmm7,%%xmm7\n\t" \
- "psubw %%xmm0,%%xmm2\n\t" \
- "psubw %%xmm1,%%xmm3\n\t" \
- "psubw %%xmm4,%%xmm6\n\t" \
- "psubw %%xmm5,%%xmm7\n\t" \
- /*Performs the last stage of an 8-point 1-D Hadamard transform in place.
- Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
- place with no temporary registers).
- Stage C pairs register 2*i with register 2*i+1 (%%xmm0 with %%xmm1, and so
- on); each butterfly (a,b) is computed as a+=b; b+=b; b-=a, leaving a+b in a
- and b-a in b.*/
- #define OC_HADAMARD_C_8x8 \
- "#OC_HADAMARD_C_8x8\n\t" \
- /*Stage C:*/ \
- "paddw %%xmm1,%%xmm0\n\t" \
- "paddw %%xmm3,%%xmm2\n\t" \
- "paddw %%xmm5,%%xmm4\n\t" \
- "paddw %%xmm7,%%xmm6\n\t" \
- "paddw %%xmm1,%%xmm1\n\t" \
- "paddw %%xmm3,%%xmm3\n\t" \
- "paddw %%xmm5,%%xmm5\n\t" \
- "paddw %%xmm7,%%xmm7\n\t" \
- "psubw %%xmm0,%%xmm1\n\t" \
- "psubw %%xmm2,%%xmm3\n\t" \
- "psubw %%xmm4,%%xmm5\n\t" \
- "psubw %%xmm6,%%xmm7\n\t" \
- /*Performs an 8-point 1-D Hadamard transform in place.
- Outputs 1, 2, 4, and 7 are negated (which allows us to perform the transform
- in place with no temporary registers).
- This is stages A and B (OC_HADAMARD_AB_8x8) followed by stage C
- (OC_HADAMARD_C_8x8), operating on %%xmm0...%%xmm7.*/
- #define OC_HADAMARD_8x8 \
- OC_HADAMARD_AB_8x8 \
- OC_HADAMARD_C_8x8 \
- /*Performs the first part of the final stage of the Hadamard transform and
- summing of absolute values.
- At the end of this part, %%xmm1 will contain the DC coefficient of the
- transform.
- %%xmm6 and %%xmm7 are saved to the first 32 bytes of %[buf] so they can be
- used as a temporary and as the constant 0x7FFF; their saved values are
- reloaded into %%xmm5 and %%xmm3, respectively, at the end of this part.
- On output the (%%xmm4,%%xmm5) pair has been fully reduced into %%xmm4, while
- the (%%xmm2,%%xmm3) and (%%xmm0,%%xmm1) pairs still need the saturating add
- and subtract that OC_HADAMARD_C_ABS_ACCUM_B_8x8 performs.*/
- #define OC_HADAMARD_C_ABS_ACCUM_A_8x8 \
- /*We use the fact that \
- (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \
- to merge the final butterfly with the abs and the first stage of \
- accumulation. \
- Thus we can avoid using pabsw, which is not available until SSSE3. \
- Emulating pabsw takes 3 instructions, so the straightforward SSE2 \
- implementation would be (3+3)*8+7=55 instructions (+4 for spilling \
- registers). \
- Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
- This implementation is only 26 (+4 for spilling registers).*/ \
- "#OC_HADAMARD_C_ABS_ACCUM_A_8x8\n\t" \
- "movdqa %%xmm7,"OC_MEM_OFFS(0x10,buf)"\n\t" \
- "movdqa %%xmm6,"OC_MEM_OFFS(0x00,buf)"\n\t" \
- /*xmm7={0x7FFF}x8 (all ones shifted right by one in each 16-bit lane) \
- xmm4=max(abs(xmm4),abs(xmm5))-0x7FFF \
- This is computed as max(xmm4,xmm5) minus the saturated sum \
- xmm4+xmm5+0x7FFF.*/ \
- "pcmpeqb %%xmm7,%%xmm7\n\t" \
- "movdqa %%xmm4,%%xmm6\n\t" \
- "psrlw $1,%%xmm7\n\t" \
- "paddw %%xmm5,%%xmm6\n\t" \
- "pmaxsw %%xmm5,%%xmm4\n\t" \
- "paddsw %%xmm7,%%xmm6\n\t" \
- "psubw %%xmm6,%%xmm4\n\t" \
- /*xmm2=max(abs(xmm2),abs(xmm3))-0x7FFF \
- xmm0=max(abs(xmm0),abs(xmm1))-0x7FFF \
- (both are completed in OC_HADAMARD_C_ABS_ACCUM_B_8x8)*/ \
- "movdqa %%xmm2,%%xmm6\n\t" \
- "movdqa %%xmm0,%%xmm5\n\t" \
- "pmaxsw %%xmm3,%%xmm2\n\t" \
- "pmaxsw %%xmm1,%%xmm0\n\t" \
- "paddw %%xmm3,%%xmm6\n\t" \
- "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm3\n\t" \
- "paddw %%xmm5,%%xmm1\n\t" \
- "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm5\n\t" \
- /*Performs the second part of the final stage of the Hadamard transform and
- summing of absolute values.
- This expects the register state left by OC_HADAMARD_C_ABS_ACCUM_A_8x8:
- 0x7FFF in every word of %%xmm7, the pending sums in %%xmm6 and %%xmm1, and
- the last pair of rows in %%xmm3 and %%xmm5.
- On output, %%xmm0 holds the accumulated total for each column and every
- word of %%xmm7 is 1.*/
- #define OC_HADAMARD_C_ABS_ACCUM_B_8x8 \
- "#OC_HADAMARD_C_ABS_ACCUM_B_8x8\n\t" \
- "paddsw %%xmm7,%%xmm6\n\t" \
- "paddsw %%xmm7,%%xmm1\n\t" \
- "psubw %%xmm6,%%xmm2\n\t" \
- "psubw %%xmm1,%%xmm0\n\t" \
- /*xmm7={1}x8 (needed for the horizontal add that follows; it is 0x7FFF \
- shifted right by 14) \
- xmm0+=xmm2+xmm4+max(abs(xmm3),abs(xmm5))-0x7FFF*/ \
- "movdqa %%xmm3,%%xmm6\n\t" \
- "pmaxsw %%xmm5,%%xmm3\n\t" \
- "paddw %%xmm2,%%xmm0\n\t" \
- "paddw %%xmm5,%%xmm6\n\t" \
- "paddw %%xmm4,%%xmm0\n\t" \
- "paddsw %%xmm7,%%xmm6\n\t" \
- "paddw %%xmm3,%%xmm0\n\t" \
- "psrlw $14,%%xmm7\n\t" \
- "psubw %%xmm6,%%xmm0\n\t" \
- /*Performs the last stage of an 8-point 1-D Hadamard transform, takes the
- absolute value of each component, and accumulates everything into xmm0.
- This is just parts A and B back to back; callers that need the DC
- coefficient invoke the two parts separately and read xmm1 in between.*/
- #define OC_HADAMARD_C_ABS_ACCUM_8x8 \
- OC_HADAMARD_C_ABS_ACCUM_A_8x8 \
- OC_HADAMARD_C_ABS_ACCUM_B_8x8 \
- /*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each
- component, and accumulates everything into xmm0.
- Note that xmm0 will have an extra 4 added to each column, and that after
- removing this value, the remainder will be half the conventional value.
- The functions in this file undo both effects on the final scalar sum by
- doubling it and subtracting 64.*/
- #define OC_HADAMARD_ABS_ACCUM_8x8 \
- OC_HADAMARD_AB_8x8 \
- OC_HADAMARD_C_ABS_ACCUM_8x8
- static unsigned oc_int_frag_satd_sse2(int *_dc,
- const unsigned char *_src,int _src_ystride,
- const unsigned char *_ref,int _ref_ystride){ /*
- Computes the sum of the absolute values of the 2-D 8x8 Hadamard transform of
- the difference between a block of _src and a block of _ref, with the DC
- coefficient removed from the sum and returned separately.
- _dc: Receives the DC coefficient, sign-extended from 16 bits.
- _src: The top-left pixel of the source block.
- _src_ystride: The offset between rows of the source block, in bytes.
- _ref: The top-left pixel of the reference block.
- _ref_ystride: The offset between rows of the reference block, in bytes.
- Return: 2*sum-64-abs(dc), where sum is the total accumulated in xmm0.*/
- OC_ALIGN16(ogg_int16_t buf[16]);
- unsigned ret;
- unsigned ret2;
- int dc;
- __asm__ __volatile__(
- OC_LOAD_SUB_8x8
- OC_HADAMARD_8x8
- OC_TRANSPOSE_8x8 /*Not defined in this file; presumably from sse2trans.h.*/
- /*We split out the stages here so we can save the DC coefficient in the
- middle.*/
- OC_HADAMARD_AB_8x8
- OC_HADAMARD_C_ABS_ACCUM_A_8x8
- "movd %%xmm1,%[dc]\n\t"
- OC_HADAMARD_C_ABS_ACCUM_B_8x8
- /*Up to this point, everything fit in 16 bits (8 input + 1 for the
- difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
- for the factor of two we dropped + 3 for the vertical accumulation).
- Now we finally have to promote things to dwords.
- We break this part out of OC_HADAMARD_ABS_ACCUM_8x8 to hide the long
- latency of pmaddwd by starting to compute abs(dc) here.*/
- "pmaddwd %%xmm7,%%xmm0\n\t" /*xmm7 is all ones: adds adjacent words.*/
- "movsx %w[dc],%[dc]\n\t" /*Only the low 16 bits of the movd are the DC.*/
- "cdq\n\t" /*%[ret2] (edx) becomes -1 if dc is negative, and 0 otherwise.*/
- "movdqa %%xmm0,%%xmm1\n\t"
- "punpckhqdq %%xmm0,%%xmm0\n\t"
- "paddd %%xmm1,%%xmm0\n\t"
- "pshuflw $0xE,%%xmm0,%%xmm1\n\t"
- "paddd %%xmm1,%%xmm0\n\t"
- "movd %%xmm0,%[ret]\n\t"
- /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x8 each have an extra 4
- added to them, a factor of two removed, and the DC value included;
- correct the final sum here.
- With s being the sign mask in %[ret2], abs(dc) is (dc^s)-s, so the three
- instructions below compute 2*ret-64+s-(dc^s)=2*ret-64-abs(dc).*/
- "lea -64(%[ret2],%[ret],2),%[ret]\n\t"
- "xor %[dc],%[ret2]\n\t"
- "sub %[ret2],%[ret]\n\t"
- /*Although it looks like we're using 7 registers here, gcc can alias %[ret]
- and %[dc] with some of the inputs, since for once we don't write to
- them until after we're done using everything but %[buf].*/
- /*Note that _src_ystride and _ref_ystride must be given non-overlapping
- constraints, otherwise if gcc can prove they're equal it will allocate
- them to the same register (which is bad); _src and _ref face a similar
- problem.
- All four are destructively modified, but if we list them as output
- constraints, gcc can't alias them with other outputs.*/
- :[ret]"=r"(ret),[ret2]"=d"(ret2),[dc]"=a"(dc),
- [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16))
- :[src]"S"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride),
- [ref]"a"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride)
- /*We have to use neg, so we actually clobber the condition codes for once
- (not to mention sub, and add).*/
- :"cc"
- );
- *_dc=dc;
- return ret;
- }
- /*Computes the Hadamard-domain SATD of an 8x8 source block against a single
- reference block when both use the same row stride.
- This forwards to oc_int_frag_satd_sse2(), passing _ystride as both the source
- and the reference stride; _dc and the return value are exactly what that
- function produces.*/
- unsigned oc_enc_frag_satd_sse2(int *_dc,const unsigned char *_src,
- const unsigned char *_ref,int _ystride){
- unsigned satd;
- satd=oc_int_frag_satd_sse2(_dc,_src,_ystride,_ref,_ystride);
- return satd;
- }
- /*Computes the Hadamard-domain SATD of an 8x8 source block against a
- prediction built from two reference blocks.
- oc_int_frag_copy2_mmxext() (defined elsewhere) first fills a packed 64-byte
- scratch block, with a row stride of 8, from _ref1 and _ref2 - presumably by
- averaging them; confirm against its definition.
- That scratch block is then used as the reference for
- oc_int_frag_satd_sse2(), which also supplies *_dc and the return value.*/
- unsigned oc_enc_frag_satd2_sse2(int *_dc,const unsigned char *_src,
- const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
- OC_ALIGN8(unsigned char pred[64]);
- unsigned satd;
- oc_int_frag_copy2_mmxext(pred,8,_ref1,_ref2,_ystride);
- satd=oc_int_frag_satd_sse2(_dc,_src,_ystride,pred,8);
- return satd;
- }
- unsigned oc_enc_frag_intra_satd_sse2(int *_dc,
- const unsigned char *_src,int _ystride){ /*
- Computes the sum of the absolute values of the 2-D 8x8 Hadamard transform of
- the pixel values of a block of _src (no reference block is subtracted), with
- the DC coefficient removed from the sum and returned separately.
- _dc: Receives the DC coefficient, zero-extended from 16 bits.
- _src: The top-left pixel of the source block.
- _ystride: The offset between rows, in bytes.
- Return: 2*sum-64-dc, where sum is the total accumulated in xmm0.*/
- OC_ALIGN16(ogg_int16_t buf[16]);
- unsigned ret;
- int dc;
- __asm__ __volatile__(
- OC_LOAD_8x8
- OC_HADAMARD_8x8
- OC_TRANSPOSE_8x8 /*Not defined in this file; presumably from sse2trans.h.*/
- /*We split out the stages here so we can save the DC coefficient in the
- middle.*/
- OC_HADAMARD_AB_8x8
- OC_HADAMARD_C_ABS_ACCUM_A_8x8
- "movd %%xmm1,%[dc]\n\t"
- OC_HADAMARD_C_ABS_ACCUM_B_8x8
- /*Up to this point, everything fit in 16 bits (8 input + 1 for the
- difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
- for the factor of two we dropped + 3 for the vertical accumulation).
- Now we finally have to promote things to dwords.
- NOTE(review): this accounting matches oc_int_frag_satd_sse2(); OC_LOAD_8x8
- performs no subtraction, so the "+ 1 for the difference" term does not
- seem to apply here.*/
- "pmaddwd %%xmm7,%%xmm0\n\t" /*xmm7 is all ones: adds adjacent words.*/
- /*We assume that the DC coefficient is always positive (which is true,
- because the input to the INTRA transform was not a difference).*/
- "movzx %w[dc],%[dc]\n\t"
- "movdqa %%xmm0,%%xmm1\n\t"
- "punpckhqdq %%xmm0,%%xmm0\n\t"
- "paddd %%xmm1,%%xmm0\n\t"
- "pshuflw $0xE,%%xmm0,%%xmm1\n\t"
- "paddd %%xmm1,%%xmm0\n\t"
- "movd %%xmm0,%[ret]\n\t"
- "lea -64(%[ret],%[ret]),%[ret]\n\t" /*ret=2*ret-64*/
- "sub %[dc],%[ret]\n\t" /*Remove the DC value from the sum.*/
- /*Although it looks like we're using 7 registers here, gcc can alias %[ret]
- and %[dc] with some of the inputs, since for once we don't write to
- them until after we're done using everything but %[buf].*/
- :[ret]"=a"(ret),[dc]"=r"(dc),
- [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16))
- :[src]"r"(_src),[src4]"r"(_src+4*_ystride),
- [ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride)
- /*We have to use sub, so we actually clobber the condition codes for once.*/
- :"cc"
- );
- *_dc=dc;
- return ret;
- }
- #endif
|