/* |
* (C) Copyright IBM Corporation 2004 |
* All Rights Reserved. |
* |
* Permission is hereby granted, free of charge, to any person obtaining a |
* copy of this software and associated documentation files (the "Software"), |
* to deal in the Software without restriction, including without limitation |
* on the rights to use, copy, modify, merge, publish, distribute, sub |
* license, and/or sell copies of the Software, and to permit persons to whom |
* the Software is furnished to do so, subject to the following conditions: |
* |
* The above copyright notice and this permission notice (including the next |
* paragraph) shall be included in all copies or substantial portions of the |
* Software. |
* |
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL |
* IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, |
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR |
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE |
* USE OR OTHER DEALINGS IN THE SOFTWARE. |
*/ |
|
/** |
* \file read_rgba_span_x86.S |
* Optimized routines to transfer pixel data from the framebuffer to a |
* buffer in main memory. |
* |
* \author Ian Romanick <idr@us.ibm.com> |
*/ |
|
.file "read_rgba_span_x86.S" |
#if !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) /* this one cries for assyntax.h */ |
/* Kevin F. Quinn 2nd July 2006 |
* Replaced data segment constants with text-segment instructions. |
*/ |
#define LOAD_MASK(mvins,m1,m2) \ |
pushl $0xff00ff00 ;\ |
pushl $0xff00ff00 ;\ |
pushl $0xff00ff00 ;\ |
pushl $0xff00ff00 ;\ |
mvins (%esp), m1 ;\ |
pushl $0x00ff0000 ;\ |
pushl $0x00ff0000 ;\ |
pushl $0x00ff0000 ;\ |
pushl $0x00ff0000 ;\ |
mvins (%esp), m2 ;\ |
addl $32, %esp |
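
/* LOAD_MASK builds the two byte-select constants on the stack and loads
 * them into a pair of media registers, avoiding a data-segment reference
 * (handy for position-independent code).  Roughly, in C (names are
 * illustrative only):
 *
 *     uint32_t m1[4] = { 0xff00ff00, 0xff00ff00, 0xff00ff00, 0xff00ff00 };
 *     uint32_t m2[4] = { 0x00ff0000, 0x00ff0000, 0x00ff0000, 0x00ff0000 };
 *
 * The movq (MMX) users read only the low 64 bits of each constant; the
 * movdqu (SSE2) user reads all 128.
 */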
|
/* I implemented these as macros because they appear in several places, |
* and I've tweaked them a number of times. I got tired of changing every |
* place they appear. :) |
*/ |
|
#define DO_ONE_PIXEL() \ |
movl (%ebx), %eax ; \ |
addl $4, %ebx ; \ |
bswap %eax /* ARGB -> BGRA */ ; \ |
rorl $8, %eax /* BGRA -> ABGR */ ; \ |
movl %eax, (%ecx) /* ABGR -> R, G, B, A */ ; \ |
addl $4, %ecx |
|
#define DO_ONE_LAST_PIXEL() \ |
movl (%ebx), %eax ; \ |
bswap %eax /* ARGB -> BGRA */ ; \ |
rorl $8, %eax /* BGRA -> ABGR */ ; \ |
movl %eax, (%ecx) /* ABGR -> R, G, B, A */
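
/* Both macros implement the same per-pixel swizzle.  A minimal C sketch
 * of the little-endian dword transform (hypothetical helper, not part of
 * this file):
 *
 *     uint32_t bgra_rev_to_rgba(uint32_t p)   // p = 0xAARRGGBB in a reg
 *     {
 *         p = __builtin_bswap32(p);           // -> 0xBBGGRRAA
 *         return (p >> 8) | (p << 24);        // ror 8 -> 0xAABBGGRR
 *     }
 *
 * Stored little-endian, 0xAABBGGRR is the byte sequence R, G, B, A.
 */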
|
|
/** |
* MMX optimized version of the BGRA8888_REV to RGBA copy routine. |
* |
* \warning |
* This function assumes that the caller will issue the EMMS instruction |
* at the correct places. |
*/ |
|
.globl _generic_read_RGBA_span_BGRA8888_REV_MMX |
.hidden _generic_read_RGBA_span_BGRA8888_REV_MMX |
.type _generic_read_RGBA_span_BGRA8888_REV_MMX, @function |
_generic_read_RGBA_span_BGRA8888_REV_MMX: |
pushl %ebx |
|
#ifdef USE_INNER_EMMS |
emms |
#endif |
LOAD_MASK(movq,%mm1,%mm2) |
|
movl 8(%esp), %ebx /* source pointer */ |
movl 16(%esp), %edx /* number of pixels to copy */ |
movl 12(%esp), %ecx /* destination pointer */ |
|
testl %edx, %edx |
jle .L20 /* Bail if there's nothing to do. */ |
|
/* If the source is not 8-byte aligned, process one pixel up front so
 * the two-pixel loop below operates on aligned data.
 */

movl %ebx, %eax

negl %eax
sarl $2, %eax
andl $1, %eax /* 1 if the source is 4-byte but not 8-byte aligned */
je .L17
|
subl %eax, %edx |
DO_ONE_PIXEL() |
.L17: |
|
/* Would it be faster to unroll this loop once and process 4 pixels |
* per pass, instead of just two? |
*/ |
|
movl %edx, %eax |
shrl %eax |
jmp .L18 |
.L19: |
movq (%ebx), %mm0 |
addl $8, %ebx |
|
/* These 9 instructions do what PSHUFB (not introduced until SSSE3,
 * after this was written) could do in 1. :(
 */
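
/* Per 32-bit pixel, the mask/shift/or sequence below computes (a hedged
 * C sketch of the same dataflow):
 *
 *     out = (x & 0xff00ff00)            // keep A and G in place
 *         | ((x & 0x00ff0000) >> 16)    // move R down to byte 0
 *         | ((x << 16) & 0x00ff0000);   // move B up to byte 2
 *
 * The 64-bit shifts let bytes cross the pixel boundary, but the
 * following pand with the per-dword mask discards those bits.
 */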
|
movq %mm0, %mm3 |
movq %mm0, %mm4 |
|
pand %mm2, %mm3 |
psllq $16, %mm4 |
psrlq $16, %mm3 |
pand %mm2, %mm4 |
|
pand %mm1, %mm0 |
por %mm4, %mm3 |
por %mm3, %mm0 |
|
movq %mm0, (%ecx) |
addl $8, %ecx |
subl $1, %eax |
.L18: |
jne .L19 |
|
#ifdef USE_INNER_EMMS |
emms |
#endif |
|
/* At this point there are either 1 or 0 pixels remaining to be |
* converted. Convert the last pixel, if needed. |
*/ |
|
testl $1, %edx |
je .L20 |
|
DO_ONE_LAST_PIXEL() |
|
.L20: |
popl %ebx |
ret |
.size _generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX |
|
|
/** |
* SSE optimized version of the BGRA8888_REV to RGBA copy routine. SSE |
* instructions are only actually used to read data from the framebuffer. |
* In practice, the speed-up is pretty small. |
* |
* \todo |
* Do some more testing and determine if there's any reason to have this |
* function in addition to the MMX version. |
* |
* \warning |
* This function assumes that the caller will issue the EMMS instruction |
* at the correct places. |
*/ |
|
.globl _generic_read_RGBA_span_BGRA8888_REV_SSE |
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE |
.type _generic_read_RGBA_span_BGRA8888_REV_SSE, @function |
_generic_read_RGBA_span_BGRA8888_REV_SSE: |
pushl %esi |
pushl %ebx |
pushl %ebp |
|
#ifdef USE_INNER_EMMS |
emms |
#endif |
|
LOAD_MASK(movq,%mm1,%mm2) |
|
movl 16(%esp), %ebx /* source pointer */ |
movl 24(%esp), %edx /* number of pixels to copy */ |
movl 20(%esp), %ecx /* destination pointer */ |
|
testl %edx, %edx |
jle .L35 /* Bail if there's nothing to do. */ |
|
/* Carve out a 16-byte aligned scratch area on the stack so the movaps
 * spill in the main loop below cannot fault.
 */

movl %esp, %ebp
subl $16, %esp
andl $0xfffffff0, %esp

movl %ebx, %eax
movl %edx, %esi

negl %eax
andl $15, %eax /* bytes until the next 16-byte boundary */
sarl $2, %eax /* ... as a whole number of pixels */
cmpl %edx, %eax
cmovle %eax, %esi /* esi = min(alignment pixels, pixel count) */

subl %esi, %edx
|
testl $1, %esi |
je .L32 |
|
DO_ONE_PIXEL() |
.L32: |
|
testl $2, %esi |
je .L31 |
|
movq (%ebx), %mm0 |
addl $8, %ebx |
|
movq %mm0, %mm3 |
movq %mm0, %mm4 |
|
pand %mm2, %mm3 |
psllq $16, %mm4 |
psrlq $16, %mm3 |
pand %mm2, %mm4 |
|
pand %mm1, %mm0 |
por %mm4, %mm3 |
por %mm3, %mm0 |
|
movq %mm0, (%ecx) |
addl $8, %ecx |
.L31: |
|
movl %edx, %eax |
shrl $2, %eax |
jmp .L33 |
.L34: |
movaps (%ebx), %xmm0 |
addl $16, %ebx |
|
/* This would be so much better if we could just move directly from |
* an SSE register to an MMX register. Unfortunately, that |
* functionality wasn't introduced until SSE2 with the MOVDQ2Q |
* instruction. |
*/ |
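
/* (With SSE2 one could instead do movdq2q %xmm0, %mm0, then
 * psrldq $8, %xmm0 and a second movdq2q to fetch the high half,
 * but this routine must also run on SSE-only CPUs.)
 */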
|
movaps %xmm0, (%esp) |
movq (%esp), %mm0 |
movq 8(%esp), %mm5 |
|
movq %mm0, %mm3 |
movq %mm0, %mm4 |
movq %mm5, %mm6 |
movq %mm5, %mm7 |
|
pand %mm2, %mm3 |
pand %mm2, %mm6 |
|
psllq $16, %mm4 |
psllq $16, %mm7 |
|
psrlq $16, %mm3 |
psrlq $16, %mm6 |
|
pand %mm2, %mm4 |
pand %mm2, %mm7 |
|
pand %mm1, %mm0 |
pand %mm1, %mm5 |
|
por %mm4, %mm3 |
por %mm7, %mm6 |
|
por %mm3, %mm0 |
por %mm6, %mm5 |
|
movq %mm0, (%ecx) |
movq %mm5, 8(%ecx) |
addl $16, %ecx |
|
subl $1, %eax |
.L33: |
jne .L34 |
|
#ifdef USE_INNER_EMMS |
emms |
#endif |
movl %ebp, %esp |
|
/* At this point there are between 0 and 3 pixels remaining to be
 * converted.
 */
|
testl $2, %edx |
je .L36 |
|
movq (%ebx), %mm0 |
addl $8, %ebx |
|
movq %mm0, %mm3 |
movq %mm0, %mm4 |
|
pand %mm2, %mm3 |
psllq $16, %mm4 |
psrlq $16, %mm3 |
pand %mm2, %mm4 |
|
pand %mm1, %mm0 |
por %mm4, %mm3 |
por %mm3, %mm0 |
|
movq %mm0, (%ecx) |
addl $8, %ecx |
.L36: |
|
testl $1, %edx |
je .L35 |
|
DO_ONE_LAST_PIXEL() |
.L35: |
popl %ebp |
popl %ebx |
popl %esi |
ret |
.size _generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE |
|
|
/** |
* SSE2 optimized version of the BGRA8888_REV to RGBA copy routine. |
*/ |
|
.text |
.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2 |
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2 |
.type _generic_read_RGBA_span_BGRA8888_REV_SSE2, @function |
_generic_read_RGBA_span_BGRA8888_REV_SSE2: |
pushl %esi |
pushl %ebx |
|
LOAD_MASK(movdqu,%xmm1,%xmm2) |
|
movl 12(%esp), %ebx /* source pointer */ |
movl 20(%esp), %edx /* number of pixels to copy */ |
movl 16(%esp), %ecx /* destination pointer */ |
|
movl %ebx, %eax |
movl %edx, %esi |
|
testl %edx, %edx |
jle .L46 /* Bail if there's nothing to do. */ |
|
/* If the source pointer isn't a multiple of 16, we have to process
 * a few pixels the "slow" way to get the address aligned for
 * the SSE fetch instructions.
 */
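
/* In C terms, the number of leading "slow" pixels is roughly
 * (illustrative sketch; src is assumed to be at least 4-byte aligned):
 *
 *     head = (unsigned)(-(uintptr_t)src & 15) >> 2;  // pixels to 16B
 *     if (head > n) head = n;                        // clamp to the span
 *     n -= head;
 */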
|
negl %eax |
andl $15, %eax |
sarl $2, %eax |
|
cmpl %edx, %eax |
cmovbe %eax, %esi |
subl %esi, %edx |
|
testl $1, %esi |
je .L41 |
|
DO_ONE_PIXEL() |
.L41: |
testl $2, %esi |
je .L40 |
|
movq (%ebx), %xmm0 |
addl $8, %ebx |
|
movdqa %xmm0, %xmm3 |
movdqa %xmm0, %xmm4 |
andps %xmm1, %xmm0 |
|
andps %xmm2, %xmm3 |
pslldq $2, %xmm4 |
psrldq $2, %xmm3 |
andps %xmm2, %xmm4 |
|
orps %xmm4, %xmm3 |
orps %xmm3, %xmm0 |
|
movq %xmm0, (%ecx) |
addl $8, %ecx |
.L40: |
|
/* Would it be worth having a specialized version of this loop for
 * the case where the destination is 16-byte aligned? That version
 * would be identical except that it could use movdqa instead of
 * movdqu.
 */
|
movl %edx, %eax |
shrl $2, %eax |
jmp .L42 |
.L43: |
movdqa (%ebx), %xmm0 |
addl $16, %ebx |
|
movdqa %xmm0, %xmm3 |
movdqa %xmm0, %xmm4 |
andps %xmm1, %xmm0 |
|
andps %xmm2, %xmm3 |
pslldq $2, %xmm4 |
psrldq $2, %xmm3 |
andps %xmm2, %xmm4 |
|
orps %xmm4, %xmm3 |
orps %xmm3, %xmm0 |
|
movdqu %xmm0, (%ecx) |
addl $16, %ecx |
subl $1, %eax |
.L42: |
jne .L43 |
|
|
/* There may be up to 3 pixels remaining to be copied. Take care
 * of them now. We do the 2 pixel case first because the data
 * will be aligned.
 */
|
testl $2, %edx |
je .L47 |
|
movq (%ebx), %xmm0 |
addl $8, %ebx |
|
movdqa %xmm0, %xmm3 |
movdqa %xmm0, %xmm4 |
andps %xmm1, %xmm0 |
|
andps %xmm2, %xmm3 |
pslldq $2, %xmm4 |
psrldq $2, %xmm3 |
andps %xmm2, %xmm4 |
|
orps %xmm4, %xmm3 |
orps %xmm3, %xmm0 |
|
movq %xmm0, (%ecx) |
addl $8, %ecx |
.L47: |
|
testl $1, %edx |
je .L46 |
|
DO_ONE_LAST_PIXEL() |
.L46: |
|
popl %ebx |
popl %esi |
ret |
.size _generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2 |
|
|
|
#define MASK_565_L 0x07e0f800 |
#define MASK_565_H 0x0000001f |
/* Setting SCALE_ADJUST to 5 gives a perfect match with the |
* classic C implementation in Mesa. Setting SCALE_ADJUST |
* to 0 is slightly faster but at a small cost to accuracy. |
*/ |
#define SCALE_ADJUST 5 |
#if SCALE_ADJUST == 5 |
#define PRESCALE_L 0x00100001 |
#define PRESCALE_H 0x00000200 |
#define SCALE_L 0x40C620E8 |
#define SCALE_H 0x0000839d |
#elif SCALE_ADJUST == 0 |
#define PRESCALE_L 0x00200001 |
#define PRESCALE_H 0x00000800 |
#define SCALE_L 0x01040108 |
#define SCALE_H 0x00000108 |
#else |
#error SCALE_ADJUST must be either 5 or 0.
#endif |
#define ALPHA_L 0x00000000 |
#define ALPHA_H 0x00ff0000 |
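
/* Together these constants implement a per-word fixed-point expansion of
 * each 5/6-bit component to 8 bits, roughly c8 = c5 * 255 / 31 (and
 * c8 = c6 * 255 / 63 for green).  A hedged C model of the dataflow for
 * one pixel on the SCALE_ADJUST == 5 path (all names are illustrative,
 * not from the original source):
 *
 *     uint16_t w[4] = { p & 0xf800, p & 0x07e0, p & 0x001f, 0 };
 *     const uint16_t pre[4]   = { 0x0001, 0x0010, 0x0200, 0x0000 };
 *     const uint16_t scale[4] = { 0x20e8, 0x40c6, 0x839d, 0x0000 };
 *
 *     for (int i = 0; i < 4; i++) {
 *         w[i] = (uint16_t)(w[i] * pre[i]) >> 5;     // pmullw, psrlw
 *         w[i] = ((uint32_t)w[i] * scale[i]) >> 16;  // pmulhuw
 *     }
 *     w[3] |= 0x00ff;                                // force alpha = 0xff
 *     // packuswb then emits the bytes R, G, B, 0xff
 */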
|
/** |
* MMX optimized version of the RGB565 to RGBA copy routine. |
*/ |
|
.text |
.globl _generic_read_RGBA_span_RGB565_MMX |
.hidden _generic_read_RGBA_span_RGB565_MMX |
.type _generic_read_RGBA_span_RGB565_MMX, @function |
|
_generic_read_RGBA_span_RGB565_MMX: |
|
#ifdef USE_INNER_EMMS |
emms |
#endif |
|
movl 4(%esp), %eax /* source pointer */ |
movl 8(%esp), %edx /* destination pointer */ |
movl 12(%esp), %ecx /* number of pixels to copy */ |
|
pushl $MASK_565_H |
pushl $MASK_565_L |
movq (%esp), %mm5 |
pushl $PRESCALE_H |
pushl $PRESCALE_L |
movq (%esp), %mm6 |
pushl $SCALE_H |
pushl $SCALE_L |
movq (%esp), %mm7 |
pushl $ALPHA_H |
pushl $ALPHA_L |
movq (%esp), %mm3 |
addl $32,%esp |
|
sarl $2, %ecx |
jl .L01 /* Bail early if the count is negative. */ |
jmp .L02 |
|
.L03: |
/* Fetch 4 RGB565 pixels into %mm4. Distribute the first and |
* second pixels into the four words of %mm0 and %mm2. |
*/ |
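
/* The pshufw selectors pick source words two bits at a time: 0x00
 * broadcasts word 0 (pixel 0) and 0x55 broadcasts word 1 (pixel 1),
 * i.e. roughly (C sketch, names illustrative):
 *
 *     mm0 = (uint16_t[4]){ p0, p0, p0, p0 };
 *     mm2 = (uint16_t[4]){ p1, p1, p1, p1 };
 *
 * (0xaa and 0xff below select pixels 2 and 3 the same way.)
 */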
|
movq (%eax), %mm4 |
addl $8, %eax |
|
pshufw $0x00, %mm4, %mm0 |
pshufw $0x55, %mm4, %mm2 |
|
|
/* Mask the pixels so that each word of each register contains only |
* one color component. |
*/ |
|
pand %mm5, %mm0 |
pand %mm5, %mm2 |
|
|
/* Adjust the component values so that they are as small as possible, |
* but large enough so that we can multiply them by an unsigned 16-bit |
* number and get a value as large as 0x00ff0000. |
*/ |
|
pmullw %mm6, %mm0 |
pmullw %mm6, %mm2 |
#if SCALE_ADJUST > 0 |
psrlw $SCALE_ADJUST, %mm0 |
psrlw $SCALE_ADJUST, %mm2 |
#endif |
|
/* Scale the input component values to be on the range
 * [0, 0x00ff0000]. This is the real magic of the whole routine.
 */
|
pmulhuw %mm7, %mm0 |
pmulhuw %mm7, %mm2 |
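
/* Worked example, SCALE_ADJUST == 5, red at full scale: R5 = 31 is
 * masked to 31 << 11; pmullw/psrlw leave 31 << 6 = 1984; pmulhuw then
 * gives (1984 * 0x20E8) >> 16 = 255, matching 31 * 255 / 31.
 */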
|
|
/* Always set the alpha value to 0xff. |
*/ |
|
por %mm3, %mm0 |
por %mm3, %mm2 |
|
|
/* Pack the 16-bit values to 8-bit values and store the converted |
* pixel data. |
*/ |
|
packuswb %mm2, %mm0 |
movq %mm0, (%edx) |
addl $8, %edx |
|
pshufw $0xaa, %mm4, %mm0 |
pshufw $0xff, %mm4, %mm2 |
|
pand %mm5, %mm0 |
pand %mm5, %mm2 |
pmullw %mm6, %mm0 |
pmullw %mm6, %mm2 |
#if SCALE_ADJUST > 0 |
psrlw $SCALE_ADJUST, %mm0 |
psrlw $SCALE_ADJUST, %mm2 |
#endif |
pmulhuw %mm7, %mm0 |
pmulhuw %mm7, %mm2 |
|
por %mm3, %mm0 |
por %mm3, %mm2 |
|
packuswb %mm2, %mm0 |
|
movq %mm0, (%edx) |
addl $8, %edx |
|
subl $1, %ecx |
.L02: |
jne .L03 |
|
|
/* At this point there can be at most 3 pixels left to process. If
 * there are either 2 or 3 left, process 2.
 */
|
movl 12(%esp), %ecx |
testl $0x02, %ecx |
je .L04 |
|
movd (%eax), %mm4 |
addl $4, %eax |
|
pshufw $0x00, %mm4, %mm0 |
pshufw $0x55, %mm4, %mm2 |
|
pand %mm5, %mm0 |
pand %mm5, %mm2 |
pmullw %mm6, %mm0 |
pmullw %mm6, %mm2 |
#if SCALE_ADJUST > 0 |
psrlw $SCALE_ADJUST, %mm0 |
psrlw $SCALE_ADJUST, %mm2 |
#endif |
pmulhuw %mm7, %mm0 |
pmulhuw %mm7, %mm2 |
|
por %mm3, %mm0 |
por %mm3, %mm2 |
|
packuswb %mm2, %mm0 |
|
movq %mm0, (%edx) |
addl $8, %edx |
|
.L04: |
/* At this point there can be at most 1 pixel left to process. |
* Process it if needed. |
*/ |
|
testl $0x01, %ecx |
je .L01 |
|
movzwl (%eax), %ecx |
movd %ecx, %mm4 |
|
pshufw $0x00, %mm4, %mm0 |
|
pand %mm5, %mm0 |
pmullw %mm6, %mm0 |
#if SCALE_ADJUST > 0 |
psrlw $SCALE_ADJUST, %mm0 |
#endif |
pmulhuw %mm7, %mm0 |
|
por %mm3, %mm0 |
|
packuswb %mm0, %mm0 |
|
movd %mm0, (%edx) |
|
.L01: |
#ifdef USE_INNER_EMMS |
emms |
#endif |
ret |
#endif /* !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) */ |
|
#if defined (__ELF__) && defined (__linux__) |
.section .note.GNU-stack,"",%progbits |
#endif |