/*
 * (C) Copyright IBM Corporation 2004
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * \file read_rgba_span_x86.S
 * Optimized routines to transfer pixel data from the framebuffer to a
 * buffer in main memory.
 *
 * \author Ian Romanick <idr@us.ibm.com>
 */

        .file   "read_rgba_span_x86.S"
#if !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) /* this one cries for assyntax.h */
/* Kevin F. Quinn 2nd July 2006
 * Replaced data segment constants with text-segment instructions.
 */
#define LOAD_MASK(mvins,m1,m2) \
        pushl   $0xff00ff00 ;\
        pushl   $0xff00ff00 ;\
        pushl   $0xff00ff00 ;\
        pushl   $0xff00ff00 ;\
        mvins   (%esp), m1      ;\
        pushl   $0x00ff0000 ;\
        pushl   $0x00ff0000 ;\
        pushl   $0x00ff0000 ;\
        pushl   $0x00ff0000 ;\
        mvins   (%esp), m2      ;\
        addl    $32, %esp
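
/* In C terms, LOAD_MASK materializes two constants (hypothetical names;
 * a sketch of the data the pushes build on the stack):
 *
 *     static const uint32_t mask_keep_bytes_1_3[4] =
 *         { 0xff00ff00, 0xff00ff00, 0xff00ff00, 0xff00ff00 };
 *     static const uint32_t mask_keep_byte_2[4] =
 *         { 0x00ff0000, 0x00ff0000, 0x00ff0000, 0x00ff0000 };
 *
 * m1 receives the first mask and m2 the second; a movq load uses only the
 * low 8 of the 16 pushed bytes, movdqu uses all 16, and the final
 * addl $32, %esp discards the scratch area in either case.
 */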

/* I implemented these as macros because they appear in several places,
 * and I've tweaked them a number of times.  I got tired of changing every
 * place they appear. :)
 */

#define DO_ONE_PIXEL() \
        movl    (%ebx), %eax ; \
        addl    $4, %ebx ; \
        bswap   %eax          /* ARGB -> BGRA */ ; \
        rorl    $8, %eax      /* BGRA -> ABGR */ ; \
        movl    %eax, (%ecx)  /* ABGR -> R, G, B, A */ ; \
        addl    $4, %ecx

#define DO_ONE_LAST_PIXEL() \
        movl    (%ebx), %eax ; \
        bswap   %eax          /* ARGB -> BGRA */ ; \
        rorl    $8, %eax      /* BGRA -> ABGR */ ; \
        movl    %eax, (%ecx)  /* ABGR -> R, G, B, A */

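/* A C sketch of the conversion the two macros above perform (the helper
 * name is illustrative, not part of this file).  The loaded dword is
 * byte-swapped and then rotated right by 8 bits, which leaves the bytes
 * in R, G, B, A order once stored to memory:
 *
 *     #include <stdint.h>
 *
 *     static inline uint32_t bgra8888_rev_to_rgba(uint32_t src)
 *     {
 *         uint32_t t = __builtin_bswap32(src);   // ARGB -> BGRA
 *         return (t >> 8) | (t << 24);           // BGRA -> ABGR
 *     }
 */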


/**
 * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
#ifndef USE_DRICORE
.hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
#endif
        .type   _generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
        pushl   %ebx

#ifdef USE_INNER_EMMS
        emms
#endif
        LOAD_MASK(movq,%mm1,%mm2)

        movl    8(%esp), %ebx   /* source pointer */
        movl    16(%esp), %edx  /* number of pixels to copy */
        movl    12(%esp), %ecx  /* destination pointer */

        testl   %edx, %edx
        jle     .L20            /* Bail if there's nothing to do. */

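        /* The neg/sar/and sequence below computes ((0 - src) >> 2) & 1:
         * %eax is 1 when the source is only 4-byte aligned and one pixel
         * must be converted the slow way before the 8-byte movq loads.
         */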
        movl    %ebx, %eax

        negl    %eax
        sarl    $2, %eax
        andl    $1, %eax
        je      .L17

        subl    %eax, %edx
        DO_ONE_PIXEL()
.L17:

        /* Would it be faster to unroll this loop once and process 4 pixels
         * per pass, instead of just two?
         */

        movl    %edx, %eax
        shrl    %eax
        jmp     .L18
.L19:
        movq    (%ebx), %mm0
        addl    $8, %ebx

        /* These 9 instructions do what PSHUFB (if there were such an
         * instruction) could do in 1. :(
         */
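        /* Per 32-bit pixel, the swizzle below swaps bytes 0 and 2 and
         * keeps bytes 1 and 3.  With mm1 = 0xff00ff00 and mm2 = 0x00ff0000
         * in each dword, it is roughly:
         *
         *     y = (x & 0xff00ff00)
         *       | ((x >> 16) & 0x000000ff)
         *       | ((x << 16) & 0x00ff0000);
         */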

        movq    %mm0, %mm3
        movq    %mm0, %mm4

        pand    %mm2, %mm3
        psllq   $16, %mm4
        psrlq   $16, %mm3
        pand    %mm2, %mm4

        pand    %mm1, %mm0
        por     %mm4, %mm3
        por     %mm3, %mm0

        movq    %mm0, (%ecx)
        addl    $8, %ecx
        subl    $1, %eax
.L18:
        jne     .L19

#ifdef USE_INNER_EMMS
        emms
#endif

        /* At this point there are either 1 or 0 pixels remaining to be
         * converted.  Convert the last pixel, if needed.
         */

        testl   $1, %edx
        je      .L20

        DO_ONE_LAST_PIXEL()

.L20:
        popl    %ebx
        ret
        .size   _generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX


/**
 * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
 * instructions are only actually used to read data from the framebuffer.
 * In practice, the speed-up is pretty small.
 *
 * \todo
 * Do some more testing and determine if there's any reason to have this
 * function in addition to the MMX version.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_SSE
#ifndef USE_DRICORE
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
#endif
        .type   _generic_read_RGBA_span_BGRA8888_REV_SSE, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE:
        pushl   %esi
        pushl   %ebx
        pushl   %ebp

#ifdef USE_INNER_EMMS
        emms
#endif

        LOAD_MASK(movq,%mm1,%mm2)

        movl    16(%esp), %ebx  /* source pointer */
        movl    24(%esp), %edx  /* number of pixels to copy */
        movl    20(%esp), %ecx  /* destination pointer */

        testl   %edx, %edx
        jle     .L35            /* Bail if there's nothing to do. */

        movl    %esp, %ebp
        subl    $16, %esp
        andl    $0xfffffff0, %esp

        movl    %ebx, %eax
        movl    %edx, %esi

        negl    %eax
        andl    $15, %eax
        sarl    $2, %eax
        cmpl    %edx, %eax
        cmovle  %eax, %esi

        subl    %esi, %edx
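
        /* The sequence above computes, in C terms (a sketch, assuming the
         * source pointer is at least 4-byte aligned):
         *
         *     head  = ((0u - (uintptr_t)src) & 15) >> 2;
         *     head  = (head <= count) ? head : count;
         *     count = count - head;
         *
         * The head pixels are handled by the one- and two-pixel cases
         * below so that the movaps loads read from a 16-byte aligned
         * address.
         */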

        testl   $1, %esi
        je      .L32

        DO_ONE_PIXEL()
.L32:

        testl   $2, %esi
        je      .L31

        movq    (%ebx), %mm0
        addl    $8, %ebx

        movq    %mm0, %mm3
        movq    %mm0, %mm4

        pand    %mm2, %mm3
        psllq   $16, %mm4
        psrlq   $16, %mm3
        pand    %mm2, %mm4

        pand    %mm1, %mm0
        por     %mm4, %mm3
        por     %mm3, %mm0

        movq    %mm0, (%ecx)
        addl    $8, %ecx
.L31:

        movl    %edx, %eax
        shrl    $2, %eax
        jmp     .L33
.L34:
        movaps  (%ebx), %xmm0
        addl    $16, %ebx

        /* This would be so much better if we could just move directly from
         * an SSE register to an MMX register.  Unfortunately, that
         * functionality wasn't introduced until SSE2 with the MOVDQ2Q
         * instruction.
         */
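        /* With SSE2 the bounce through the stack below could instead be
         * written as (a sketch; not used here so this path stays SSE1-only):
         *
         *     movdq2q  %xmm0, %mm0        - low 8 bytes
         *     psrldq   $8, %xmm0
         *     movdq2q  %xmm0, %mm5        - high 8 bytes
         */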

        movaps  %xmm0, (%esp)
        movq    (%esp), %mm0
        movq    8(%esp), %mm5

        movq    %mm0, %mm3
        movq    %mm0, %mm4
        movq    %mm5, %mm6
        movq    %mm5, %mm7

        pand    %mm2, %mm3
        pand    %mm2, %mm6

        psllq   $16, %mm4
        psllq   $16, %mm7

        psrlq   $16, %mm3
        psrlq   $16, %mm6

        pand    %mm2, %mm4
        pand    %mm2, %mm7

        pand    %mm1, %mm0
        pand    %mm1, %mm5

        por     %mm4, %mm3
        por     %mm7, %mm6

        por     %mm3, %mm0
        por     %mm6, %mm5

        movq    %mm0, (%ecx)
        movq    %mm5, 8(%ecx)
        addl    $16, %ecx

        subl    $1, %eax
.L33:
        jne     .L34

#ifdef USE_INNER_EMMS
        emms
#endif
        movl    %ebp, %esp

        /* At this point there are either [0, 3] pixels remaining to be
         * converted.
         */

        testl   $2, %edx
        je      .L36

        movq    (%ebx), %mm0
        addl    $8, %ebx

        movq    %mm0, %mm3
        movq    %mm0, %mm4

        pand    %mm2, %mm3
        psllq   $16, %mm4
        psrlq   $16, %mm3
        pand    %mm2, %mm4

        pand    %mm1, %mm0
        por     %mm4, %mm3
        por     %mm3, %mm0

        movq    %mm0, (%ecx)
        addl    $8, %ecx
.L36:

        testl   $1, %edx
        je      .L35

        DO_ONE_LAST_PIXEL()
.L35:
        popl    %ebp
        popl    %ebx
        popl    %esi
        ret
        .size   _generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE


/**
 * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
 */

        .text
.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
#ifndef USE_DRICORE
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
#endif
        .type   _generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE2:
        pushl   %esi
        pushl   %ebx

        LOAD_MASK(movdqu,%xmm1,%xmm2)

        movl    12(%esp), %ebx  /* source pointer */
        movl    20(%esp), %edx  /* number of pixels to copy */
        movl    16(%esp), %ecx  /* destination pointer */

        movl    %ebx, %eax
        movl    %edx, %esi

        testl   %edx, %edx
        jle     .L46            /* Bail if there's nothing to do. */

        /* If the source pointer isn't a multiple of 16 we have to process
         * a few pixels the "slow" way to get the address aligned for
         * the SSE fetch instructions.
         */

        negl    %eax
        andl    $15, %eax
        sarl    $2, %eax

        cmpl    %edx, %eax
        cmovbe  %eax, %esi
        subl    %esi, %edx

        testl   $1, %esi
        je      .L41

        DO_ONE_PIXEL()
.L41:
        testl   $2, %esi
        je      .L40

        movq    (%ebx), %xmm0
        addl    $8, %ebx

        movdqa  %xmm0, %xmm3
        movdqa  %xmm0, %xmm4
        andps   %xmm1, %xmm0

        andps   %xmm2, %xmm3
        pslldq  $2, %xmm4
        psrldq  $2, %xmm3
        andps   %xmm2, %xmm4

        orps    %xmm4, %xmm3
        orps    %xmm3, %xmm0

        movq    %xmm0, (%ecx)
        addl    $8, %ecx
.L40:

        /* Would it be worth having a specialized version of this loop for
         * the case where the destination is 16-byte aligned?  That version
         * would be identical except that it could use movdqa instead of
         * movdqu.
         */

        movl    %edx, %eax
        shrl    $2, %eax
        jmp     .L42
.L43:
        movdqa  (%ebx), %xmm0
        addl    $16, %ebx

        movdqa  %xmm0, %xmm3
        movdqa  %xmm0, %xmm4
        andps   %xmm1, %xmm0

        andps   %xmm2, %xmm3
        pslldq  $2, %xmm4
        psrldq  $2, %xmm3
        andps   %xmm2, %xmm4

        orps    %xmm4, %xmm3
        orps    %xmm3, %xmm0

        movdqu  %xmm0, (%ecx)
        addl    $16, %ecx
        subl    $1, %eax
.L42:
        jne     .L43

        /* There may be up to 3 pixels remaining to be copied.  Take care
         * of them now.  We do the 2 pixel case first because the data
         * will be aligned.
         */

        testl   $2, %edx
        je      .L47

        movq    (%ebx), %xmm0
        addl    $8, %ebx

        movdqa  %xmm0, %xmm3
        movdqa  %xmm0, %xmm4
        andps   %xmm1, %xmm0

        andps   %xmm2, %xmm3
        pslldq  $2, %xmm4
        psrldq  $2, %xmm3
        andps   %xmm2, %xmm4

        orps    %xmm4, %xmm3
        orps    %xmm3, %xmm0

        movq    %xmm0, (%ecx)
        addl    $8, %ecx
.L47:

        testl   $1, %edx
        je      .L46

        DO_ONE_LAST_PIXEL()
.L46:

        popl    %ebx
        popl    %esi
        ret
        .size   _generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2



#define MASK_565_L      0x07e0f800
#define MASK_565_H      0x0000001f
/* Setting SCALE_ADJUST to 5 gives a perfect match with the
 * classic C implementation in Mesa.  Setting SCALE_ADJUST
 * to 0 is slightly faster but at a small cost to accuracy.
 */
#define SCALE_ADJUST    5
#if SCALE_ADJUST == 5
#define PRESCALE_L 0x00100001
#define PRESCALE_H 0x00000200
#define SCALE_L 0x40C620E8
#define SCALE_H 0x0000839d
#elif SCALE_ADJUST == 0
#define PRESCALE_L 0x00200001
#define PRESCALE_H 0x00000800
#define SCALE_L 0x01040108
#define SCALE_H 0x00000108
#else
#error SCALE_ADJUST must either be 5 or 0.
#endif
#define ALPHA_L 0x00000000
#define ALPHA_H 0x00ff0000
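
/* Worked example of the scaling pipeline for SCALE_ADJUST == 5, using the
 * red channel at full intensity (5-bit value 0x1f); the other components
 * work the same way with their own PRESCALE and SCALE words:
 *
 *     masked   = pixel & 0xf800                  = 0xf800
 *     adjusted = (masked * 0x0001) >> 5          = 0x07c0   (pmullw, psrlw)
 *     scaled   = (adjusted * 0x20e8) >> 16       = 0x00ff   (pmulhuw)
 *
 * i.e. the multiply/shift/multiply sequence maps [0, 0x1f] onto [0, 0xff].
 */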

/**
 * MMX optimized version of the RGB565 to RGBA copy routine.
 */

        .text
        .globl  _generic_read_RGBA_span_RGB565_MMX
#ifndef USE_DRICORE
        .hidden _generic_read_RGBA_span_RGB565_MMX
#endif
        .type   _generic_read_RGBA_span_RGB565_MMX, @function

_generic_read_RGBA_span_RGB565_MMX:

#ifdef USE_INNER_EMMS
        emms
#endif

        movl    4(%esp), %eax   /* source pointer */
        movl    8(%esp), %edx   /* destination pointer */
        movl    12(%esp), %ecx  /* number of pixels to copy */

        pushl   $MASK_565_H
        pushl   $MASK_565_L
        movq    (%esp), %mm5
        pushl   $PRESCALE_H
        pushl   $PRESCALE_L
        movq    (%esp), %mm6
        pushl   $SCALE_H
        pushl   $SCALE_L
        movq    (%esp), %mm7
        pushl   $ALPHA_H
        pushl   $ALPHA_L
        movq    (%esp), %mm3
        addl    $32,%esp

        sarl    $2, %ecx
        jl      .L01            /* Bail early if the count is negative. */
        jmp     .L02

.L03:
        /* Fetch 4 RGB565 pixels into %mm4.  Distribute the first and
         * second pixels into the four words of %mm0 and %mm2.
         */

        movq    (%eax), %mm4
        addl    $8, %eax

        pshufw  $0x00, %mm4, %mm0
        pshufw  $0x55, %mm4, %mm2
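
        /* Each 2-bit field of the pshufw immediate selects a source word:
         * 0x00 broadcasts word 0 (the first pixel) and 0x55 broadcasts
         * word 1 (the second pixel) into all four words of the result.
         */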

        /* Mask the pixels so that each word of each register contains only
         * one color component.
         */

        pand    %mm5, %mm0
        pand    %mm5, %mm2


        /* Adjust the component values so that they are as small as possible,
         * but large enough so that we can multiply them by an unsigned 16-bit
         * number and get a value as large as 0x00ff0000.
         */

        pmullw  %mm6, %mm0
        pmullw  %mm6, %mm2
#if SCALE_ADJUST > 0
        psrlw   $SCALE_ADJUST, %mm0
        psrlw   $SCALE_ADJUST, %mm2
#endif

        /* Scale the input component values to be on the range
         * [0, 0x00ff0000].  This is the real magic of the whole routine.
         */

        pmulhuw %mm7, %mm0
        pmulhuw %mm7, %mm2


        /* Always set the alpha value to 0xff.
         */

        por %mm3, %mm0
        por %mm3, %mm2


        /* Pack the 16-bit values to 8-bit values and store the converted
         * pixel data.
         */

        packuswb        %mm2, %mm0
        movq    %mm0, (%edx)
        addl    $8, %edx

        pshufw  $0xaa, %mm4, %mm0
        pshufw  $0xff, %mm4, %mm2

        pand    %mm5, %mm0
        pand    %mm5, %mm2
        pmullw  %mm6, %mm0
        pmullw  %mm6, %mm2
#if SCALE_ADJUST > 0
        psrlw   $SCALE_ADJUST, %mm0
        psrlw   $SCALE_ADJUST, %mm2
#endif
        pmulhuw %mm7, %mm0
        pmulhuw %mm7, %mm2

        por %mm3, %mm0
        por %mm3, %mm2

        packuswb        %mm2, %mm0

        movq    %mm0, (%edx)
        addl    $8, %edx

        subl    $1, %ecx
.L02:
        jne     .L03


        /* At this point there can be at most 3 pixels left to process.  If
         * there is either 2 or 3 left, process 2.
         */

        movl    12(%esp), %ecx
        testl   $0x02, %ecx
        je      .L04

        movd    (%eax), %mm4
        addl    $4, %eax

        pshufw  $0x00, %mm4, %mm0
        pshufw  $0x55, %mm4, %mm2

        pand    %mm5, %mm0
        pand    %mm5, %mm2
        pmullw  %mm6, %mm0
        pmullw  %mm6, %mm2
#if SCALE_ADJUST > 0
        psrlw   $SCALE_ADJUST, %mm0
        psrlw   $SCALE_ADJUST, %mm2
#endif
        pmulhuw %mm7, %mm0
        pmulhuw %mm7, %mm2

        por %mm3, %mm0
        por %mm3, %mm2

        packuswb        %mm2, %mm0

        movq    %mm0, (%edx)
        addl    $8, %edx

.L04:
        /* At this point there can be at most 1 pixel left to process.
         * Process it if needed.
         */

        testl   $0x01, %ecx
        je      .L01

        movzwl  (%eax), %ecx
        movd    %ecx, %mm4

        pshufw  $0x00, %mm4, %mm0

        pand    %mm5, %mm0
        pmullw  %mm6, %mm0
#if SCALE_ADJUST > 0
        psrlw   $SCALE_ADJUST, %mm0
#endif
        pmulhuw %mm7, %mm0

        por %mm3, %mm0

        packuswb        %mm0, %mm0

        movd    %mm0, (%edx)

.L01:
#ifdef USE_INNER_EMMS
        emms
#endif
        ret
#endif /* !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) */

#if defined (__ELF__) && defined (__linux__)
        .section .note.GNU-stack,"",%progbits
#endif