/*
 * (C) Copyright IBM Corporation 2004
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * \file read_rgba_span_x86.S
 * Optimized routines to transfer pixel data from the framebuffer to a
 * buffer in main memory.
 *
 * \author Ian Romanick <idr@us.ibm.com>
 */

        .file   "read_rgba_span_x86.S"
#if !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) /* this one cries for assyntax.h */
/* Kevin F. Quinn 2nd July 2006
 * Replaced data segment constants with text-segment instructions.
 */
#define LOAD_MASK(mvins,m1,m2) \
        pushl   $0xff00ff00 ;\
        pushl   $0xff00ff00 ;\
        pushl   $0xff00ff00 ;\
        pushl   $0xff00ff00 ;\
        mvins   (%esp), m1      ;\
        pushl   $0x00ff0000 ;\
        pushl   $0x00ff0000 ;\
        pushl   $0x00ff0000 ;\
        pushl   $0x00ff0000 ;\
        mvins   (%esp), m2      ;\
        addl    $32, %esp
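
/* A sketch of what LOAD_MASK leaves in the two registers, in C terms
 * (64-bit MMX case; the SSE2 variant is the same pattern at 128 bits):
 *
 *   uint64_t m1 = 0xff00ff00ff00ff00ULL;  // keeps the A and G bytes in place
 *   uint64_t m2 = 0x00ff000000ff0000ULL;  // masks the R/B lanes being swapped
 *
 * Building the constants with pushl keeps them in the text segment, so the
 * code needs no data-segment relocations (see the note above).
 */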

/* I implemented these as macros because they appear in several places,
 * and I've tweaked them a number of times.  I got tired of changing every
 * place they appear. :)
 */

#define DO_ONE_PIXEL() \
        movl    (%ebx), %eax ; \
        addl    $4, %ebx ; \
        bswap   %eax          /* ARGB -> BGRA */ ; \
        rorl    $8, %eax      /* BGRA -> ABGR */ ; \
        movl    %eax, (%ecx)  /* ABGR -> R, G, B, A */ ; \
        addl    $4, %ecx
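
/* DO_ONE_PIXEL in C, assuming a little-endian load so that a B,G,R,A pixel
 * in memory reads as 0xAARRGGBB in a register (a sketch, using a GCC
 * builtin for the byte swap):
 *
 *   uint32_t p = *src++;        // 0xAARRGGBB
 *   p = __builtin_bswap32(p);   // bswap: 0xBBGGRRAA
 *   p = (p >> 8) | (p << 24);   // rorl $8: 0xAABBGGRR
 *   *dst++ = p;                 // stored as R, G, B, A in memory
 */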

#define DO_ONE_LAST_PIXEL() \
        movl    (%ebx), %eax ; \
        bswap   %eax          /* ARGB -> BGRA */ ; \
        rorl    $8, %eax      /* BGRA -> ABGR */ ; \
        movl    %eax, (%ecx)  /* ABGR -> R, G, B, A */


/**
 * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */
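
/* The C prototype inferred from the stack offsets used below (an
 * assumption, not taken from a header; the SSE and SSE2 variants use the
 * same argument order):
 *
 *   void _generic_read_RGBA_span_BGRA8888_REV_MMX(const uint32_t *src,
 *                                                 uint32_t *dst,
 *                                                 unsigned count);
 */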

.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
.hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
        .type   _generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
        pushl   %ebx

#ifdef USE_INNER_EMMS
        emms
#endif
        LOAD_MASK(movq,%mm1,%mm2)

        movl    8(%esp), %ebx   /* source pointer */
        movl    16(%esp), %edx  /* number of pixels to copy */
        movl    12(%esp), %ecx  /* destination pointer */

        testl   %edx, %edx
        jle     .L20            /* Bail if there's nothing to do. */

        movl    %ebx, %eax

        negl    %eax
        sarl    $2, %eax
        andl    $1, %eax
        je      .L17

        subl    %eax, %edx
        DO_ONE_PIXEL()
.L17:

        /* Would it be faster to unroll this loop once and process 4 pixels
         * per pass, instead of just two?
         */

        movl    %edx, %eax
        shrl    %eax
        jmp     .L18
.L19:
        movq    (%ebx), %mm0
        addl    $8, %ebx

        /* These 9 instructions do what PSHUFB (if there were such an
         * instruction) could do in 1. :(
         */
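        /* In C terms, the sequence amounts to this per 32-bit pixel, with
         * m1/m2 as loaded by LOAD_MASK (a sketch):
         *
         *   p = (p & 0xff00ff00)          // keep A and G in place
         *     | ((p & 0x00ff0000) >> 16)  // move R down to byte 0
         *     | ((p & 0x000000ff) << 16); // move B up to byte 2
         */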

        movq    %mm0, %mm3
        movq    %mm0, %mm4

        pand    %mm2, %mm3
        psllq   $16, %mm4
        psrlq   $16, %mm3
        pand    %mm2, %mm4

        pand    %mm1, %mm0
        por     %mm4, %mm3
        por     %mm3, %mm0

        movq    %mm0, (%ecx)
        addl    $8, %ecx
        subl    $1, %eax
.L18:
        jne     .L19

#ifdef USE_INNER_EMMS
        emms
#endif

        /* At this point there are either 1 or 0 pixels remaining to be
         * converted.  Convert the last pixel, if needed.
         */

        testl   $1, %edx
        je      .L20

        DO_ONE_LAST_PIXEL()

.L20:
        popl    %ebx
        ret
        .size   _generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX


/**
 * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
 * instructions are only actually used to read data from the framebuffer.
 * In practice, the speed-up is pretty small.
 *
 * \todo
 * Do some more testing and determine if there's any reason to have this
 * function in addition to the MMX version.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_SSE
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
        .type   _generic_read_RGBA_span_BGRA8888_REV_SSE, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE:
        pushl   %esi
        pushl   %ebx
        pushl   %ebp

#ifdef USE_INNER_EMMS
        emms
#endif

        LOAD_MASK(movq,%mm1,%mm2)

        movl    16(%esp), %ebx  /* source pointer */
        movl    24(%esp), %edx  /* number of pixels to copy */
        movl    20(%esp), %ecx  /* destination pointer */

        testl   %edx, %edx
        jle     .L35            /* Bail if there's nothing to do. */

        movl    %esp, %ebp
        subl    $16, %esp
        andl    $0xfffffff0, %esp

        movl    %ebx, %eax
        movl    %edx, %esi

        negl    %eax
        andl    $15, %eax
        sarl    $2, %eax
        cmpl    %edx, %eax
        cmovle  %eax, %esi

        subl    %esi, %edx

        testl   $1, %esi
        je      .L32

        DO_ONE_PIXEL()
.L32:

        testl   $2, %esi
        je      .L31

        movq    (%ebx), %mm0
        addl    $8, %ebx

        movq    %mm0, %mm3
        movq    %mm0, %mm4

        pand    %mm2, %mm3
        psllq   $16, %mm4
        psrlq   $16, %mm3
        pand    %mm2, %mm4

        pand    %mm1, %mm0
        por     %mm4, %mm3
        por     %mm3, %mm0

        movq    %mm0, (%ecx)
        addl    $8, %ecx
.L31:

        movl    %edx, %eax
        shrl    $2, %eax
        jmp     .L33
.L34:
        movaps  (%ebx), %xmm0
        addl    $16, %ebx

        /* This would be so much better if we could just move directly from
         * an SSE register to an MMX register.  Unfortunately, that
         * functionality wasn't introduced until SSE2 with the MOVDQ2Q
         * instruction.
         */

        movaps  %xmm0, (%esp)
        movq    (%esp), %mm0
        movq    8(%esp), %mm5

        movq    %mm0, %mm3
        movq    %mm0, %mm4
        movq    %mm5, %mm6
        movq    %mm5, %mm7

        pand    %mm2, %mm3
        pand    %mm2, %mm6

        psllq   $16, %mm4
        psllq   $16, %mm7

        psrlq   $16, %mm3
        psrlq   $16, %mm6

        pand    %mm2, %mm4
        pand    %mm2, %mm7

        pand    %mm1, %mm0
        pand    %mm1, %mm5

        por     %mm4, %mm3
        por     %mm7, %mm6

        por     %mm3, %mm0
        por     %mm6, %mm5

        movq    %mm0, (%ecx)
        movq    %mm5, 8(%ecx)
        addl    $16, %ecx

        subl    $1, %eax
.L33:
        jne     .L34

#ifdef USE_INNER_EMMS
        emms
#endif
        movl    %ebp, %esp

        /* At this point there are either [0, 3] pixels remaining to be
         * converted.
         */

        testl   $2, %edx
        je      .L36

        movq    (%ebx), %mm0
        addl    $8, %ebx

        movq    %mm0, %mm3
        movq    %mm0, %mm4

        pand    %mm2, %mm3
        psllq   $16, %mm4
        psrlq   $16, %mm3
        pand    %mm2, %mm4

        pand    %mm1, %mm0
        por     %mm4, %mm3
        por     %mm3, %mm0

        movq    %mm0, (%ecx)
        addl    $8, %ecx
.L36:

        testl   $1, %edx
        je      .L35

        DO_ONE_LAST_PIXEL()
.L35:
        popl    %ebp
        popl    %ebx
        popl    %esi
        ret
        .size   _generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE


/**
 * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
 */

        .text
.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
        .type   _generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE2:
        pushl   %esi
        pushl   %ebx

        LOAD_MASK(movdqu,%xmm1,%xmm2)

        movl    12(%esp), %ebx  /* source pointer */
        movl    20(%esp), %edx  /* number of pixels to copy */
        movl    16(%esp), %ecx  /* destination pointer */

        movl    %ebx, %eax
        movl    %edx, %esi

        testl   %edx, %edx
        jle     .L46            /* Bail if there's nothing to do. */

        /* If the source pointer isn't a multiple of 16 we have to process
         * a few pixels the "slow" way to get the address aligned for
         * the SSE fetch instructions.
         */
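
        /* In C terms the alignment computation below is roughly (variable
         * names are illustrative only):
         *
         *   unsigned lead = ((0u - (uintptr_t) src) & 15) >> 2;
         *   if (lead > count)   // cmovbe keeps the smaller value
         *       lead = count;
         *   count -= lead;      // lead pixels are handled one/two at a time
         */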

        negl    %eax
        andl    $15, %eax
        sarl    $2, %eax

        cmpl    %edx, %eax
        cmovbe  %eax, %esi
        subl    %esi, %edx

        testl   $1, %esi
        je      .L41

        DO_ONE_PIXEL()
.L41:
        testl   $2, %esi
        je      .L40

        movq    (%ebx), %xmm0
        addl    $8, %ebx

        movdqa  %xmm0, %xmm3
        movdqa  %xmm0, %xmm4
        andps   %xmm1, %xmm0

        andps   %xmm2, %xmm3
        pslldq  $2, %xmm4
        psrldq  $2, %xmm3
        andps   %xmm2, %xmm4

        orps    %xmm4, %xmm3
        orps    %xmm3, %xmm0

        movq    %xmm0, (%ecx)
        addl    $8, %ecx
.L40:

        /* Would it be worth having a specialized version of this loop for
         * the case where the destination is 16-byte aligned?  That version
         * would be identical except that it could use movdqa instead of
         * movdqu.
         */

        movl    %edx, %eax
        shrl    $2, %eax
        jmp     .L42
.L43:
        movdqa  (%ebx), %xmm0
        addl    $16, %ebx

        movdqa  %xmm0, %xmm3
        movdqa  %xmm0, %xmm4
        andps   %xmm1, %xmm0

        andps   %xmm2, %xmm3
        pslldq  $2, %xmm4
        psrldq  $2, %xmm3
        andps   %xmm2, %xmm4

        orps    %xmm4, %xmm3
        orps    %xmm3, %xmm0

        movdqu  %xmm0, (%ecx)
        addl    $16, %ecx
        subl    $1, %eax
.L42:
        jne     .L43


        /* There may be up to 3 pixels remaining to be copied.  Take care
         * of them now.  We do the 2 pixel case first because the data
         * will be aligned.
         */

        testl   $2, %edx
        je      .L47

        movq    (%ebx), %xmm0
        addl    $8, %ebx

        movdqa  %xmm0, %xmm3
        movdqa  %xmm0, %xmm4
        andps   %xmm1, %xmm0

        andps   %xmm2, %xmm3
        pslldq  $2, %xmm4
        psrldq  $2, %xmm3
        andps   %xmm2, %xmm4

        orps    %xmm4, %xmm3
        orps    %xmm3, %xmm0

        movq    %xmm0, (%ecx)
        addl    $8, %ecx
.L47:

        testl   $1, %edx
        je      .L46

        DO_ONE_LAST_PIXEL()
.L46:

        popl    %ebx
        popl    %esi
        ret
        .size   _generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2



#define MASK_565_L      0x07e0f800
#define MASK_565_H      0x0000001f
/* Setting SCALE_ADJUST to 5 gives a perfect match with the
 * classic C implementation in Mesa.  Setting SCALE_ADJUST
 * to 0 is slightly faster but at a small cost to accuracy.
 */
#define SCALE_ADJUST    5
#if SCALE_ADJUST == 5
#define PRESCALE_L 0x00100001
#define PRESCALE_H 0x00000200
#define SCALE_L 0x40C620E8
#define SCALE_H 0x0000839d
#elif SCALE_ADJUST == 0
#define PRESCALE_L 0x00200001
#define PRESCALE_H 0x00000800
#define SCALE_L 0x01040108
#define SCALE_H 0x00000108
#else
#error SCALE_ADJUST must either be 5 or 0.
#endif
#define ALPHA_L 0x00000000
#define ALPHA_H 0x00ff0000
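
/* Worked example of the two-multiply trick for the 5-bit red channel with
 * SCALE_ADJUST == 5 (the green and blue channels follow the same pattern
 * with their own PRESCALE/SCALE words):
 *
 *   after pand:     word0 = r << 11           (red bits of the 565 pixel)
 *   after pmullw:   (r << 11) * 0x0001        (PRESCALE leaves red alone)
 *   after psrlw 5:  r << 6
 *   after pmulhuw:  ((r << 6) * 0x20E8) >> 16 == (r * 8424) >> 10
 *
 * (r * 8424) >> 10 is r * 8.23, which matches r * 255/31 to within
 * rounding; r = 31 yields exactly 255, so each field expands to the full
 * [0, 255] range.
 */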

/**
 * MMX optimized version of the RGB565 to RGBA copy routine.
 */
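
/* Inferred C prototype, from the stack offsets read below (an assumption,
 * not taken from a header):
 *
 *   void _generic_read_RGBA_span_RGB565_MMX(const uint16_t *src,
 *                                           uint32_t *dst,
 *                                           unsigned count);
 */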

        .text
        .globl  _generic_read_RGBA_span_RGB565_MMX
        .hidden _generic_read_RGBA_span_RGB565_MMX
        .type   _generic_read_RGBA_span_RGB565_MMX, @function

_generic_read_RGBA_span_RGB565_MMX:

#ifdef USE_INNER_EMMS
        emms
#endif

        movl    4(%esp), %eax   /* source pointer */
        movl    8(%esp), %edx   /* destination pointer */
        movl    12(%esp), %ecx  /* number of pixels to copy */

        pushl   $MASK_565_H
        pushl   $MASK_565_L
        movq    (%esp), %mm5
        pushl   $PRESCALE_H
        pushl   $PRESCALE_L
        movq    (%esp), %mm6
        pushl   $SCALE_H
        pushl   $SCALE_L
        movq    (%esp), %mm7
        pushl   $ALPHA_H
        pushl   $ALPHA_L
        movq    (%esp), %mm3
        addl    $32,%esp

        sarl    $2, %ecx
        jl      .L01            /* Bail early if the count is negative. */
        jmp     .L02

.L03:
        /* Fetch 4 RGB565 pixels into %mm4.  Distribute the first and
         * second pixels into the four words of %mm0 and %mm2.
         */
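        /* pshufw with selector 0x00 broadcasts word 0 of the source into
         * all four words of the destination (0x55 picks word 1, 0xaa
         * word 2, 0xff word 3).  In C terms, roughly:
         *
         *   uint64_t bcast = pix565 * 0x0001000100010001ULL;
         */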

        movq    (%eax), %mm4
        addl    $8, %eax

        pshufw  $0x00, %mm4, %mm0
        pshufw  $0x55, %mm4, %mm2


        /* Mask the pixels so that each word of each register contains only
         * one color component.
         */

        pand    %mm5, %mm0
        pand    %mm5, %mm2


        /* Adjust the component values so that they are as small as possible,
         * but large enough so that we can multiply them by an unsigned 16-bit
         * number and get a value as large as 0x00ff0000.
         */

        pmullw  %mm6, %mm0
        pmullw  %mm6, %mm2
#if SCALE_ADJUST > 0
        psrlw   $SCALE_ADJUST, %mm0
        psrlw   $SCALE_ADJUST, %mm2
#endif

        /* Scale the input component values to be in the range
         * [0, 0x00ff0000].  This is the real magic of the whole routine.
         */

        pmulhuw %mm7, %mm0
        pmulhuw %mm7, %mm2


        /* Always set the alpha value to 0xff.
         */

        por %mm3, %mm0
        por %mm3, %mm2


        /* Pack the 16-bit values to 8-bit values and store the converted
         * pixel data.
         */

        packuswb        %mm2, %mm0
        movq    %mm0, (%edx)
        addl    $8, %edx

        pshufw  $0xaa, %mm4, %mm0
        pshufw  $0xff, %mm4, %mm2

        pand    %mm5, %mm0
        pand    %mm5, %mm2
        pmullw  %mm6, %mm0
        pmullw  %mm6, %mm2
#if SCALE_ADJUST > 0
        psrlw   $SCALE_ADJUST, %mm0
        psrlw   $SCALE_ADJUST, %mm2
#endif
        pmulhuw %mm7, %mm0
        pmulhuw %mm7, %mm2

        por %mm3, %mm0
        por %mm3, %mm2

        packuswb        %mm2, %mm0

        movq    %mm0, (%edx)
        addl    $8, %edx

        subl    $1, %ecx
.L02:
        jne     .L03


        /* At this point there can be at most 3 pixels left to process.  If
         * there is either 2 or 3 left, process 2.
         */

        movl    12(%esp), %ecx
        testl   $0x02, %ecx
        je      .L04

        movd    (%eax), %mm4
        addl    $4, %eax

        pshufw  $0x00, %mm4, %mm0
        pshufw  $0x55, %mm4, %mm2

        pand    %mm5, %mm0
        pand    %mm5, %mm2
        pmullw  %mm6, %mm0
        pmullw  %mm6, %mm2
#if SCALE_ADJUST > 0
        psrlw   $SCALE_ADJUST, %mm0
        psrlw   $SCALE_ADJUST, %mm2
#endif
        pmulhuw %mm7, %mm0
        pmulhuw %mm7, %mm2

        por %mm3, %mm0
        por %mm3, %mm2

        packuswb        %mm2, %mm0

        movq    %mm0, (%edx)
        addl    $8, %edx

.L04:
        /* At this point there can be at most 1 pixel left to process.
         * Process it if needed.
         */

        testl   $0x01, %ecx
        je      .L01

        movzwl  (%eax), %ecx
        movd    %ecx, %mm4

        pshufw  $0x00, %mm4, %mm0

        pand    %mm5, %mm0
        pmullw  %mm6, %mm0
#if SCALE_ADJUST > 0
        psrlw   $SCALE_ADJUST, %mm0
#endif
        pmulhuw %mm7, %mm0

        por %mm3, %mm0

        packuswb        %mm0, %mm0

        movd    %mm0, (%edx)

.L01:
#ifdef USE_INNER_EMMS
        emms
#endif
        ret
#endif /* !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) */

#if defined (__ELF__) && defined (__linux__)
        .section .note.GNU-stack,"",%progbits
#endif