WebSVN – Kolibri OS – Path Comparison – / – /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/ Rev 3768 and /drivers/video/i965/shaders/post_processing/gen5_6/Core

Regard whitespace Rev 3768 → Rev 3769

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/AVS_IEF.inc
 ,0 → 1,108
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: AVS_IEF.inc - message register aliases, sampler message descriptors, channel masks and response declares for the AVS/IEF kernels
+#ifndef _AVS_INF_INC_             // Include guard (note: the guard name spells "INF", not "IEF")
+#define _AVS_INF_INC_
+#include "undefall.inc"             //Undefine the SRC and DEST symbols
+        // Message Header (bit layout of the header register m0)
+        // m0.7         31:0    Debug
+        // m0.6         31:0    Debug
+        // m0.5         31:0    Ignored
+        // m0.4         31:0    Ignored
+        // m0.3         31:0    Ignored
+        // m0.2         31:16   Ignored
+        //              15      Alpha Write Channel Mask        enable=0, disable=1
+        //              14      Blue Write Channel Mask  (V)
+        //              13      Green Write Channel Mask (Y)
+        //              12      Red Write Channel Mask   (U)
+        //              11:0    Ignored
+        // m0.1                 Ignored
+        // m0.0                 Ignored
+#define mAVS_8x8_HDR   m0               // Message Header
+#define mAVS_PAYLOAD   m1               // Message Payload Header
+#define mAVS_8x8_HDR_2   m2               // Message Header (m2, the same register as mAVS_8x8_HDR_UV)
+#define mAVS_PAYLOAD_2   m3               // Message Payload Header (m3, the same register as mAVS_PAYLOAD_UV)
+#define mAVS_8x8_HDR_UV   m2               // Message Header (m2, the same register as mAVS_8x8_HDR_2)
+#define mAVS_PAYLOAD_UV   m3               // Message Payload Header (m3, the same register as mAVS_PAYLOAD_2)
+#define rAVS_8x8_HDR   rMSGSRC          // Mirror of Message Header
+#define rAVS_PAYLOAD   r9               // Mirror of Message Payload Header
+        // AVS payload (element order matches the writes in AVS_SetupFirstBlock.asm / AVS_SetupSecondBlock.asm)
+        // m1.7                 Ignored
+        // m1.6                 Pixel 0 V Address       ---> ORIY (Y0)
+        // m1.5                 Delta V                 ---> Step Y
+        // m1.4                 Ignored
+        // m1.3                 Ignored
+        // m1.2                 Pixel 0 U Address       ---> ORIX (X0)
+        // m1.1                 Delta U                 ---> Step X
+        // m1.0                 U 2nd Derivative        ---> NLAS dx
+        // Sampler Message Descriptor
+        // 31:29        Reserved                        000
+        // 28:25        Message length                  0010
+        // 24:20        Response length                 xxxxx   ---> 4GRFs for each enabled channel
+        // 19           Header Present                  1
+        // 18           MBZ                             0
+        // 17:16        SIMD Mode                       11      ---> SIMD64
+        // 15:12        Message Type                    1011    ---> sample_8x8 (0xB, as encoded in the descriptors below)
+        // 11:8         Sampler Index                   xxxx
+        // 7:0          Binding Table Index             xxxxxxxx
+#define nAVS_MSG_DSC_1CH        0x044BB000      // Response length field = 4
+#define nAVS_MSG_DSC_2CH        0x048BB000      // Response length field = 8
+#define nAVS_MSG_DSC_3CH        0x04CBB000      // Response length field = 12
+#define nAVS_MSG_DSC_4CH        0x050BB000      // Response length field = 16
+#define nAVS_RED_CHANNEL_ONLY   0x0000E000      // Enable Red channel only (mask bits 15:13 set = A, B, G disabled)
+#define nAVS_GREEN_CHANNEL_ONLY 0x0000D000      // Enable Green channel only (A, B, R disabled)
+#define nAVS_RED_BLUE_CHANNELS  0x0000A000      // Enable Red and Blue channels (A, G disabled)
+#define nAVS_RGB_CHANNELS       0x00008000      // Enable RGB(YUV) channels (A disabled)
+#define nAVS_ALL_CHANNELS       0x00000000      // Enable all channels (ARGB\AYUV)
+.declare     ubAVS_RESPONSE  Base=REG(r,nTEMP8) ElementSize=1  SrcRegion=REGION(16,1) Type=ub     // Byte view of the registers starting at nTEMP8
+.declare     uwAVS_RESPONSE  Base=REG(r,nTEMP8) ElementSize=2  SrcRegion=REGION(16,1) Type=uw     // Word view of the same registers
+.declare     ubAVS_RESPONSE_2  Base=REG(r,nTEMP24) ElementSize=1  SrcRegion=REGION(16,1) Type=ub     // Byte view of the registers starting at nTEMP24
+.declare     uwAVS_RESPONSE_2  Base=REG(r,nTEMP24) ElementSize=2  SrcRegion=REGION(16,1) Type=uw     // Word view of the same registers
+#if (nSRC_REGION==nREGION_2)     // Region 2 selected: the DEST aliases name the BOT variables
+    #define uwDEST_Y        uwBOT_Y
+    #define uwDEST_U        uwBOT_U
+    #define uwDEST_V        uwBOT_V
+    #define ubDEST_Y        ubBOT_Y
+    #undef  nSRC_REGION
+    #define nSRC_REGION nREGION_2
+#else //(nSRC_REGION==nREGION_1) - any other value: the DEST aliases name the TOP variables
+    #define uwDEST_Y        uwTOP_Y
+    #define uwDEST_U        uwTOP_U
+    #define uwDEST_V        uwTOP_V
+    #define ubDEST_Y        ubTOP_Y
+    #undef  nSRC_REGION
+    #define nSRC_REGION     nREGION_1
+#endif
+#endif //_AVS_INF_INC_

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/AVS_SetupFirstBlock.asm
 ,0 → 1,35
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+//------------------------------------------------------------------------------
+// AVS_SetupFirstBlock.asm - fills the payload mirror (rAVS_PAYLOAD) for the 1st Media Sampler 8x8 block
+//------------------------------------------------------------------------------
+    // Setup Message Header (the header copy below is currently commented out)
+//    mov (8) mAVS_8x8_HDR<1>:ud      rMSGSRC<8;8,1>:ud
+    // Check  NLAS Enable bit: uwNLAS_ENABLE is masked with BIT15, and the .z modifier sets f0.0 when the result is zero
+    and.z.f0.0  (1)     wNULLREG                uwNLAS_ENABLE:uw        BIT15:uw
+    (f0.0)mov   (1) fVIDEO_STEP_DELTA:f     0.0:f     // Enable bit clear: force the step delta to 0.0
+    // Setup Message Payload Header for 1st block of Media Sampler 8x8
+    mov (1) rAVS_PAYLOAD.0:f        fVIDEO_STEP_DELTA:f     //NLAS dx
+    mov (1) rAVS_PAYLOAD.1:f        fVIDEO_STEP_X:f         //Step X
+    mov (1) rAVS_PAYLOAD.5:f        fVIDEO_STEP_Y:f         //Step Y
+    mov (2) rAVS_PAYLOAD.2<4>:f     fSRC_VID_H_ORI<2;2,1>:f //Orig X and Y: two consecutive floats written with destination stride 4 (elements .2 and .6)

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/AVS_SetupSecondBlock.asm
 ,0 → 1,27
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+//------------------------------------------------------------------------------
+// AVS_SetupSecondBlock.asm - advances the payload mirror (rAVS_PAYLOAD) by 8 steps for the 2nd Media Sampler 8x8 block
+//------------------------------------------------------------------------------
+    //NLAS calculations for 2nd block of Media Sampler 8x8:
+    // X(i) = X0 + dx*i + ddx*i*(i-1)/2   ==>  X(8) = X0 + dx*8 +ddx*28
+    // dx(i)= dx(0) + ddx*i               ==>  dx(8)= dx + ddx*8
+    // Calculating X(8): accumulate X0 and dx*8, then add ddx*28 while writing the payload
+    mov (1)   acc0.2<1>:f           fSRC_VID_H_ORI:f                    // accumulator = X0
+    mac (1)   acc0.2<1>:f           fVIDEO_STEP_X:f          8.0:f      // accumulator += dx*8
+    mac (1)   rAVS_PAYLOAD.2:f      fVIDEO_STEP_DELTA:f      28.0:f     // payload element 2 (X origin) = accumulator + ddx*28
+    // Calculating dx(8)
+    mov (1)   acc0.1<1>:f           fVIDEO_STEP_X:f                     // accumulator = dx
+    mac (1)   rAVS_PAYLOAD.1:f      fVIDEO_STEP_DELTA:f      8.0:f      // payload element 1 (Step X) = accumulator + ddx*8

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/DI.inc
 ,0 → 1,194
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: DI.inc
+// The build targets GT when GT is defined and ILK otherwise.
+#ifdef GT
+// GT DI Kernel
+#else // ILK
+// ILK DI Kernel
+#endif
+//---------------------------------------------------------------------------
+// Binding table indices (listed in ascending index order)
+//---------------------------------------------------------------------------
+#define nBIDX_DI_Source         4       // Source Surface
+#define nBIDX_DN                7       // Denoised frame
+#define nBIDX_DI_PRV            10      // Previous DI-ed frame
+#define nBIDX_DI_CUR            13      // Current DI-ed frame
+#define nBIDX_STAT              20      // Statistics
+//---------------------------------------------------------------------------
+// Message descriptors
+//---------------------------------------------------------------------------
+// Extended message descriptor values
+#define nSMPL_ENGINE            0x2
+#define nDATAPORT_WRITE         0x5
+#define nTS_EOT                 0x27            // with End-Of-Thread bit ON
+// Message descriptor for end-of-thread
+//   = 000 0001 (message len) 00000 (resp len)
+//     0 (header present 0) 00000000000000 0 (URB dereferenced) 0000
+#define nEOT_MSGDSC             0x02000000
+// Message descriptor for sampler read
+//   = 000 0010 (message len 2) 00000 (resp len - set later, 12 or 5 or 11)
+//     1 (header present 1) 0 11 (SIMD32/64 mode)
+//     1000 (message type) 0000 (DI state index)
+//     00000000 (binding table index - set later)
+//   = 0x040b8000
+// comment begin
+// The following is commented out because of walker feature
+// It corresponds to the #ifdef GT #else and #endif
+//#define nSMPL_MSGDSC              0x040b8000
+//#define nSMPL_RESP_LEN_DI         0x00c00000          // 12
+//#define nSMPL_RESP_LEN_NODI_PL  0x00500000            // 5
+//#define nSMPL_RESP_LEN_NODI_PA  0x00900000            // 9
+//#define nSMPL_RESP_LEN_NODN       0x00900000          // 9
+//#define nSMPL_RESP_LEN_PDI        0x00b00000          // 11
+// comment end
+// Sampler descriptor and response lengths that are identical for GT and ILK.
+// The response length values are the GRF count placed in descriptor bits 24:20.
+#define nSMPL_MSGDSC            0x040b8000
+#define nSMPL_RESP_LEN_DI       0x00c00000      // 12
+#define nSMPL_RESP_LEN_NODI_PL  0x00500000      // 5  (GT: DI disable, the XY stored in 5th GRF, no impact to return length)
+#define nSMPL_RESP_LEN_NODI_PA  0x00900000      // 9  (GT: DI disable, the XY stored in 5th GRF, no impact to return length)
+#define nSMPL_RESP_LEN_PDI      0x00b00000      // 11
+// Only the no-denoise response length depends on the target.
+#ifdef GT
+#define nSMPL_RESP_LEN_NODN     0x00a00000      // 10 - NO DN, originally 9; 10 is needed to store the XY with walker
+#else
+#define nSMPL_RESP_LEN_NODN     0x00900000      // 9
+#endif
+// Message descriptor for dataport media write (target specific encoding)
+#ifdef GT
+//   = 000 0000 (message len - set later) 00000 (resp len 0)
+//     1 (header present 1) 0 0 1010 (media block write) 00000
+//     00000000 (binding table index - set later)
+//   = 0x00094000
+#define nDPMW_MSGDSC            0x00094000
+#else // ILK
+//   = 000 0000 (message len - set later) 00000 (resp len 0)
+//     1 (header present 1) 000 0 010 (media block write) 0000
+//     00000000 (binding table index - set later)
+//   = 0x00082000
+#define nDPMW_MSGDSC            0x00082000
+#endif
+// Message length values for the media write (length placed in descriptor bits 28:25),
+// listed from the shortest message to the longest.
+#define nDPMW_MSG_LEN_STMM      0x04000000      // 2 - STMM
+#define nDPMW_MSG_LEN_DH        0x04000000      // 2 - Denoise history
+#define nDPMW_MSG_LEN_PL_DN     0x06000000      // 3 - Denoised output
+#define nDPMW_MSG_LEN_PA_DN     0x0a000000      // 5 - Denoised output
+#define nDPMW_MSG_LEN_PL_NODI   0x0a000000      // 5 - Denoised output - denoise only - DI disabled
+#define nDPMW_MSG_LEN_DI        0x0a000000      // 5 - DI output
+#define nDPMW_MSG_LEN_PA_NODI   0x12000000      // 9 - Denoised output - denoise only - DI disabled
+//---------------------------------------------------------------------------
+// Static and inline parameters
+//---------------------------------------------------------------------------
+// Static parameters (all declared on r1)
+.declare ubTFLD_FIRST           Base=r1.27      ElementSize=1 Type=ub   // top field first
+.declare ubSRCYUVOFFSET         Base=r1.4       ElementSize=1 Type=ub   // source packed format
+.declare ubDSTYUVOFFSET         Base=r1.8       ElementSize=1 Type=ub   // destination packed format
+.declare uwSPITCH_DIV2          Base=r1.10      ElementSize=2 Type=uw   // statistics surface pitch divided by 2
+// Inline parameters (declared on r5)
+.declare uwXORIGIN                      Base=r5.0       ElementSize=2 Type=uw   // X origin
+.declare uwYORIGIN                      Base=r5.1       ElementSize=2 Type=uw   // Y origin
+//---------------------------------------------------------------------------
+// Kernel GRF variables
+//---------------------------------------------------------------------------
+// Message response (Denoised & DI-ed pixels & statistics); several names below alias the same base register
+.declare dRESP                                          Base=r8         ElementSize=4 Type=d    // Response message (12 or 5 or 11)
+.declare ubRESP                                         Base=r8         ElementSize=1 Type=ub   // Byte view of the response message
+.declare dSTMM                                          Base=r16        ElementSize=4 Type=d    // STMM
+.declare ubDN_HIST_NODI         Base=r12        ElementSize=1 Type=ub   // Denoise history data (DI disabled)
+.declare ubDN_HIST_DI                   Base=r17        ElementSize=1 Type=ub   // Denoise history data (DI enabled)
+.declare uwRETURNED_POSITION_DI Base=r17        ElementSize=2 Type=uw   // XY_Return_Data (DI enabled)
+.declare uwRETURNED_POSITION_DN Base=r12        ElementSize=2 Type=uw // XY_Return_Data (DI disabled)
+.declare ub1ST_FLD_DN                   Base=r12        ElementSize=1 Type=ub   // 1st field Denoised data (DI enabled)
+.declare d1ST_FLD_DN                    Base=r12        ElementSize=4 Type=d    // Dword view of ub1ST_FLD_DN
+.declare ub2ND_FLD_DN                   Base=r18        ElementSize=1 Type=ub   // 2nd field Denoised data (DI enabled)
+.declare d2ND_FLD_DN                    Base=r18        ElementSize=4 Type=d    // Dword view of ub2ND_FLD_DN
+.declare ubPRV_DI                                       Base=r8         ElementSize=1 Type=ub   // Previous frame DI (DI enabled)
+.declare ubCUR_DI                                       Base=r12        ElementSize=1 Type=ub   // Current frame DI (DI enabled)
+// Packed denoised output
+.declare ubDN_YUV                                       Base=r22        ElementSize=1 Type=ub   // Denoised YUV422
+.declare dDN_YUV                                        Base=r22        ElementSize=4 Type=d    // Dword view of ubDN_YUV
+#define  npDN_YUV                       704                                                                     // = 22*32 = 0x2C0
+// Packed DI output
+.declare dDI_YUV_PRV                    Base=r32        ElementSize=4 Type=d    // Previous frame DI output
+.declare dDI_YUV_CUR                    Base=r36        ElementSize=4 Type=d    // Current frame DI output
+#define  npDI_YUV                       1024                                                                    // = 32*32 = 0x400
+// For packed output (address sub-registers)
+#define  p422_YOFFSET           a0.2
+#define  p422_UOFFSET           a0.3
+#define  p422_VOFFSET           a0.4
+#define  pDN_TFLDSRC            a0.6
+#define  pDN_BFLDSRC            a0.7
+#define  npRESP                         192                                                                     // = 6*32 = 0xC0; NOTE(review): dRESP/ubRESP above are based at r8 (8*32 = 256) - confirm which register this offset is meant to address
+// Message source (three typed views of r70)
+.declare udMSGSRC                                       Base=r70          ElementSize=4 Type=ud
+.declare uwMSGSRC                                       Base=r70          ElementSize=2 Type=uw
+.declare dMSGSRC          Base=r70    ElementSize=4 Type=d
+//---------------------------------------------------------------------------
+// Kernel MRF variables (message register assignment for each send used by the DI kernel)
+//---------------------------------------------------------------------------
+#define mMSGHDR_SMPL            m1                                                                      // Sampler response: m1~m2
+.declare mudMSGHDR_SMPL         Base=m1         ElementSize=4 Type=ud
+.declare muwMSGHDR_SMPL         Base=m1         ElementSize=2 Type=uw
+#define mMSGHDR_DN                      m3                                                                      // Denoise output: m3~m7 for PA, m3~m5 for PL
+.declare mdMSGHDR_DN            Base=m3         ElementSize=4 Type=d
+#define mMSGHDR_STAT            m8                                                                      // Statistics output: m8~m9
+.declare mdMSGHDR_STAT          Base=m8         ElementSize=4 Type=d
+.declare mubMSGHDR_STAT         Base=m8         ElementSize=1 Type=ub
+#define mMSGHDR_DI                      m10                                                                     // DI output: m10~m14
+.declare mdMSGHDR_DI            Base=m10        ElementSize=4 Type=d
+#define mMSGHDR_EOT                     m15                                                                     // EOT
+#ifdef GT     // MSGSRC is empty on GT and expands to null:ud otherwise
+#define MSGSRC
+#else
+#define MSGSRC                          null:ud
+#endif
+//---------------------------------------------------------------------------
+// End of thread instruction
+// END_THREAD expands to the send that terminates the thread: header in
+// mMSGHDR_EOT, extended descriptor nTS_EOT, descriptor nEOT_MSGDSC.
+//---------------------------------------------------------------------------
+#ifndef GT      // ILK: the send carries an explicit null:ud source operand
+#define END_THREAD              send (8) null<1>:d mMSGHDR_EOT null:ud nTS_EOT nEOT_MSGDSC
+#else           // GT: no source operand between the header and the descriptors
+#define END_THREAD              send (8) null<1>:d mMSGHDR_EOT nTS_EOT nEOT_MSGDSC
+#endif
+// end of DI.inc

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/DI_Hist_Save.asm
 ,0 → 1,24
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Write denoise history to memory: build the block header in rMSGSRC, copy it and one history dword to the MRF, then issue the dataport write
+shr (2)    rMSGSRC.0<1>:ud    wORIX<2;2,1>:w            2:w                      NODDCLR           // X,Y origin / 4 (two words starting at wORIX, shifted right by 2)
+add (1)    rMSGSRC.0<1>:ud    rMSGSRC.0<0;1,0>:ud       uwSPITCH_DIV2<0;1,0>:uw  NODDCLR_NODDCHK  // Add pitch to X origin (uwSPITCH_DIV2 is added to header element 0)
+mov (1)    rMSGSRC.2<1>:ud    nDPW_BLOCK_SIZE_HIST:ud                            NODDCHK           // block width and height (4x2)
+mov (8)    mMSGHDR_HIST<1>:ud      rMSGSRC.0<8;8,1>:ud                   // message header
+mov (1)    mudMSGHDR_HIST(1)<1>    udRESP(nDI_HIST_OFFSET,0)<0;1,0>    // Move denoise history to MRF (one dword, taken at the DI-enabled history offset of the response)
+send (8)   dNULLREG    mMSGHDR_HIST    udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPMW_MSG_LEN_HIST+nBI_STMM_HISTORY_OUTPUT:ud    // Dataport write, no response (null destination)

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/DI_SAVE_PA.asm
 ,0 → 1,56
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+    shl (1) rMSGSRC.0<1>:ud     wORIX<0;1,0>:w            1:w  NODDCLR             // H. block origin need to be doubled (header X = wORIX << 1)
+    mov (1) rMSGSRC.1<1>:ud     wORIY<0;1,0>:w                 NODDCLR_NODDCHK    // Block origin (header Y = wORIY)
+    mov (1) rMSGSRC.2<1>:ud     nDPW_BLOCK_SIZE_DI:ud          NODDCHK             // Block width and height (32x8) - NOTE(review): DNDI.inc defines nDPW_BLOCK_SIZE_DI as nBLOCK_WIDTH_32+nBLOCK_HEIGHT_4; confirm the stated height
+        add (4) pCF_Y_OFFSET<1>:uw   ubDEST_CF_OFFSET<4;4,1>:ub   nDEST_YUV_REG*nGRFWIB:w    // Initial Y,U,V offset in YUV422 block: four byte offsets from ubDEST_CF_OFFSET plus nDEST_YUV_REG*nGRFWIB
+        // Pack 2nd field Y (source: response rows at nDI_PREV_FRAME_LUMA_OFFSET)
+    $for(0; <nY_NUM_OF_ROWS; 1) {
+                mov     (16) r[pCF_Y_OFFSET, %1*nGRFWIB]<2>       ubRESP(nDI_PREV_FRAME_LUMA_OFFSET,%1*16)     // 16 Y bytes per row, written with destination stride 2
+    }
+        // Pack 1st field Y (source: response rows at nDI_CURR_FRAME_LUMA_OFFSET)
+    $for(0; <nY_NUM_OF_ROWS; 1) {
+                mov     (16) r[pCF_Y_OFFSET, %1+4*nGRFWIB]<2>       ubRESP(nDI_CURR_FRAME_LUMA_OFFSET,%1*16)     // NOTE(review): offset is written %1+4*nGRFWIB; presumably (%1+4) rows past the base - confirm how the $for expander evaluates it
+    }
+        // Pack 2nd field U (source: response rows at nDI_PREV_FRAME_CHROMA_OFFSET)
+    $for(0; <nUV_NUM_OF_ROWS; 1) {
+        mov (8) r[pCF_U_OFFSET,   %1*nGRFWIB]<4>  ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2>  //U pixels: every 2nd source byte starting at byte 1, destination stride 4
+    }
+         // Pack 1st field U (source: response rows at nDI_CURR_FRAME_CHROMA_OFFSET)
+    $for(0; <nUV_NUM_OF_ROWS; 1) {
+        mov (8) r[pCF_U_OFFSET,   %1+4*nGRFWIB]<4>  ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2>  //U pixels
+    }
+        // Pack 2nd field V (source: response rows at nDI_PREV_FRAME_CHROMA_OFFSET)
+    $for(0; <nUV_NUM_OF_ROWS; 1) {
+        mov (8) r[pCF_V_OFFSET,   %1*nGRFWIB]<4>  ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16)<16;8,2>  //V pixels: every 2nd source byte starting at byte 0, destination stride 4
+    }
+        // Pack 1st field V (source: response rows at nDI_CURR_FRAME_CHROMA_OFFSET)
+    $for(0; <nUV_NUM_OF_ROWS; 1) {
+        mov (8) r[pCF_V_OFFSET,   %1+4*nGRFWIB]<4>  ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16)<16;8,2>  //V pixels
+    }
+    //save the previous frame: header plus registers 0-3 of udDEST_YUV, written to nBI_DESTINATION_1_YUV
+    mov (8) mMSGHDR<1>:ud       rMSGSRC<8;8,1>:ud     // message header
+    $for(0; <4; 1) {
+            mov (8) mudMSGPAYLOAD(%1)<1>  udDEST_YUV(%1)REGION(8,1)     // payload register %1
+    }
+    send (8)    dNULLREG    mMSGHDR   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPW_MSG_SIZE_DI+nBI_DESTINATION_1_YUV:ud
+    //save the current frame: same header, registers 4-7 of udDEST_YUV, written to nBI_DESTINATION_2_YUV
+    mov (8) mMSGHDR<1>:ud       rMSGSRC<8;8,1>:ud     // message header (re-copied for the second send)
+    $for(0; <4; 1) {
+            mov (8) mudMSGPAYLOAD(%1)<1>  udDEST_YUV(%1+4)REGION(8,1)     // payload register %1 from source register %1+4
+    }
+    send (8)    dNULLREG    mMSGHDR   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPW_MSG_SIZE_DI+nBI_DESTINATION_2_YUV:ud

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/DNDI.inc
 ,0 → 1,162
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: DNDI.inc
+#ifdef GT
+// GT DI Kernel
+#else // ILK
+// ILK DI Kernel
+#endif
+#include "undefall.inc"
+//---------------------------------------------------------------------------
+// Message descriptors
+//---------------------------------------------------------------------------
+// Extended message descriptor
+          // Message descriptor for sampler read (bit breakdown of nSMPL_DI_MSGDSC)
+//        //                      = 000 0010 (message len 2) 00000 (resp len - set later, 12 or 5 or 11)
+//        //                        1 (header present 1) 0 11 (SIMD32/64 mode)
+//        //                        1000 (message type) 0000 (DI state index)
+//        //                        00000000 (binding table index - set later)
+//        //                      = 0x040b8000
+#define nSMPL_DI_MSGDSC           0x040b8000
+#define nSMPL_RESP_LEN_DNDI      nRESLEN_12      // 12 - for DN + DI Alg
+#define nSMPL_RESP_LEN_DN_PL     nRESLEN_5       // 5  - for DN Planar Alg
+#define nSMPL_RESP_LEN_DN_PA     nRESLEN_9       // 9  - for DN Packed Alg
+#define nSMPL_RESP_LEN_DI        nRESLEN_9       // 9  - for DI Only Alg
+#define nSMPL_RESP_LEN_PDI       nRESLEN_11      // 11 - for Partial DI Alg
+// Attention: The Message Length is The Number of GRFs with Data Only, without the Header
+#define nDPMW_MSG_LEN_STMM       nMSGLEN_1       // 1 - For STMM Save
+#define nDPMW_MSG_LEN_HIST       nMSGLEN_1       // 1 - For Denoise History Save
+#define nDPMW_MSG_LEN_PA_DN_DI   nMSGLEN_4       // 4 - For DN Curr Save
+#define nDPMW_MSG_LEN_PA_DN_NODI nMSGLEN_8       // 8 - For DN Curr Save (denoise only - DI disabled)
+#define nDPMW_MSG_LEN_PL_DN_DI   nMSGLEN_2       // 2 - For DN Curr Save
+#define nDPMW_MSG_LEN_PL_DN_NODI nMSGLEN_4       // 4 - For DN Curr Save (denoise only - DI disabled)
+#define nDPW_BLOCK_SIZE_STMM   nBLOCK_WIDTH_8+nBLOCK_HEIGHT_4   // Y block size 8x4
+#undef  nDPW_BLOCK_SIZE_DI
+#undef  nDPW_MSG_SIZE_DI
+#define nDPW_BLOCK_SIZE_DI  nBLOCK_WIDTH_32+nBLOCK_HEIGHT_4     // DI write block size 32x4 (replaces any earlier definition)
+#define nDPW_MSG_SIZE_DI    nMSGLEN_4                           // DI write message length (replaces any earlier definition)
+//---------------------------------------------------------------------------
+// Kernel GRF variables
+//---------------------------------------------------------------------------
+// Defines for DI enabled (used as the register index into udRESP/ubRESP)
+#define nDI_PREV_FRAME_LUMA_OFFSET          0
+#define nDI_PREV_FRAME_CHROMA_OFFSET        2
+#define nDI_CURR_FRAME_LUMA_OFFSET          4
+#define nDI_CURR_FRAME_CHROMA_OFFSET        6
+#define nDI_STMM_OFFSET                     8
+#define nDI_HIST_OFFSET                     9
+#define nDI_CURR_2ND_FIELD_LUMA_OFFSET     10
+#define nDI_CURR_2ND_FIELD_CHROMA_OFFSET   11
+// Defines for DI disabled (used as the register index into udRESP/ubRESP)
+#define nNODI_LUMA_OFFSET                   0
+#define nNODI_HIST_OFFSET                   4
+#define nNODI_CHROMA_OFFSET                 5
+#ifdef DI_ENABLE
+    #define nHIST_OFFSET    nDI_HIST_OFFSET
+    #undef  nY_NUM_OF_ROWS
+    #define nY_NUM_OF_ROWS      8       // Number of Y rows per block (4 rows for each frame)
+    #undef  nUV_NUM_OF_ROWS
+    #define nUV_NUM_OF_ROWS     8       // Number of U/V rows per block
+#endif
+#ifdef DI_DISABLE
+    #define nHIST_OFFSET    nNODI_HIST_OFFSET
+#endif
+#if (nSRC_REGION==nREGION_2)     // Region 2: SRC/DEST aliases name the BOT variables, packed-output aliases name the TOP registers
+    #define ub2SRC_Y      ub2BOT_Y
+    #define ub2SRC_U      ub2BOT_U
+    #define ub2SRC_V      ub2BOT_V
+    #define uwDEST_Y      uwBOT_Y
+    #define uwDEST_U      uwBOT_U
+    #define uwDEST_V      uwBOT_V
+    #define nDEST_YUV_REG nTOP_Y
+    #define udDEST_YUV    udTOP_Y_IO
+    #define nRESP         nTEMP0         // DI return message requires 12 GRFs
+    #define nDN_YUV       nTOP_Y         // Space for Packing DN for next run requires 8 GRFs
+    #undef  nSRC_REGION
+    #define nSRC_REGION   nREGION_2
+#else     // Any other value: SRC/DEST aliases name the TOP variables, packed-output aliases name the BOT registers
+    #define ub2SRC_Y      ub2TOP_Y
+    #define ub2SRC_U      ub2TOP_U
+    #define ub2SRC_V      ub2TOP_V
+    #define uwDEST_Y      uwTOP_Y
+    #define uwDEST_U      uwTOP_U
+    #define uwDEST_V      uwTOP_V
+    #define nDEST_YUV_REG nBOT_Y
+    #define udDEST_YUV    udBOT_Y_IO
+    #define nRESP         nTEMP0         // DI return message requires 12 GRFs
+    #define nDN_YUV       nBOT_Y         // Space for Packing DN for next run requires 8 GRFs
+    #undef  nSRC_REGION
+    #define nSRC_REGION   nREGION_1    // REGION_1 will be the source region for first kernel
+#endif
+// Message response (Denoised & DI-ed pixels & statistics): dword and byte views of the registers starting at nRESP
+.declare udRESP      Base=REG(r,nRESP) ElementSize=4 SrcRegion=REGION(8,1) DstRegion=<1> Type=ud
+.declare ubRESP      Base=REG(r,nRESP) ElementSize=1 SrcRegion=REGION(16,1) DstRegion=<1> Type=ub
+// For Denoised Curr Output (Used as Priv in Next Run): byte and dword views of the registers starting at nDN_YUV
+.declare ubDN_YUV           Base=REG(r,nDN_YUV)    ElementSize=1 Type=ub
+.declare udDN_YUV           Base=REG(r,nDN_YUV)    ElementSize=4 Type=ud
+#define  npDN_YUV           nDN_YUV*nGRFWIB     // nDN_YUV register number scaled by nGRFWIB
+// For DI Process Output (1st and 2nd Frames Output) - declarations currently disabled
+//.declare udDI_YUV_PRIV      Base=REG(r,nTEMP0)    ElementSize=4 Type=ud   // Previous frame DI output
+//.declare udDI_YUV_CURR      Base=REG(r,nTEMP0)    ElementSize=4 Type=ud   // Current frame DI output
+//#define  npDI_YUV           nTEMP0*nGRFWIB
+//---------------------------------------------------------------------------
+// Kernel MRF variables
+// Message register assignment for the DNDI sends. The sampler command, the
+// denoise output and the first DI output all start at m1, so they share the
+// same message registers; STMM (m11), HIST (m13) and the second DI output
+// (m6) have their own starting registers.
+//---------------------------------------------------------------------------
+#define  mMSG_SMPL           m1                                              // Sampler Command is in: m1~m2
+.declare mudMSG_SMPL         Base=mMSG_SMPL         ElementSize=4 Type=ud
+.declare muwMSG_SMPL         Base=mMSG_SMPL         ElementSize=2 Type=uw
+#define mMSGHDR_DN           m1                                              // Denoise Output: m1~m9 for PA, m1~m5 for PL (header + 8 or 4 data GRFs when DI is disabled)
+.declare mudMSGHDR_DN        Base=mMSGHDR_DN        ElementSize=4 Type=ud
+.declare mubMSGHDR_DN        Base=mMSGHDR_DN        ElementSize=1 Type=ub
+#define mMSGHDR_STMM         m11                                             // STMM Output: m11~m12
+.declare mudMSGHDR_STMM      Base=mMSGHDR_STMM      ElementSize=4 Type=ud
+#define mMSGHDR_HIST         m13                                             // HIST Output: m13~m14
+// ElementSize is 4 to match Type=ud, like every other ud declaration in this
+// file (it was 1). Existing users address whole registers only, e.g.
+// mudMSGHDR_HIST(1), so their operands are unchanged.
+.declare mudMSGHDR_HIST      Base=mMSGHDR_HIST      ElementSize=4 Type=ud
+#define mMSGHDR_DI_1ST       m1                                              // DI output: m1~m5
+.declare mudMSGHDR_DI_1ST    Base=mMSGHDR_DI_1ST    ElementSize=4 Type=ud
+#define mMSGHDR_DI_2ND       m6                                              // DI output: m6~m10
+.declare mudMSGHDR_DI_2ND    Base=mMSGHDR_DI_2ND    ElementSize=4 Type=ud
+// end of DNDI.inc

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/DNDI_COMMAND.asm
 ,0 → 1,17
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Activate the DNDI send command: copy the header, patch the X/Y origin words into the 2nd message register, then send to the sampler
+mov (8)     mudMSG_SMPL(0)<1>        rMSGSRC.0<8;8,1>:ud    NODDCLR         // message header
+mov (1)     muwMSG_SMPL(1,4)<1>      wORIX<0;1,0>:w         NODDCLR_NODDCHK// horizontal origin (word 4 of the 2nd message register)
+mov (1)     muwMSG_SMPL(1,12)<1>     wORIY<0;1,0>:w         NODDCLR_NODDCHK         // vertical origin (word 12 of the 2nd message register); NOTE(review): the last write before the send still carries NODDCLR - confirm this is intended
+//mov (2)     muwMSG_SMPL(1,4)<2>      wORIX<2;2,1>:w       NODDCHK// problem during compile !! when using this line
+send (8)    udRESP(0)<1>    mMSG_SMPL  udDUMMY_NULL   nSMPL_ENGINE    nSMPL_DI_MSGDSC+nSMPL_RESP_LEN+nBI_CURRENT_SRC_YUV_HW_DI:ud

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/DNDI_Hist_Save.asm
 ,0 → 1,20
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Write denoise history to memory: build the block header in rMSGSRC, copy it and two history dwords to the MRF, then issue the dataport write
+shr (2)    rMSGSRC.0<1>:ud    wORIX<2;2,1>:w            2:w                       NODDCLR         // X,Y origin / 4 (two words starting at wORIX, shifted right by 2)
+add (1)    rMSGSRC.0<1>:ud    rMSGSRC.0<0;1,0>:ud       uwSPITCH_DIV2<0;1,0>:uw   NODDCLR_NODDCHK// Add pitch to X origin (uwSPITCH_DIV2 is added to header element 0)
+mov (1)    rMSGSRC.2<1>:ud    nDPW_BLOCK_SIZE_HIST:ud                             NODDCHK         // block width and height (4x2)
+mov (8)    mMSGHDR_HIST<1>:ud      rMSGSRC.0<8;8,1>:ud                   // message header
+mov (2)    mudMSGHDR_HIST(1)<1>    udRESP(nNODI_HIST_OFFSET,0)<2;2,1>    // Move denoise history to MRF (two dwords, taken at the DI-disabled history offset of the response)
+send (8)   dNULLREG    mMSGHDR_HIST    udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPMW_MSG_LEN_HIST+nBI_STMM_HISTORY_OUTPUT:ud    // Dataport write, no response (null destination)

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/PA_AVS_IEF_16x8.asm
 ,0 → 1,26
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+//---------- PA_AVS_IEF_16x8.asm ----------
+#include "AVS_IEF.inc"
+//------------------------------------------------------------------------------
+// 2 sampler reads for 8x8 YUV packed
+//------------------------------------------------------------------------------
+#include "PA_AVS_IEF_Sample.asm"
+//------------------------------------------------------------------------------
+// Unpacking sampler reads to 4:4:4 internal planar
+//------------------------------------------------------------------------------
+#include "PA_AVS_IEF_Unpack_16x8.asm"
+//------------------------------------------------------------------------------

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/PA_AVS_IEF_8x4.asm
 ,0 → 1,25
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+//---------- PA_AVS_IEF_8x4.asm ----------
+// Wrapper fragment: pulls in the shared AVS/IEF definitions, then the two AVS
+// sampler reads, then the 8x4 unpack of the two sampler responses.
+// Include order matters: the unpack step reads the response registers that the
+// sample step's send instructions write.
+#include "AVS_IEF.inc"
+//------------------------------------------------------------------------------
+// 2 sampler reads for 8x8 YUV packed
+//------------------------------------------------------------------------------
+#include "PA_AVS_IEF_Sample.asm"
+//------------------------------------------------------------------------------
+// Unpacking sampler data to 4:2:0 internal planar
+//------------------------------------------------------------------------------
+#include "PA_AVS_IEF_Unpack_8x4.asm"
+//------------------------------------------------------------------------------

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/PA_AVS_IEF_8x8.asm
 ,0 → 1,25
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+//---------- PA_AVS_IEF_8x8.asm ----------
+// Wrapper fragment: pulls in the shared AVS/IEF definitions, then the two AVS
+// sampler reads, then the 8x8 unpack of the two sampler responses.
+// Include order matters: the unpack step reads the response registers that the
+// sample step's send instructions write.
+#include "AVS_IEF.inc"
+//------------------------------------------------------------------------------
+// 2 sampler reads for 8x8 YUV packed
+//------------------------------------------------------------------------------
+#include "PA_AVS_IEF_Sample.asm"
+//------------------------------------------------------------------------------
+// Unpacking sampler data to 4:2:2 internal planar
+//------------------------------------------------------------------------------
+#include "PA_AVS_IEF_Unpack_8x8.asm"
+//------------------------------------------------------------------------------

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/PA_AVS_IEF_Sample.asm
 ,0 → 1,34
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+//---------- PA_AVS_IEF_Sample.asm ----------
+//------------------------------------------------------------------------------
+// 2 sampler reads for 8x8 YUV packed
+//------------------------------------------------------------------------------
+// Flow:
+//   1. Set up the first block, write the channel mask into dword 2 of the header
+//      mirror, copy the mirror GRFs to the MRFs at mAVS_8x8_HDR and send to the
+//      sampler; the response is written starting at uwAVS_RESPONSE(0).
+//   2. Set up the second block, copy the mirror to the MRFs at mAVS_8x8_HDR_2 and
+//      send again; the response is written starting at uwAVS_RESPONSE_2(0).
+// Both sends use the same message descriptor expression.
+// The channel mask is written only before the first send; the second send relies on
+// whatever AVS_SetupSecondBlock.asm leaves in the mirror (not visible in this file).
+    // 1st 8x8 setup
+    #include "AVS_SetupFirstBlock.asm"
+    // Enable RGB(YUV) channels
+    mov (1)  rAVS_8x8_HDR.2:ud      nAVS_RGB_CHANNELS:ud
+    // 16 dwords are copied, i.e. two consecutive registers: header mirror then payload mirror
+    mov (16) mAVS_8x8_HDR.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(0)<1>   mAVS_8x8_HDR    udDUMMY_NULL    nSMPL_ENGINE    nAVS_MSG_DSC_3CH+nSI_SRC_YUV+nBI_CURRENT_SRC_YUV
+    // Return YUV in 12 GRFs
+    // 2nd 8x8 setup
+    #include "AVS_SetupSecondBlock.asm"
+    // Same mirror source as above, but copied to the second pair of message registers
+    mov (16) mAVS_8x8_HDR_2.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE_2(0)<1> mAVS_8x8_HDR_2    udDUMMY_NULL    nSMPL_ENGINE    nAVS_MSG_DSC_3CH+nSI_SRC_YUV+nBI_CURRENT_SRC_YUV
+    // Return YUV in 12 GRFs

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/PA_AVS_IEF_Unpack_16x8.asm
 ,0 → 1,288
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+//---------- PA_AVS_IEF_Unpack_16x8.asm ----------
+// Unpacks the two AVS sampler responses (uwAVS_RESPONSE and uwAVS_RESPONSE_2)
+// into the destination registers.  Two build-time variants:
+//   AVS_OUTPUT_16_BIT defined : 16-bit samples interleaved per pixel (this section)
+//   otherwise                 : one byte per sample widened to a word, planar Y/U/V
+// Response register usage as read by the moves below (same for both responses):
+//   V: 0,1,6,7     Y: 2,3,8,9     U: 4,5,10,11
+#ifdef AVS_OUTPUT_16_BIT        //Output is packed in AVYU format
+// Packed path: every mov (4) writes 4 words with a destination stride of 4 words
+// (the <4>), so each component lands in its own word slot of a 4-word pixel:
+//   slot 0 = U, slot 1 = Y, slot 2 = V, slot 3 = A (written as constant 0).
+// The first response fills uwDEST_Y first-index values 0,1,4,5,8,9,...,28,29;
+// the second response fills 2,3,6,7,10,11,...,30,31.
+// Within one response register the 4-word groups are consumed in the order
+// word 0, word 8, word 4, word 12 (see the second source index).
+// NOTE(review): the first uwDEST_Y index is presumably a GRF number and the second a
+// word sub-register offset — confirm against the uwDEST_Y macro definition.
+// Move first 8x8 words of Y to dest GRF (as packed)
+    mov (4) uwDEST_Y(0,1)<4>       uwAVS_RESPONSE(2,0)<4;4,1>
+    mov (4) uwDEST_Y(1,1)<4>       uwAVS_RESPONSE(2,8)<4;4,1>
+    mov (4) uwDEST_Y(4,1)<4>       uwAVS_RESPONSE(2,4)<4;4,1>
+    mov (4) uwDEST_Y(5,1)<4>       uwAVS_RESPONSE(2,12)<4;4,1>
+    mov (4) uwDEST_Y(8,1)<4>       uwAVS_RESPONSE(3,0)<4;4,1>
+    mov (4) uwDEST_Y(9,1)<4>       uwAVS_RESPONSE(3,8)<4;4,1>
+    mov (4) uwDEST_Y(12,1)<4>      uwAVS_RESPONSE(3,4)<4;4,1>
+    mov (4) uwDEST_Y(13,1)<4>      uwAVS_RESPONSE(3,12)<4;4,1>
+    mov (4) uwDEST_Y(16,1)<4>      uwAVS_RESPONSE(8,0)<4;4,1>
+    mov (4) uwDEST_Y(17,1)<4>      uwAVS_RESPONSE(8,8)<4;4,1>
+    mov (4) uwDEST_Y(20,1)<4>      uwAVS_RESPONSE(8,4)<4;4,1>
+    mov (4) uwDEST_Y(21,1)<4>      uwAVS_RESPONSE(8,12)<4;4,1>
+    mov (4) uwDEST_Y(24,1)<4>      uwAVS_RESPONSE(9,0)<4;4,1>
+    mov (4) uwDEST_Y(25,1)<4>      uwAVS_RESPONSE(9,8)<4;4,1>
+    mov (4) uwDEST_Y(28,1)<4>      uwAVS_RESPONSE(9,4)<4;4,1>
+    mov (4) uwDEST_Y(29,1)<4>      uwAVS_RESPONSE(9,12)<4;4,1>
+// Move first 8x8 words of U to dest GRF (as packed)
+    mov (4) uwDEST_Y(0,0)<4>       uwAVS_RESPONSE(4,0)<4;4,1>
+    mov (4) uwDEST_Y(1,0)<4>       uwAVS_RESPONSE(4,8)<4;4,1>
+    mov (4) uwDEST_Y(4,0)<4>       uwAVS_RESPONSE(4,4)<4;4,1>
+    mov (4) uwDEST_Y(5,0)<4>       uwAVS_RESPONSE(4,12)<4;4,1>
+    mov (4) uwDEST_Y(8,0)<4>       uwAVS_RESPONSE(5,0)<4;4,1>
+    mov (4) uwDEST_Y(9,0)<4>       uwAVS_RESPONSE(5,8)<4;4,1>
+    mov (4) uwDEST_Y(12,0)<4>      uwAVS_RESPONSE(5,4)<4;4,1>
+    mov (4) uwDEST_Y(13,0)<4>      uwAVS_RESPONSE(5,12)<4;4,1>
+    mov (4) uwDEST_Y(16,0)<4>      uwAVS_RESPONSE(10,0)<4;4,1>
+    mov (4) uwDEST_Y(17,0)<4>      uwAVS_RESPONSE(10,8)<4;4,1>
+    mov (4) uwDEST_Y(20,0)<4>      uwAVS_RESPONSE(10,4)<4;4,1>
+    mov (4) uwDEST_Y(21,0)<4>      uwAVS_RESPONSE(10,12)<4;4,1>
+    mov (4) uwDEST_Y(24,0)<4>      uwAVS_RESPONSE(11,0)<4;4,1>
+    mov (4) uwDEST_Y(25,0)<4>      uwAVS_RESPONSE(11,8)<4;4,1>
+    mov (4) uwDEST_Y(28,0)<4>      uwAVS_RESPONSE(11,4)<4;4,1>
+    mov (4) uwDEST_Y(29,0)<4>      uwAVS_RESPONSE(11,12)<4;4,1>
+// Move first 8x8 words of V to dest GRF (as packed)
+    mov (4) uwDEST_Y(0,2)<4>       uwAVS_RESPONSE(0,0)<4;4,1>
+    mov (4) uwDEST_Y(1,2)<4>       uwAVS_RESPONSE(0,8)<4;4,1>
+    mov (4) uwDEST_Y(4,2)<4>       uwAVS_RESPONSE(0,4)<4;4,1>
+    mov (4) uwDEST_Y(5,2)<4>       uwAVS_RESPONSE(0,12)<4;4,1>
+    mov (4) uwDEST_Y(8,2)<4>       uwAVS_RESPONSE(1,0)<4;4,1>
+    mov (4) uwDEST_Y(9,2)<4>       uwAVS_RESPONSE(1,8)<4;4,1>
+    mov (4) uwDEST_Y(12,2)<4>      uwAVS_RESPONSE(1,4)<4;4,1>
+    mov (4) uwDEST_Y(13,2)<4>      uwAVS_RESPONSE(1,12)<4;4,1>
+    mov (4) uwDEST_Y(16,2)<4>      uwAVS_RESPONSE(6,0)<4;4,1>
+    mov (4) uwDEST_Y(17,2)<4>      uwAVS_RESPONSE(6,8)<4;4,1>
+    mov (4) uwDEST_Y(20,2)<4>      uwAVS_RESPONSE(6,4)<4;4,1>
+    mov (4) uwDEST_Y(21,2)<4>      uwAVS_RESPONSE(6,12)<4;4,1>
+    mov (4) uwDEST_Y(24,2)<4>      uwAVS_RESPONSE(7,0)<4;4,1>
+    mov (4) uwDEST_Y(25,2)<4>      uwAVS_RESPONSE(7,8)<4;4,1>
+    mov (4) uwDEST_Y(28,2)<4>      uwAVS_RESPONSE(7,4)<4;4,1>
+    mov (4) uwDEST_Y(29,2)<4>      uwAVS_RESPONSE(7,12)<4;4,1>
+// Move first 8x8 words of A to dest GRF (as packed)
+// (no alpha is read from the response; the alpha slot is filled with 0)
+    mov (4) uwDEST_Y(0,3)<4>       0:uw
+    mov (4) uwDEST_Y(1,3)<4>       0:uw
+    mov (4) uwDEST_Y(4,3)<4>       0:uw
+    mov (4) uwDEST_Y(5,3)<4>       0:uw
+    mov (4) uwDEST_Y(8,3)<4>       0:uw
+    mov (4) uwDEST_Y(9,3)<4>       0:uw
+    mov (4) uwDEST_Y(12,3)<4>      0:uw
+    mov (4) uwDEST_Y(13,3)<4>      0:uw
+    mov (4) uwDEST_Y(16,3)<4>      0:uw
+    mov (4) uwDEST_Y(17,3)<4>      0:uw
+    mov (4) uwDEST_Y(20,3)<4>      0:uw
+    mov (4) uwDEST_Y(21,3)<4>      0:uw
+    mov (4) uwDEST_Y(24,3)<4>      0:uw
+    mov (4) uwDEST_Y(25,3)<4>      0:uw
+    mov (4) uwDEST_Y(28,3)<4>      0:uw
+    mov (4) uwDEST_Y(29,3)<4>      0:uw
+// Move second 8x8 words of Y to dest GRF
+    mov (4) uwDEST_Y(2,1)<4>       uwAVS_RESPONSE_2(2,0)<4;4,1>
+    mov (4) uwDEST_Y(3,1)<4>       uwAVS_RESPONSE_2(2,8)<4;4,1>
+    mov (4) uwDEST_Y(6,1)<4>       uwAVS_RESPONSE_2(2,4)<4;4,1>
+    mov (4) uwDEST_Y(7,1)<4>       uwAVS_RESPONSE_2(2,12)<4;4,1>
+    mov (4) uwDEST_Y(10,1)<4>      uwAVS_RESPONSE_2(3,0)<4;4,1>
+    mov (4) uwDEST_Y(11,1)<4>      uwAVS_RESPONSE_2(3,8)<4;4,1>
+    mov (4) uwDEST_Y(14,1)<4>      uwAVS_RESPONSE_2(3,4)<4;4,1>
+    mov (4) uwDEST_Y(15,1)<4>      uwAVS_RESPONSE_2(3,12)<4;4,1>
+    mov (4) uwDEST_Y(18,1)<4>      uwAVS_RESPONSE_2(8,0)<4;4,1>
+    mov (4) uwDEST_Y(19,1)<4>      uwAVS_RESPONSE_2(8,8)<4;4,1>
+    mov (4) uwDEST_Y(22,1)<4>      uwAVS_RESPONSE_2(8,4)<4;4,1>
+    mov (4) uwDEST_Y(23,1)<4>      uwAVS_RESPONSE_2(8,12)<4;4,1>
+    mov (4) uwDEST_Y(26,1)<4>      uwAVS_RESPONSE_2(9,0)<4;4,1>
+    mov (4) uwDEST_Y(27,1)<4>      uwAVS_RESPONSE_2(9,8)<4;4,1>
+    mov (4) uwDEST_Y(30,1)<4>      uwAVS_RESPONSE_2(9,4)<4;4,1>
+    mov (4) uwDEST_Y(31,1)<4>      uwAVS_RESPONSE_2(9,12)<4;4,1>
+// Move second 8x8 words of U to dest GRF
+    mov (4) uwDEST_Y(2,0)<4>       uwAVS_RESPONSE_2(4,0)<4;4,1>
+    mov (4) uwDEST_Y(3,0)<4>       uwAVS_RESPONSE_2(4,8)<4;4,1>
+    mov (4) uwDEST_Y(6,0)<4>       uwAVS_RESPONSE_2(4,4)<4;4,1>
+    mov (4) uwDEST_Y(7,0)<4>       uwAVS_RESPONSE_2(4,12)<4;4,1>
+    mov (4) uwDEST_Y(10,0)<4>      uwAVS_RESPONSE_2(5,0)<4;4,1>
+    mov (4) uwDEST_Y(11,0)<4>      uwAVS_RESPONSE_2(5,8)<4;4,1>
+    mov (4) uwDEST_Y(14,0)<4>      uwAVS_RESPONSE_2(5,4)<4;4,1>
+    mov (4) uwDEST_Y(15,0)<4>      uwAVS_RESPONSE_2(5,12)<4;4,1>
+    mov (4) uwDEST_Y(18,0)<4>      uwAVS_RESPONSE_2(10,0)<4;4,1>
+    mov (4) uwDEST_Y(19,0)<4>      uwAVS_RESPONSE_2(10,8)<4;4,1>
+    mov (4) uwDEST_Y(22,0)<4>      uwAVS_RESPONSE_2(10,4)<4;4,1>
+    mov (4) uwDEST_Y(23,0)<4>      uwAVS_RESPONSE_2(10,12)<4;4,1>
+    mov (4) uwDEST_Y(26,0)<4>      uwAVS_RESPONSE_2(11,0)<4;4,1>
+    mov (4) uwDEST_Y(27,0)<4>      uwAVS_RESPONSE_2(11,8)<4;4,1>
+    mov (4) uwDEST_Y(30,0)<4>      uwAVS_RESPONSE_2(11,4)<4;4,1>
+    mov (4) uwDEST_Y(31,0)<4>      uwAVS_RESPONSE_2(11,12)<4;4,1>
+// Move second 8x8 words of V to dest GRF
+    mov (4) uwDEST_Y(2,2)<4>       uwAVS_RESPONSE_2(0,0)<4;4,1>
+    mov (4) uwDEST_Y(3,2)<4>       uwAVS_RESPONSE_2(0,8)<4;4,1>
+    mov (4) uwDEST_Y(6,2)<4>       uwAVS_RESPONSE_2(0,4)<4;4,1>
+    mov (4) uwDEST_Y(7,2)<4>       uwAVS_RESPONSE_2(0,12)<4;4,1>
+    mov (4) uwDEST_Y(10,2)<4>      uwAVS_RESPONSE_2(1,0)<4;4,1>
+    mov (4) uwDEST_Y(11,2)<4>      uwAVS_RESPONSE_2(1,8)<4;4,1>
+    mov (4) uwDEST_Y(14,2)<4>      uwAVS_RESPONSE_2(1,4)<4;4,1>
+    mov (4) uwDEST_Y(15,2)<4>      uwAVS_RESPONSE_2(1,12)<4;4,1>
+    mov (4) uwDEST_Y(18,2)<4>      uwAVS_RESPONSE_2(6,0)<4;4,1>
+    mov (4) uwDEST_Y(19,2)<4>      uwAVS_RESPONSE_2(6,8)<4;4,1>
+    mov (4) uwDEST_Y(22,2)<4>      uwAVS_RESPONSE_2(6,4)<4;4,1>
+    mov (4) uwDEST_Y(23,2)<4>      uwAVS_RESPONSE_2(6,12)<4;4,1>
+    mov (4) uwDEST_Y(26,2)<4>      uwAVS_RESPONSE_2(7,0)<4;4,1>
+    mov (4) uwDEST_Y(27,2)<4>      uwAVS_RESPONSE_2(7,8)<4;4,1>
+    mov (4) uwDEST_Y(30,2)<4>      uwAVS_RESPONSE_2(7,4)<4;4,1>
+    mov (4) uwDEST_Y(31,2)<4>      uwAVS_RESPONSE_2(7,12)<4;4,1>
+// Move second 8x8 words of A to dest GRF
+// (constant 0, as for the first block)
+    mov (4) uwDEST_Y(2,3)<4>       0:uw
+    mov (4) uwDEST_Y(3,3)<4>       0:uw
+    mov (4) uwDEST_Y(6,3)<4>       0:uw
+    mov (4) uwDEST_Y(7,3)<4>       0:uw
+    mov (4) uwDEST_Y(10,3)<4>      0:uw
+    mov (4) uwDEST_Y(11,3)<4>      0:uw
+    mov (4) uwDEST_Y(14,3)<4>      0:uw
+    mov (4) uwDEST_Y(15,3)<4>      0:uw
+    mov (4) uwDEST_Y(18,3)<4>      0:uw
+    mov (4) uwDEST_Y(19,3)<4>      0:uw
+    mov (4) uwDEST_Y(22,3)<4>      0:uw
+    mov (4) uwDEST_Y(23,3)<4>      0:uw
+    mov (4) uwDEST_Y(26,3)<4>      0:uw
+    mov (4) uwDEST_Y(27,3)<4>      0:uw
+    mov (4) uwDEST_Y(30,3)<4>      0:uw
+    mov (4) uwDEST_Y(31,3)<4>      0:uw
+/*      This section will be used if 16-bit output is needed in planar format -vK
+    // Move first 8x8 words of Y to dest GRF
+    mov (8)  uwDEST_Y(0)<1>     uwAVS_RESPONSE(2,0)<8;4,1>
+    mov (8)  uwDEST_Y(1)<1>     uwAVS_RESPONSE(2,8)<8;4,1>
+    mov (8)  uwDEST_Y(2)<1>     uwAVS_RESPONSE(3,0)<8;4,1>
+    mov (8)  uwDEST_Y(3)<1>     uwAVS_RESPONSE(3,8)<8;4,1>
+    mov (8)  uwDEST_Y(4)<1>     uwAVS_RESPONSE(8,0)<8;4,1>
+    mov (8)  uwDEST_Y(5)<1>     uwAVS_RESPONSE(8,8)<8;4,1>
+    mov (8)  uwDEST_Y(6)<1>     uwAVS_RESPONSE(9,0)<8;4,1>
+    mov (8)  uwDEST_Y(7)<1>     uwAVS_RESPONSE(9,8)<8;4,1>
+    // Move first 8x8 words of V to dest GRF
+    mov (8) uwDEST_V(0)<1>      ubAVS_RESPONSE(0,0)<8;4,1>
+    mov (8) uwDEST_V(1)<1>      ubAVS_RESPONSE(0,8)<8;4,1>
+    mov (8) uwDEST_V(2)<1>      ubAVS_RESPONSE(1,0)<8;4,1>
+    mov (8) uwDEST_V(3)<1>      ubAVS_RESPONSE(1,8)<8;4,1>
+    mov (8) uwDEST_V(4)<1>      ubAVS_RESPONSE(6,0)<8;4,1>
+    mov (8) uwDEST_V(5)<1>      ubAVS_RESPONSE(6,8)<8;4,1>
+    mov (8) uwDEST_V(6)<1>      ubAVS_RESPONSE(7,0)<8;4,1>
+    mov (8) uwDEST_V(7)<1>      ubAVS_RESPONSE(7,8)<8;4,1>
+    // Move first 8x8 words of U to dest GRF
+    mov (8) uwDEST_U(0)<1>      ubAVS_RESPONSE(4,0)<8;4,1>
+    mov (8) uwDEST_U(1)<1>      ubAVS_RESPONSE(4,8)<8;4,1>
+    mov (8) uwDEST_U(2)<1>      ubAVS_RESPONSE(5,0)<8;4,1>
+    mov (8) uwDEST_U(3)<1>      ubAVS_RESPONSE(5,8)<8;4,1>
+    mov (8) uwDEST_U(4)<1>      ubAVS_RESPONSE(10,0)<8;4,1>
+    mov (8) uwDEST_U(5)<1>      ubAVS_RESPONSE(10,8)<8;4,1>
+    mov (8) uwDEST_U(6)<1>      ubAVS_RESPONSE(11,0)<8;4,1>
+    mov (8) uwDEST_U(7)<1>      ubAVS_RESPONSE(11,8)<8;4,1>
+    // Move second 8x8 words of Y to dest GRF
+    mov (8)  uwDEST_Y(0,8)<1>     uwAVS_RESPONSE_2(2,0)<8;4,1>
+    mov (8)  uwDEST_Y(1,8)<1>     uwAVS_RESPONSE_2(2,8)<8;4,1>
+    mov (8)  uwDEST_Y(2,8)<1>     uwAVS_RESPONSE_2(3,0)<8;4,1>
+    mov (8)  uwDEST_Y(3,8)<1>     uwAVS_RESPONSE_2(3,8)<8;4,1>
+    mov (8)  uwDEST_Y(4,8)<1>     uwAVS_RESPONSE_2(8,0)<8;4,1>
+    mov (8)  uwDEST_Y(5,8)<1>     uwAVS_RESPONSE_2(8,8)<8;4,1>
+    mov (8)  uwDEST_Y(6,8)<1>     uwAVS_RESPONSE_2(9,0)<8;4,1>
+    mov (8)  uwDEST_Y(7,8)<1>     uwAVS_RESPONSE_2(9,8)<8;4,1>
+    // Move second 8x8 words of V to dest GRF
+    mov (8) uwDEST_V(0,8)<1>      ubAVS_RESPONSE_2(0,0)<8;4,1>
+    mov (8) uwDEST_V(1,8)<1>      ubAVS_RESPONSE_2(0,8)<8;4,1>
+    mov (8) uwDEST_V(2,8)<1>      ubAVS_RESPONSE_2(1,0)<8;4,1>
+    mov (8) uwDEST_V(3,8)<1>      ubAVS_RESPONSE_2(1,8)<8;4,1>
+    mov (8) uwDEST_V(4,8)<1>      ubAVS_RESPONSE_2(6,0)<8;4,1>
+    mov (8) uwDEST_V(5,8)<1>      ubAVS_RESPONSE_2(6,8)<8;4,1>
+    mov (8) uwDEST_V(6,8)<1>      ubAVS_RESPONSE_2(7,0)<8;4,1>
+    mov (8) uwDEST_V(7,8)<1>      ubAVS_RESPONSE_2(7,8)<8;4,1>
+    // Move second 8x8 words of U to dest GRF
+    mov (8) uwDEST_U(0,8)<1>      ubAVS_RESPONSE_2(4,0)<8;4,1>
+    mov (8) uwDEST_U(1,8)<1>      ubAVS_RESPONSE_2(4,8)<8;4,1>
+    mov (8) uwDEST_U(2,8)<1>      ubAVS_RESPONSE_2(5,0)<8;4,1>
+    mov (8) uwDEST_U(3,8)<1>      ubAVS_RESPONSE_2(5,8)<8;4,1>
+    mov (8) uwDEST_U(4,8)<1>      ubAVS_RESPONSE_2(10,0)<8;4,1>
+    mov (8) uwDEST_U(5,8)<1>      ubAVS_RESPONSE_2(10,8)<8;4,1>
+    mov (8) uwDEST_U(6,8)<1>      ubAVS_RESPONSE_2(11,0)<8;4,1>
+    mov (8) uwDEST_U(7,8)<1>      ubAVS_RESPONSE_2(11,8)<8;4,1>
+*/
+#else   /* OUTPUT_8_BIT */
+    // 8-bit planar path.  Sources are addressed as bytes (ub) starting at an odd byte
+    // offset (1 or 8+1) with a horizontal stride of 2 bytes, so exactly one byte of
+    // every 16-bit response word is taken and widened into a word of the destination.
+    // NOTE(review): byte 1 is presumably the high byte of each response word — confirm.
+    // Region <16;4,2>: 4 elements spaced 2 bytes apart, next group starts 16 bytes later;
+    // each mov (8) therefore gathers response words 0-3 and 8-11 (offset 1) or
+    // words 4-7 and 12-15 (offset 8+1) of one response register.
+    // The first response is written at destination sub-offset 0, the second at sub-offset 8.
+    // Response register usage: Y from 2,3,8,9; V from 0,1,6,7; U from 4,5,10,11.
+    // Move first 8x8 words of Y to dest GRF
+    mov (8)  uwDEST_Y(0)<1>     ubAVS_RESPONSE(2,1)<16;4,2>
+    mov (8)  uwDEST_Y(1)<1>     ubAVS_RESPONSE(2,8+1)<16;4,2>
+    mov (8)  uwDEST_Y(2)<1>     ubAVS_RESPONSE(3,1)<16;4,2>
+    mov (8)  uwDEST_Y(3)<1>     ubAVS_RESPONSE(3,8+1)<16;4,2>
+    mov (8)  uwDEST_Y(4)<1>     ubAVS_RESPONSE(8,1)<16;4,2>
+    mov (8)  uwDEST_Y(5)<1>     ubAVS_RESPONSE(8,8+1)<16;4,2>
+    mov (8)  uwDEST_Y(6)<1>     ubAVS_RESPONSE(9,1)<16;4,2>
+    mov (8)  uwDEST_Y(7)<1>     ubAVS_RESPONSE(9,8+1)<16;4,2>
+    // Move first 8x8 words of V to dest GRF
+    mov (8) uwDEST_V(0)<1>      ubAVS_RESPONSE(0,1)<16;4,2>
+    mov (8) uwDEST_V(1)<1>      ubAVS_RESPONSE(0,8+1)<16;4,2>
+    mov (8) uwDEST_V(2)<1>      ubAVS_RESPONSE(1,1)<16;4,2>
+    mov (8) uwDEST_V(3)<1>      ubAVS_RESPONSE(1,8+1)<16;4,2>
+    mov (8) uwDEST_V(4)<1>      ubAVS_RESPONSE(6,1)<16;4,2>
+    mov (8) uwDEST_V(5)<1>      ubAVS_RESPONSE(6,8+1)<16;4,2>
+    mov (8) uwDEST_V(6)<1>      ubAVS_RESPONSE(7,1)<16;4,2>
+    mov (8) uwDEST_V(7)<1>      ubAVS_RESPONSE(7,8+1)<16;4,2>
+    // Move first 8x8 words of U to dest GRF
+    mov (8) uwDEST_U(0)<1>      ubAVS_RESPONSE(4,1)<16;4,2>
+    mov (8) uwDEST_U(1)<1>      ubAVS_RESPONSE(4,8+1)<16;4,2>
+    mov (8) uwDEST_U(2)<1>      ubAVS_RESPONSE(5,1)<16;4,2>
+    mov (8) uwDEST_U(3)<1>      ubAVS_RESPONSE(5,8+1)<16;4,2>
+    mov (8) uwDEST_U(4)<1>      ubAVS_RESPONSE(10,1)<16;4,2>
+    mov (8) uwDEST_U(5)<1>      ubAVS_RESPONSE(10,8+1)<16;4,2>
+    mov (8) uwDEST_U(6)<1>      ubAVS_RESPONSE(11,1)<16;4,2>
+    mov (8) uwDEST_U(7)<1>      ubAVS_RESPONSE(11,8+1)<16;4,2>
+    // Move second 8x8 words of Y to dest GRF
+    mov (8) uwDEST_Y(0,8)<1>          ubAVS_RESPONSE_2(2,1)<16;4,2>
+    mov (8) uwDEST_Y(1,8)<1>          ubAVS_RESPONSE_2(2,8+1)<16;4,2>
+    mov (8) uwDEST_Y(2,8)<1>          ubAVS_RESPONSE_2(3,1)<16;4,2>
+    mov (8) uwDEST_Y(3,8)<1>          ubAVS_RESPONSE_2(3,8+1)<16;4,2>
+    mov (8) uwDEST_Y(4,8)<1>          ubAVS_RESPONSE_2(8,1)<16;4,2>
+    mov (8) uwDEST_Y(5,8)<1>          ubAVS_RESPONSE_2(8,8+1)<16;4,2>
+    mov (8) uwDEST_Y(6,8)<1>          ubAVS_RESPONSE_2(9,1)<16;4,2>
+    mov (8) uwDEST_Y(7,8)<1>          ubAVS_RESPONSE_2(9,8+1)<16;4,2>
+    // Move second 8x8 words of V to dest GRF
+    mov (8) uwDEST_V(0,8)<1>          ubAVS_RESPONSE_2(0,1)<16;4,2>
+    mov (8) uwDEST_V(1,8)<1>          ubAVS_RESPONSE_2(0,8+1)<16;4,2>
+    mov (8) uwDEST_V(2,8)<1>          ubAVS_RESPONSE_2(1,1)<16;4,2>
+    mov (8) uwDEST_V(3,8)<1>          ubAVS_RESPONSE_2(1,8+1)<16;4,2>
+    mov (8) uwDEST_V(4,8)<1>          ubAVS_RESPONSE_2(6,1)<16;4,2>
+    mov (8) uwDEST_V(5,8)<1>          ubAVS_RESPONSE_2(6,8+1)<16;4,2>
+    mov (8) uwDEST_V(6,8)<1>          ubAVS_RESPONSE_2(7,1)<16;4,2>
+    mov (8) uwDEST_V(7,8)<1>          ubAVS_RESPONSE_2(7,8+1)<16;4,2>
+    // Move second 8x8 words of U to dest GRF
+    mov (8) uwDEST_U(0,8)<1>          ubAVS_RESPONSE_2(4,1)<16;4,2>
+    mov (8) uwDEST_U(1,8)<1>          ubAVS_RESPONSE_2(4,8+1)<16;4,2>
+    mov (8) uwDEST_U(2,8)<1>          ubAVS_RESPONSE_2(5,1)<16;4,2>
+    mov (8) uwDEST_U(3,8)<1>          ubAVS_RESPONSE_2(5,8+1)<16;4,2>
+    mov (8) uwDEST_U(4,8)<1>          ubAVS_RESPONSE_2(10,1)<16;4,2>
+    mov (8) uwDEST_U(5,8)<1>          ubAVS_RESPONSE_2(10,8+1)<16;4,2>
+    mov (8) uwDEST_U(6,8)<1>          ubAVS_RESPONSE_2(11,1)<16;4,2>
+    mov (8) uwDEST_U(7,8)<1>          ubAVS_RESPONSE_2(11,8+1)<16;4,2>
+#endif
+//------------------------------------------------------------------------------
+   // Re-define new number of lines
+   // Overrides any earlier values of both row-count macros for code included after
+   // this file.  In the 8-bit planar path above, Y, U and V each occupy destination
+   // indices 0..7, matching the value 8 set here for both macros.
+   #undef nUV_NUM_OF_ROWS
+   #undef nY_NUM_OF_ROWS
+   #define nY_NUM_OF_ROWS      8
+   #define nUV_NUM_OF_ROWS     8

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/PA_AVS_IEF_Unpack_8x4.asm
 ,0 → 1,77
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+//---------- PA_AVS_IEF_Unpack_8x4.asm ----------
+// Unpacks the two AVS sampler responses (ubAVS_RESPONSE / ubAVS_RESPONSE_2) into
+// planar Y, U and V destination words for the 4:2:0 case.
+//   Y : every response word is used (region <16;4,2>, byte offsets 1 and 8+1),
+//       giving 8 destination registers per plane.
+//   U/V: every other word is used (region <16;2,4>) and only the group at byte
+//       offset 1 of each response register is read (the 8+1 group is skipped),
+//       so only uwDEST_U/uwDEST_V(0) and (1) are written: 4 rows of 8 words.
+// One byte per 16-bit response word is taken and widened to a destination word.
+// NOTE(review): byte 1 is presumably the high byte of each response word — confirm.
+// Yoni: In order to optimize unpacking, 3 methods are being checked:
+//  1. AVS_ORIGINAL
+//  2. AVS_ROUND_TO_8_BITS
+//  3. AVS_INDIRECT_ACCESS
+//
+// Only 1 method should stay in the code
+//#define AVS_ROUND_TO_8_BITS
+//#define AVS_INDIRECT_ACCESS
+    // Move first 8x8 words of Y to dest GRF
+    mov (8)  uwDEST_Y(0)<1>     ubAVS_RESPONSE(2,1)<16;4,2>
+    mov (8)  uwDEST_Y(1)<1>     ubAVS_RESPONSE(2,8+1)<16;4,2>
+    mov (8)  uwDEST_Y(2)<1>     ubAVS_RESPONSE(3,1)<16;4,2>
+    mov (8)  uwDEST_Y(3)<1>     ubAVS_RESPONSE(3,8+1)<16;4,2>
+    mov (8)  uwDEST_Y(4)<1>     ubAVS_RESPONSE(8,1)<16;4,2>
+    mov (8)  uwDEST_Y(5)<1>     ubAVS_RESPONSE(8,8+1)<16;4,2>
+    mov (8)  uwDEST_Y(6)<1>     ubAVS_RESPONSE(9,1)<16;4,2>
+    mov (8)  uwDEST_Y(7)<1>     ubAVS_RESPONSE(9,8+1)<16;4,2>
+    // Move first 4x4 words of V to dest GRF (words 0-3 of each 8-word chroma row)
+    mov (4) uwDEST_V(0)<1>      ubAVS_RESPONSE(0,1)<16;2,4>
+    mov (4) uwDEST_V(0,8)<1>    ubAVS_RESPONSE(1,1)<16;2,4>
+    mov (4) uwDEST_V(1)<1>      ubAVS_RESPONSE(6,1)<16;2,4>
+    mov (4) uwDEST_V(1,8)<1>    ubAVS_RESPONSE(7,1)<16;2,4>
+    // Move first 4x4 words of U to dest GRF (words 0-3 of each 8-word chroma row)
+    mov (4) uwDEST_U(0)<1>      ubAVS_RESPONSE(4,1)<16;2,4>
+    mov (4) uwDEST_U(0,8)<1>    ubAVS_RESPONSE(5,1)<16;2,4>
+    mov (4) uwDEST_U(1)<1>      ubAVS_RESPONSE(10,1)<16;2,4>
+    mov (4) uwDEST_U(1,8)<1>    ubAVS_RESPONSE(11,1)<16;2,4>
+    // Move second 8x8 words of Y to dest GRF
+    mov (8) uwDEST_Y(0,8)<1>    ubAVS_RESPONSE_2(2,1)<16;4,2>
+    mov (8) uwDEST_Y(1,8)<1>    ubAVS_RESPONSE_2(2,8+1)<16;4,2>
+    mov (8) uwDEST_Y(2,8)<1>    ubAVS_RESPONSE_2(3,1)<16;4,2>
+    mov (8) uwDEST_Y(3,8)<1>    ubAVS_RESPONSE_2(3,8+1)<16;4,2>
+    mov (8) uwDEST_Y(4,8)<1>    ubAVS_RESPONSE_2(8,1)<16;4,2>
+    mov (8) uwDEST_Y(5,8)<1>    ubAVS_RESPONSE_2(8,8+1)<16;4,2>
+    mov (8) uwDEST_Y(6,8)<1>    ubAVS_RESPONSE_2(9,1)<16;4,2>
+    mov (8) uwDEST_Y(7,8)<1>    ubAVS_RESPONSE_2(9,8+1)<16;4,2>
+    // Move second 4x4 words of V to dest GRF (words 4-7 of each 8-word chroma row)
+    mov (4) uwDEST_V(0,4)<1>    ubAVS_RESPONSE_2(0,1)<16;2,4>
+    mov (4) uwDEST_V(0,12)<1>   ubAVS_RESPONSE_2(1,1)<16;2,4>
+    mov (4) uwDEST_V(1,4)<1>    ubAVS_RESPONSE_2(6,1)<16;2,4>
+    mov (4) uwDEST_V(1,12)<1>   ubAVS_RESPONSE_2(7,1)<16;2,4>
+    // Move second 4x4 words of U to dest GRF (words 4-7 of each 8-word chroma row)
+    mov (4) uwDEST_U(0,4)<1>    ubAVS_RESPONSE_2(4,1)<16;2,4>
+    mov (4) uwDEST_U(0,12)<1>   ubAVS_RESPONSE_2(5,1)<16;2,4>
+    mov (4) uwDEST_U(1,4)<1>    ubAVS_RESPONSE_2(10,1)<16;2,4>
+    mov (4) uwDEST_U(1,12)<1>   ubAVS_RESPONSE_2(11,1)<16;2,4>
+//------------------------------------------------------------------------------
+       // Re-define new number of lines
+       // Y keeps 8 rows.  Chroma has only 4 rows here: the moves above fill
+       // uwDEST_U/uwDEST_V(0) and (1) only, i.e. 2 registers x 2 rows of 8 words.
+       // (The 8x8 unpack fills indices 0..3 and therefore uses 8.)
+       #undef nUV_NUM_OF_ROWS
+       #undef nY_NUM_OF_ROWS
+       #define nY_NUM_OF_ROWS      8
+       #define nUV_NUM_OF_ROWS     4

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/PA_AVS_IEF_Unpack_8x8.asm
 ,0 → 1,93
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+//---------- PA_AVS_IEF_Unpack_8x8.asm ----------
+// Unpacks the two AVS sampler responses (ubAVS_RESPONSE / ubAVS_RESPONSE_2) into
+// planar Y, U and V destination words for the 4:2:2 case.
+//   Y : every response word is used (region <16;4,2>), 8 destination registers.
+//   U/V: every other word is used (region <16;2,4>, i.e. elements 4 bytes apart);
+//       both the offset-1 and the offset-8+1 group of each response register are
+//       read, filling uwDEST_U/uwDEST_V(0)..(3).
+// One byte per 16-bit response word is taken and widened to a destination word.
+// NOTE(review): byte 1 is presumably the high byte of each response word — confirm.
+// The first response is written at destination sub-offsets 0 and 8, the second at
+// sub-offsets 4 and 12 (chroma) or 8 (luma).
+// Yoni: In order to optimize unpacking, 3 methods are being checked:
+//  1. AVS_ORIGINAL
+//  2. AVS_ROUND_TO_8_BITS
+//  3. AVS_INDIRECT_ACCESS
+//
+// Only 1 method should stay in the code
+//#define AVS_ROUND_TO_8_BITS
+//#define AVS_INDIRECT_ACCESS
+    // Move first 8x8 words of Y to dest GRF
+    mov (8)  uwDEST_Y(0)<1>     ubAVS_RESPONSE(2,1)<16;4,2>
+    mov (8)  uwDEST_Y(1)<1>     ubAVS_RESPONSE(2,8+1)<16;4,2>
+    mov (8)  uwDEST_Y(2)<1>     ubAVS_RESPONSE(3,1)<16;4,2>
+    mov (8)  uwDEST_Y(3)<1>     ubAVS_RESPONSE(3,8+1)<16;4,2>
+    mov (8)  uwDEST_Y(4)<1>     ubAVS_RESPONSE(8,1)<16;4,2>
+    mov (8)  uwDEST_Y(5)<1>     ubAVS_RESPONSE(8,8+1)<16;4,2>
+    mov (8)  uwDEST_Y(6)<1>     ubAVS_RESPONSE(9,1)<16;4,2>
+    mov (8)  uwDEST_Y(7)<1>     ubAVS_RESPONSE(9,8+1)<16;4,2>
+    // Move first 4x8 words of V to dest GRF
+    mov (4) uwDEST_V(0)<1>      ubAVS_RESPONSE(0,1)<16;2,4>
+    mov (4) uwDEST_V(0,8)<1>    ubAVS_RESPONSE(0,8+1)<16;2,4>
+    mov (4) uwDEST_V(1)<1>      ubAVS_RESPONSE(1,1)<16;2,4>
+    mov (4) uwDEST_V(1,8)<1>    ubAVS_RESPONSE(1,8+1)<16;2,4>
+    mov (4) uwDEST_V(2)<1>      ubAVS_RESPONSE(6,1)<16;2,4>
+    mov (4) uwDEST_V(2,8)<1>    ubAVS_RESPONSE(6,8+1)<16;2,4>
+    mov (4) uwDEST_V(3)<1>      ubAVS_RESPONSE(7,1)<16;2,4>
+    mov (4) uwDEST_V(3,8)<1>    ubAVS_RESPONSE(7,8+1)<16;2,4>
+    // Move first 4x8 words of U to dest GRF
+    mov (4) uwDEST_U(0)<1>      ubAVS_RESPONSE(4,1)<16;2,4>
+    mov (4) uwDEST_U(0,8)<1>    ubAVS_RESPONSE(4,8+1)<16;2,4>
+    mov (4) uwDEST_U(1)<1>      ubAVS_RESPONSE(5,1)<16;2,4>
+    mov (4) uwDEST_U(1,8)<1>    ubAVS_RESPONSE(5,8+1)<16;2,4>
+    mov (4) uwDEST_U(2)<1>      ubAVS_RESPONSE(10,1)<16;2,4>
+    mov (4) uwDEST_U(2,8)<1>    ubAVS_RESPONSE(10,8+1)<16;2,4>
+    mov (4) uwDEST_U(3)<1>      ubAVS_RESPONSE(11,1)<16;2,4>
+    mov (4) uwDEST_U(3,8)<1>    ubAVS_RESPONSE(11,8+1)<16;2,4>
+    // Move second 8x8 words of Y to dest GRF
+    mov (8) uwDEST_Y(0,8)<1>    ubAVS_RESPONSE_2(2,1)<16;4,2>
+    mov (8) uwDEST_Y(1,8)<1>    ubAVS_RESPONSE_2(2,8+1)<16;4,2>
+    mov (8) uwDEST_Y(2,8)<1>    ubAVS_RESPONSE_2(3,1)<16;4,2>
+    mov (8) uwDEST_Y(3,8)<1>    ubAVS_RESPONSE_2(3,8+1)<16;4,2>
+    mov (8) uwDEST_Y(4,8)<1>    ubAVS_RESPONSE_2(8,1)<16;4,2>
+    mov (8) uwDEST_Y(5,8)<1>    ubAVS_RESPONSE_2(8,8+1)<16;4,2>
+    mov (8) uwDEST_Y(6,8)<1>    ubAVS_RESPONSE_2(9,1)<16;4,2>
+    mov (8) uwDEST_Y(7,8)<1>    ubAVS_RESPONSE_2(9,8+1)<16;4,2>
+    // Move second 4x8 words of V to dest GRF
+    mov (4) uwDEST_V(0,4)<1>    ubAVS_RESPONSE_2(0,1)<16;2,4>
+    mov (4) uwDEST_V(0,12)<1>   ubAVS_RESPONSE_2(0,8+1)<16;2,4>
+    mov (4) uwDEST_V(1,4)<1>    ubAVS_RESPONSE_2(1,1)<16;2,4>
+    mov (4) uwDEST_V(1,12)<1>   ubAVS_RESPONSE_2(1,8+1)<16;2,4>
+    mov (4) uwDEST_V(2,4)<1>    ubAVS_RESPONSE_2(6,1)<16;2,4>
+    mov (4) uwDEST_V(2,12)<1>   ubAVS_RESPONSE_2(6,8+1)<16;2,4>
+    mov (4) uwDEST_V(3,4)<1>    ubAVS_RESPONSE_2(7,1)<16;2,4>
+    mov (4) uwDEST_V(3,12)<1>   ubAVS_RESPONSE_2(7,8+1)<16;2,4>
+    // Move second 4x8 words of U to dest GRF
+    mov (4) uwDEST_U(0,4)<1>    ubAVS_RESPONSE_2(4,1)<16;2,4>
+    mov (4) uwDEST_U(0,12)<1>   ubAVS_RESPONSE_2(4,8+1)<16;2,4>
+    mov (4) uwDEST_U(1,4)<1>    ubAVS_RESPONSE_2(5,1)<16;2,4>
+    mov (4) uwDEST_U(1,12)<1>   ubAVS_RESPONSE_2(5,8+1)<16;2,4>
+    mov (4) uwDEST_U(2,4)<1>    ubAVS_RESPONSE_2(10,1)<16;2,4>
+    mov (4) uwDEST_U(2,12)<1>   ubAVS_RESPONSE_2(10,8+1)<16;2,4>
+    mov (4) uwDEST_U(3,4)<1>    ubAVS_RESPONSE_2(11,1)<16;2,4>
+    mov (4) uwDEST_U(3,12)<1>   ubAVS_RESPONSE_2(11,8+1)<16;2,4>
+//------------------------------------------------------------------------------
+       // Re-define new number of lines
+       // Chroma fills uwDEST_U/uwDEST_V(0)..(3), two 8-word rows per index -> 8 rows,
+       // the same count as luma.
+       #undef nUV_NUM_OF_ROWS
+       #undef nY_NUM_OF_ROWS
+       #define nY_NUM_OF_ROWS      8
+       #define nUV_NUM_OF_ROWS     8

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/PA_DNDI_ALG.asm
 ,0 → 1,139
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Build-time configuration for the DN/DI kernel.
+// DI_ENABLE is defined before DNDI.inc is included (presumably DNDI.inc tests it — confirm).
+#define DI_ENABLE
+    #include "DNDI.inc"
+    // Sampler response length: DI-only builds use the DI length, all others the DN+DI length
+    #ifdef DI_ONLY
+                #undef  nSMPL_RESP_LEN
+                #define nSMPL_RESP_LEN          nSMPL_RESP_LEN_DI               // set the number of GRF
+        #else
+                #undef  nSMPL_RESP_LEN
+                #define nSMPL_RESP_LEN          nSMPL_RESP_LEN_DNDI               // set the number of GRF
+        #endif
+    // Override the data-port write block sizes for the history and denoise surfaces
+    // NOTE(review): the HIST value is built from nBLOCK_WIDTH_4 + nBLOCK_HEIGHT_1 while its
+    // comment says 4x2 — confirm which one is intended.
+    #undef  nDPW_BLOCK_SIZE_HIST
+    #define nDPW_BLOCK_SIZE_HIST    nBLOCK_WIDTH_4+nBLOCK_HEIGHT_1    // HIST Block Size for Write is 4x2
+    #undef  nDPW_BLOCK_SIZE_DN
+    #define nDPW_BLOCK_SIZE_DN      nBLOCK_WIDTH_32+nBLOCK_HEIGHT_4   // DN Block Size for Write is 32x4
+////////////////////////////////////// Run the DN Algorithm ///////////////////////////////////////
+    #include "DNDI_Command.asm"
+////////////////////////////////////// Rearrange for Internal Planar //////////////////////////////
+    //// move the previous frame Y component to internal planar format
+    //$for (0; <nY_NUM_OF_ROWS/2; 1) {
+    //    mov (16) uwDEST_Y(%1,0)<1>    ubRESP(nDI_PREV_FRAME_LUMA_OFFSET,%1*16)
+    //}
+    //// move the previous frame U,V components to internal planar format
+    //$for (0; <nUV_NUM_OF_ROWS/2; 1) {
+    //    mov (8) uwDEST_U(0,%1*8)<1>   ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2>  //U pixels
+    //    mov (8) uwDEST_V(0,%1*8)<1>   ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16)<16;8,2>    //V pixels
+    //}
+    //// move the current frame Y component to internal planar format
+    //$for (0; <nY_NUM_OF_ROWS/2; 1) {
+    //    mov (16) uwDEST_Y(%1+4,0)<1>  ubRESP(nDI_CURR_FRAME_LUMA_OFFSET,%1*16)
+    //}
+    //// move the current frame U,V components to internal planar format
+    //$for (0; <nUV_NUM_OF_ROWS/2; 1) {
+    //    mov (8) uwDEST_U(2,%1*8)<1>   ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2>  //U pixels
+    //    mov (8) uwDEST_V(2,%1*8)<1>   ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16)<16;8,2>    //V pixels
+    //}
+////////////////////////////////////// Save the STMM Data for Next Run /////////////////////////
+    // Write STMM to memory
+    // Builds the message header in rMSGSRC (dword 0 = X origin / 2, dword 1 = Y origin,
+    // dword 2 = block size), copies it to the first message register, places the STMM
+    // portion of the sampler response in the second, and issues a data-port write.
+    // The three header writes carry NODDCLR / NODDCLR_NODDCHK / NODDCHK in that order;
+    // keep them together and in this order.
+    shr (1)     rMSGSRC.0<1>:ud        wORIX<0;1,0>:w            1:w     NODDCLR          // X origin / 2
+    mov (1)     rMSGSRC.1<1>:ud        wORIY<0;1,0>:w                    NODDCLR_NODDCHK // Y origin
+    mov (1)     rMSGSRC.2<1>:ud        nDPW_BLOCK_SIZE_STMM:ud          NODDCHK         // block width and height (8x4)
+    mov (8)     mudMSGHDR_STMM(0)<1>   rMSGSRC.0<8;8,1>:ud               // message header
+    mov (8)     mudMSGHDR_STMM(1)<1>   udRESP(nDI_STMM_OFFSET,0)         // Move STMM to MRF
+    // Response destination is the null register: nothing is read back from this write
+    send (8)    dNULLREG               mMSGHDR_STMM              udDUMMY_NULL    nDATAPORT_WRITE     nDPMW_MSGDSC+nDPMW_MSG_LEN_STMM+nBI_STMM_HISTORY_OUTPUT:ud
+////////////////////////////////////// Save the History Data for Next Run /////////////////////////
+#ifdef DI_ONLY
+#else
+    #include "DI_Hist_Save.asm"
+////////////////////////////////////// Pack and Save the DN Curr Frame for Next Run ///////////////
+    // Sets flag f0.0 when ubTFLD_FIRST == 1, prepares the indirect-addressing offsets
+    // and the write header for the denoised frame, then branches on the flag:
+    // flag set -> TOP_FIELD_FIRST, otherwise falls through to BOTTOM_FIELD_FIRST.
+    // check top/bottom field first
+        cmp.e.f0.0 (1)  null<1>:w               ubTFLD_FIRST<0;1,0>:ub     1:w
+    // Four offsets are produced in one instruction: each ubSRC_CF_OFFSET byte plus npDN_YUV.
+    // NOTE(review): presumably these feed pCF_Y_OFFSET / pCF_U_OFFSET / pCF_V_OFFSET used in
+    // the r[...] operands below — confirm the layout of the fourth entry.
+    add (4)     pCF_Y_OFFSET<1>:uw          ubSRC_CF_OFFSET<4;4,1>:ub  npDN_YUV:uw
+        //set the save DN position
+    shl (1)     rMSGSRC.0<1>:ud      wORIX<0;1,0>:w          1:w NODDCLR           // X origin * 2
+    mov (1)     rMSGSRC.1<1>:ud      wORIY<0;1,0>:w              NODDCLR_NODDCHK   // Y origin
+    mov (1)     rMSGSRC.2<1>:ud      nDPW_BLOCK_SIZE_DN:ud       NODDCHK             // block width and height (32x4, per the nDPW_BLOCK_SIZE_DN define in this file)
+    mov (8)     mudMSGHDR_DN(0)<1>   rMSGSRC.0<8;8,1>:ud
+    (f0.0) jmpi (1) TOP_FIELD_FIRST
+// Reached by fall-through when f0.0 is not set (ubTFLD_FIRST != 1).
+// Interleaves lines of the current frame into the packed save area addressed through
+// pCF_Y_OFFSET / pCF_U_OFFSET / pCF_V_OFFSET: the "*32" destinations take the 2nd-field
+// data, the "+1*32" destinations take the 1st-field (current frame) data.
+// Destination byte stride is 2 for luma and 4 for each chroma component.
+// The $for blocks are expanded by the kernel build preprocessor; %1 and %2 are its two
+// loop variables.  NOTE(review): presumably %1 steps 0,2 and %2 steps 0,1, and "%1+1*32"
+// is evaluated by that preprocessor — confirm its operator precedence before editing.
+// The commented-out loop below is the earlier single-loop form of the three loops that follow.
+BOTTOM_FIELD_FIRST:
+    //$for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+    //    mov (16)    r[pCF_Y_OFFSET,  %1*32]<2>:ub     ubRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*16) // 2nd field luma from current frame (line 0,2)
+    //    mov (16)    r[pCF_Y_OFFSET,  %1+1*32]<2>:ub   ubRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,16) // 1st field luma from current frame (line 1,3)
+    //    mov (8)     r[pCF_U_OFFSET,  %1*32]<4>:ub     ubRESP(nDI_CURR_2ND_FIELD_CHROMA_OFFSET,%2*16+1)<16;8,2> // 2nd field U from current frame (line 0,2)
+    //    mov (8)     r[pCF_V_OFFSET,  %1*32]<4>:ub     ubRESP(nDI_CURR_2ND_FIELD_CHROMA_OFFSET,%2*16)<16;8,2> // 2nd field V from current frame (line 0,2)
+    //    mov (8)     r[pCF_U_OFFSET,  %1+1*32]<4>:ub   ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET+%2,16+1)<16;8,2> // 1st field U from current frame (line 1,3)
+    //    mov (8)     r[pCF_V_OFFSET,  %1+1*32]<4>:ub   ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET+%2,16)<16;8,2> // 1st field U from current frame (line 1,3)
+    //}
+    // Luma
+    $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (16)    r[pCF_Y_OFFSET,  %1*32]<2>:ub     ubRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*16) // 2nd field luma from current frame (line 0,2)
+        mov (16)    r[pCF_Y_OFFSET,  %1+1*32]<2>:ub   ubRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,16) // 1st field luma from current frame (line 1,3)
+    }
+    // U: odd bytes of the interleaved chroma data (source sub-offset ends in +1)
+    $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (8)     r[pCF_U_OFFSET,  %1*32]<4>:ub     ubRESP(nDI_CURR_2ND_FIELD_CHROMA_OFFSET,%2*16+1)<16;8,2> // 2nd field U from current frame (line 0,2)
+        mov (8)     r[pCF_U_OFFSET,  %1+1*32]<4>:ub   ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET+%2,16+1)<16;8,2> // 1st field U from current frame (line 1,3)
+    }
+    // V: even bytes of the interleaved chroma data
+    $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (8)     r[pCF_V_OFFSET,  %1*32]<4>:ub     ubRESP(nDI_CURR_2ND_FIELD_CHROMA_OFFSET,%2*16)<16;8,2> // 2nd field V from current frame (line 0,2)
+        mov (8)     r[pCF_V_OFFSET,  %1+1*32]<4>:ub   ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET+%2,16)<16;8,2> // 1st field V from current frame (line 1,3)
+    }
+    // Skip the top-field-first variant
+    jmpi (1) SAVE_DN_CURR
+TOP_FIELD_FIRST:    // Target of the predicated jmpi above: 1st field -> lines 0,2,.. ; 2nd field -> lines 1,3,.. ; falls through into SAVE_DN_CURR
+    //$for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+    //    mov (16)    r[pCF_Y_OFFSET,  %1*32]<2>:ub       ubRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,0) // 1st field luma from current frame (line 0,2)
+    //    mov (16)    r[pCF_Y_OFFSET,  %1+1*32]<2>:ub     ubRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*16) // 2nd field luma from current frame (line 1,3)
+    //    mov (8)     r[pCF_U_OFFSET,  %1*32]<4>:ub       ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET+%2,1)<16;8,2> // 1st field U from current frame (line 0,2)
+    //    mov (8)     r[pCF_V_OFFSET,  %1*32]<4>:ub       ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET+%2,0)<16;8,2> // 1st field V from current frame (line 0,2)
+    //    mov (8)     r[pCF_U_OFFSET,  %1+1*32]<4>:ub     ubRESP(nDI_CURR_2ND_FIELD_CHROMA_OFFSET,%2*16+1)<16;8,2> // 2nd field U from current frame (line 1,3)
+    //    mov (8)     r[pCF_V_OFFSET,  %1+1*32]<4>:ub     ubRESP(nDI_CURR_2ND_FIELD_CHROMA_OFFSET,%2*16)<16;8,2> // 2nd field V from current frame (line 1,3)
+    //}
+        $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (16)    r[pCF_Y_OFFSET,  %1*32]<2>:ub       ubRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,0) // 1st field luma from current frame (line 0,2); Y lands on every 2nd destination byte
+        mov (16)    r[pCF_Y_OFFSET,  %1+1*32]<2>:ub     ubRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*16) // 2nd field luma from current frame (line 1,3); Y lands on every 2nd destination byte
+    }
+        $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (8)     r[pCF_U_OFFSET,  %1*32]<4>:ub       ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET+%2,1)<16;8,2> // 1st field U from current frame (line 0,2); U is read from the odd source bytes
+        mov (8)     r[pCF_U_OFFSET,  %1+1*32]<4>:ub     ubRESP(nDI_CURR_2ND_FIELD_CHROMA_OFFSET,%2*16+1)<16;8,2> // 2nd field U from current frame (line 1,3); U is read from the odd source bytes
+    }
+        $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (8)     r[pCF_V_OFFSET,  %1*32]<4>:ub       ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET+%2,0)<16;8,2> // 1st field V from current frame (line 0,2); V is read from the even source bytes
+        mov (8)     r[pCF_V_OFFSET,  %1+1*32]<4>:ub     ubRESP(nDI_CURR_2ND_FIELD_CHROMA_OFFSET,%2*16)<16;8,2> // 2nd field V from current frame (line 1,3); V is read from the even source bytes
+    }
+SAVE_DN_CURR:    // Common tail of both field orders: copy the packed block behind the message header and send it
+    $for(0; <nY_NUM_OF_ROWS/2; 1) {
+            mov (8) mudMSGHDR_DN(%1+1)<1>  udDN_YUV(%1)REGION(8,1)    // DN_YUV register %1 -> message register %1+1 (register 0 already holds the header)
+    }
+    send (8)    dNULLREG    mMSGHDR_DN   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPMW_MSG_LEN_PA_DN_DI+nBI_DESTINATION_YUV:ud    // Write message to nDATAPORT_WRITE starting at mMSGHDR_DN; destination operand is dNULLREG
+#endif
+// Save Processed frames
+#include "DI_Save_PA.asm"

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/PA_DN_ALG.asm
 ,0 → 1,54
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+#define DI_DISABLE    // Must be defined before DNDI.inc is included; presumably selects the denoise-only configuration - confirm in DNDI.inc
+#include "DNDI.inc"
+#undef  nY_NUM_OF_ROWS
+#define nY_NUM_OF_ROWS         8                                 // Number of Y rows per block (bounds the Y packing loop and the MRF copy loop below)
+#undef  nUV_NUM_OF_ROWS
+#define nUV_NUM_OF_ROWS        8                                 // Number of U/V rows per block (bounds the U/V packing loop below)
+#undef   nSMPL_RESP_LEN
+#define  nSMPL_RESP_LEN        nSMPL_RESP_LEN_DN_PA              // Set the Number of GRFs in DNDI response
+#undef   nDPW_BLOCK_SIZE_DN
+#define  nDPW_BLOCK_SIZE_DN    nBLOCK_WIDTH_32+nBLOCK_HEIGHT_8   // DN Curr Block Size for Write is 32x8 (loaded into rMSGSRC.2 below)
+#undef   nDPW_BLOCK_SIZE_HIST
+#define  nDPW_BLOCK_SIZE_HIST  nBLOCK_WIDTH_4+nBLOCK_HEIGHT_2    // HIST Block Size for Write is 4x2
+////////////////////////////////////// Run the DN Algorithm ///////////////////////////////////////
+#include "DNDI_COMMAND.asm"
+////////////////////////////////////// Save the History Data for Next Run /////////////////////////
+#include "DNDI_Hist_Save.asm"
+////////////////////////////////////// Pack and Save the DN Curr Frame for Next Run ///////////////
+add (4)     pCF_Y_OFFSET<1>:uw    ubDEST_CF_OFFSET<4;4,1>:ub    npDN_YUV:w    // Four words starting at pCF_Y_OFFSET = four ubDEST_CF_OFFSET bytes + npDN_YUV; used as the indirect bases r[pCF_*_OFFSET, ...] below
+$for (0; <nY_NUM_OF_ROWS; 1) {
+    mov (16)    r[pCF_Y_OFFSET,  %1*32]<2>:ub   ubRESP(nNODI_LUMA_OFFSET,%1*16)<16;16,1>       // copy line of Y: 16 contiguous source bytes land on every 2nd destination byte
+}
+$for (0; <nUV_NUM_OF_ROWS; 1) {
+    mov (8)     r[pCF_U_OFFSET,  %1*32]<4>:ub   ubRESP(nNODI_CHROMA_OFFSET,%1*16+1)<16;8,2>    // copy line of U: odd source bytes land on every 4th destination byte
+    mov (8)     r[pCF_V_OFFSET,  %1*32]<4>:ub   ubRESP(nNODI_CHROMA_OFFSET,%1*16)<16;8,2>      // copy line of V: even source bytes land on every 4th destination byte
+}
+shl (1)     rMSGSRC.0<1>:ud     wORIX<0;1,0>:w     1:w       // X origin * 2 (422 output)
+mov (1)     rMSGSRC.1<1>:ud     wORIY<0;1,0>:w               // Y origin
+mov (1)     rMSGSRC.2<1>:ud     nDPW_BLOCK_SIZE_DN:ud        // block width and height (32x8)
+mov (8)     mMSGHDR_DN<1>:ud    rMSGSRC<8;8,1>:ud            // message header
+$for(0; <nY_NUM_OF_ROWS; 2) {
+        mov (16) mudMSGHDR_DN(1+%1)<1>  udDN_YUV(%1)REGION(8,1)    // Move DN Curr to MRF: 16 dwords per mov, loop steps by 2 (presumably two registers per mov - confirm); message slot is offset by 1 to skip the header
+}
+send (8)    dNULLREG    mMSGHDR_DN   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPMW_MSG_LEN_PA_DN_NODI+nBI_DESTINATION_YUV:ud    // Write message to nDATAPORT_WRITE starting at mMSGHDR_DN; destination operand is dNULLREG

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/PA_Scaling.asm
 ,0 → 1,70
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+//---------- PA_Scaling.asm: sample nY_NUM_OF_ROWS rows of 16 pixels and store them as planar V, Y, U ----------
+#include "Scaling.inc"
+        // Build a 16-element ramp in float32: 0..7 in SAMPLER_RAMP(0), 8..15 in SAMPLER_RAMP(1)
+//      mov (8)         SAMPLER_RAMP(0)<1>              0x76543210:v
+//      add     (8)             SAMPLER_RAMP(1)<1>              SAMPLER_RAMP(0) 8.0:f
+mov (4) SAMPLER_RAMP(0)<1> 0x48403000:vf                //3, 2, 1, 0 in float vector (lowest byte is element 0)
+mov (4) SAMPLER_RAMP(0,4)<1> 0x5C585450:vf      //7, 6, 5, 4 in float vector (lowest byte is element 4)
+add     (8)             SAMPLER_RAMP(1)<1>              SAMPLER_RAMP(0) 8.0:f    // second half of the ramp: elements 0..7 plus 8.0
+//Module: PrepareScaleCoord.asm
+        // Setup for sampler msg hdr
+    mov (2)             rMSGSRC.0<1>:ud                 0:ud                                            { NoDDClr }     // Unused fields
+    mov (1)             rMSGSRC.2<1>:ud                 0:ud                                            { NoDDChk }     // Write and offset
+        // Calculate 16 v based on the step Y and vertical origin
+        mov     (16)    mfMSGPAYLOAD(2)<1>              fSRC_VID_V_ORI<0;1,0>:f    // all 16 lanes start at the vertical origin
+        mov     (16)    SCALE_COORD_Y<1>:f              fSRC_VID_V_ORI<0;1,0>:f    // running copy of v, advanced once per row inside the loop
+        // Calculate 16 u based on the step X and hori origin
+//      line (16)       mfMSGPAYLOAD(0)<1>              SCALE_STEP_X<0;1,0>:f           SAMPLER_RAMP(0)         // Assign to mrf directly
+        mov     (16)    acc0:f                                                  fSRC_VID_H_ORI<0;1,0>:f                                                                                 { Compr }    // acc = horizontal origin
+        mac     (16)    mfMSGPAYLOAD(0)<1>      fVIDEO_STEP_X<0;1,0>:f  SAMPLER_RAMP(0)                 { Compr }    // u[i] = acc + step X * ramp[i]
+        //Setup the constants for line instruction
+        mov     (1)             SCALE_LINE_P255<1>:f            255.0:f                         { NoDDClr }     //{ NoDDClr, NoDDChk }
+        mov     (1)             SCALE_LINE_P0_5<1>:f            0.5:f                           { NoDDChk }
+//------------------------------------------------------------------------------
+$for (0; <nY_NUM_OF_ROWS; 1) {
+        // Read 16 sampled pixels and store them in float32 in 8 GRFs in the order of BGRA (VYUA).
+  mov (8)       MSGHDR_SCALE.0:ud      rMSGSRC.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+        send (16)       SCALE_RESPONSE_YW(0)<1>         MSGHDR_SCALE    udDUMMY_NULL    nSMPL_ENGINE SMPLR_MSG_DSC+nSI_SRC_SIMD16_YUV+nBI_CURRENT_SRC_YUV
+        // Calculate 16 v for next line
+        add (16)        mfMSGPAYLOAD(2)<1>              SCALE_COORD_Y<8;8,1>:f          fVIDEO_STEP_Y<0;1,0>:f  // Assign to mrf directly
+        add (16)        SCALE_COORD_Y<1>:f              SCALE_COORD_Y<8;8,1>:f          fVIDEO_STEP_Y<0;1,0>:f  // Advance the running copy in SCALE_COORD_Y by the same step
+        // Scale back to [0, 255], convert f to ud
+        line (16)       acc0:f          SCALE_LINE_P255<0;1,0>:f        SCALE_RESPONSE_YF(0)    { Compr }                       // Process B, V
+        mov  (16) SCALE_RESPONSE_YD(0)<1>       acc0:f                                                                                                          { Compr }
+        line (16)       acc0:f          SCALE_LINE_P255<0;1,0>:f        SCALE_RESPONSE_YF(2)    { Compr }                       // Process G, Y
+        mov  (16) SCALE_RESPONSE_YD(2)<1>       acc0:f                                                                                                          { Compr }
+        line (16)       acc0:f          SCALE_LINE_P255<0;1,0>:f        SCALE_RESPONSE_YF(4)    { Compr }                       // Process R, U
+        mov  (16) SCALE_RESPONSE_YD(4)<1>       acc0:f                                                                                                          { Compr }
+        mov      (16)   DEST_V(%1)<1>                           SCALE_RESPONSE_YB(0)                                                                                    //possible error due to truncation - vK
+        mov      (16)   DEST_Y(%1)<1>                           SCALE_RESPONSE_YB(2)                                                                                    //possible error due to truncation - vK
+        mov      (16)   DEST_U(%1)<1>                           SCALE_RESPONSE_YB(4)                                                                                    //possible error due to truncation - vK
+}
+        #define nSRC_REGION                             nREGION_1
+//------------------------------------------------------------------------------

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/PL2_AVS_IEF_16x8.asm
 ,0 → 1,60
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+//---------- PL2_AVS_IEF_16x8.asm ----------
+#include "AVS_IEF.inc"
+//------------------------------------------------------------------------------
+// 2 sampler reads for 8x8 Y each
+// 2 sampler reads for 8x8 U and 8x8 V (NV11\P208 input surface)
+//------------------------------------------------------------------------------
+    // 1st 8x8 setup
+    #include "AVS_SetupFirstBlock.asm"
+    // 1st 8x8 Y sampling: enable green channel only (header dword 2 holds the channel mask; green carries Y)
+    mov (1) rAVS_8x8_HDR.2:ud      nAVS_GREEN_CHANNEL_ONLY:ud
+    mov (16) mAVS_8x8_HDR.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(0)<1>   mAVS_8x8_HDR   udDUMMY_NULL      nSMPL_ENGINE        nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y
+    // Return Y in 4 GRFs, starting at uwAVS_RESPONSE(0)
+    // 8x8 U and V sampling
+    // Enable red and blue channels (red carries U, blue carries V)
+    mov (1) rAVS_8x8_HDR.2:ud  nAVS_RED_BLUE_CHANNELS:ud
+    mov (16) mAVS_8x8_HDR_UV.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(4)<1> mAVS_8x8_HDR_UV   udDUMMY_NULL  nSMPL_ENGINE    nAVS_MSG_DSC_2CH+nSI_SRC_UV+nBI_CURRENT_SRC_UV
+    // Return U and V in 8 GRFs, starting at uwAVS_RESPONSE(4)
+    // 2nd 8x8 setup
+    #include "AVS_SetupSecondBlock.asm"
+    // 2nd 8x8 Y sampling
+    // Enable green channel only
+    mov (1) rAVS_8x8_HDR.2:ud      nAVS_GREEN_CHANNEL_ONLY:ud
+    mov (16) mAVS_8x8_HDR.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE_2(0)<1>    mAVS_8x8_HDR    udDUMMY_NULL    nSMPL_ENGINE    nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y
+    // 2nd 8x8 U and V sampling
+    // Enable red and blue channels
+    mov (1) rAVS_8x8_HDR.2:ud  nAVS_RED_BLUE_CHANNELS:ud
+    mov (16) mAVS_8x8_HDR_UV.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE_2(4)<1> mAVS_8x8_HDR_UV   udDUMMY_NULL  nSMPL_ENGINE    nAVS_MSG_DSC_2CH+nSI_SRC_UV+nBI_CURRENT_SRC_UV
+    // Return U and V in 8 GRFs, starting at uwAVS_RESPONSE_2(4)
+//------------------------------------------------------------------------------
+// Unpacking sampler reads to 4:4:4 internal planar
+//------------------------------------------------------------------------------
+    #include "PL2_AVS_IEF_Unpack_16x8.asm"

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/PL2_AVS_IEF_8x4.asm
 ,0 → 1,58
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+//---------- PL2_AVS_IEF_8x4.asm ----------
+#include "AVS_IEF.inc"
+//------------------------------------------------------------------------------
+// 2 sampler reads for 8x8 Y each
+// 1 sampler read for 8x8 U and 8x8 V (NV11\NV12 input surface)
+//------------------------------------------------------------------------------
+    // 1st 8x8 setup
+    #include "AVS_SetupFirstBlock.asm"
+    // 1st 8x8 Y sampling: enable green channel only (header dword 2 holds the channel mask; green carries Y)
+    mov (1) rAVS_8x8_HDR.2:ud      nAVS_GREEN_CHANNEL_ONLY:ud
+    mov (16) mAVS_8x8_HDR.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(0)<1>   mAVS_8x8_HDR   udDUMMY_NULL      nSMPL_ENGINE        nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y
+    // Return Y in 4 GRFs, starting at uwAVS_RESPONSE(0)
+    // 8x8 U and V sampling
+    // Enable red and blue channels (red carries U, blue carries V)
+    //Only 8x4 will be used
+    mov (1) rAVS_8x8_HDR.2:ud  nAVS_RED_BLUE_CHANNELS:ud
+    // Calculate Chroma Step Size:
+    // for H direction: 16 Luma samples are covered by 8 Chroma samples. Thus Chroma_Step_X = 2 * Luma_Step_X
+    // for V direction: 8  Luma samples are covered by 8 Chroma samples. Thus Chroma_Step_Y = Luma_Step_Y
+    mul  (1)  rAVS_PAYLOAD.1:f      fVIDEO_STEP_X:f    2.0:f             // Step X for chroma. NOTE(review): this overwrites the payload mirror; the 2nd Y read below presumably relies on AVS_SetupSecondBlock.asm to restore the luma step - confirm
+    mov (16) mAVS_8x8_HDR_UV.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(4)<1> mAVS_8x8_HDR_UV   udDUMMY_NULL  nSMPL_ENGINE    nAVS_MSG_DSC_2CH+nSI_SRC_UV+nBI_CURRENT_SRC_UV
+    // Return U and V in 8 GRFs, starting at uwAVS_RESPONSE(4)
+    // 2nd 8x8 setup
+    #include "AVS_SetupSecondBlock.asm"
+    // 2nd 8x8 Y sampling
+    // Enable green channel only
+    mov (1) rAVS_8x8_HDR.2:ud      nAVS_GREEN_CHANNEL_ONLY:ud
+    mov (16) mAVS_8x8_HDR.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE_2(0)<1>    mAVS_8x8_HDR    udDUMMY_NULL    nSMPL_ENGINE    nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y
+//------------------------------------------------------------------------------
+// Unpacking sampler reads to 4:2:0 internal planar
+//------------------------------------------------------------------------------
+    #include "PL2_AVS_IEF_Unpack_8x4.asm"

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/PL2_AVS_IEF_8x8.asm
 ,0 → 1,57
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+//---------- PL2_AVS_IEF_8x8.asm ----------
+#include "AVS_IEF.inc"
+//------------------------------------------------------------------------------
+// 2 sampler reads for 8x8 Y each
+// 1 sampler read for 8x8 U and 8x8 V (NV11\NV12 input surface)
+//------------------------------------------------------------------------------
+    // 1st 8x8 setup
+    #include "AVS_SetupFirstBlock.asm"
+    // 1st 8x8 Y sampling: enable green channel only (header dword 2 holds the channel mask; green carries Y)
+    mov (1) rAVS_8x8_HDR.2:ud      nAVS_GREEN_CHANNEL_ONLY:ud
+    mov (16) mAVS_8x8_HDR.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(0)<1>   mAVS_8x8_HDR   udDUMMY_NULL      nSMPL_ENGINE        nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y
+    // Return Y in 4 GRFs, starting at uwAVS_RESPONSE(0)
+    // 8x8 U and V sampling
+    // Enable red and blue channels (red carries U, blue carries V)
+    mov (1) rAVS_8x8_HDR.2:ud  nAVS_RED_BLUE_CHANNELS:ud
+    // Calculate Chroma Step Size:
+    // for H direction: 16 Luma samples are covered by 8 Chroma samples. Thus Chroma_Step_X = 2 * Luma_Step_X
+    // for V direction: 8  Luma samples are covered by 8 Chroma samples. Thus Chroma_Step_Y = Luma_Step_Y
+    mul  (1)  rAVS_PAYLOAD.1:f      fVIDEO_STEP_X:f    2.0:f             // Step X for chroma. NOTE(review): this overwrites the payload mirror; the 2nd Y read below presumably relies on AVS_SetupSecondBlock.asm to restore the luma step - confirm
+    mov (16) mAVS_8x8_HDR_UV.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(4)<1> mAVS_8x8_HDR_UV   udDUMMY_NULL  nSMPL_ENGINE    nAVS_MSG_DSC_2CH+nSI_SRC_UV+nBI_CURRENT_SRC_UV
+    // Return U and V in 8 GRFs, starting at uwAVS_RESPONSE(4)
+    // 2nd 8x8 setup
+    #include "AVS_SetupSecondBlock.asm"
+    // 2nd 8x8 Y sampling
+    // Enable green channel only
+    mov (1) rAVS_8x8_HDR.2:ud      nAVS_GREEN_CHANNEL_ONLY:ud
+    mov (16) mAVS_8x8_HDR.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE_2(0)<1>    mAVS_8x8_HDR    udDUMMY_NULL    nSMPL_ENGINE    nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y
+//------------------------------------------------------------------------------
+// Unpacking sampler reads to 4:2:2 internal planar
+//------------------------------------------------------------------------------
+    #include "PL2_AVS_IEF_Unpack_8x8.asm"

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/PL2_AVS_IEF_Unpack_16x8.asm
 ,0 → 1,271
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+//---------- PL2_AVS_IEF_Unpack_16x8.asm ----------
+#ifdef AVS_OUTPUT_16_BIT        //Output is packed in AVYU format
+// Move first 8x8 words of Y to dest GRF (as packed): Y fills word 1 of every 4-word pixel; each output row uses 4 dest GRFs, this block fills the first two
+    mov (4) uwDEST_Y(0,1)<4>       uwAVS_RESPONSE(0,0)<4;4,1>
+    mov (4) uwDEST_Y(1,1)<4>       uwAVS_RESPONSE(0,8)<4;4,1>
+    mov (4) uwDEST_Y(4,1)<4>       uwAVS_RESPONSE(0,4)<4;4,1>
+    mov (4) uwDEST_Y(5,1)<4>       uwAVS_RESPONSE(0,12)<4;4,1>
+    mov (4) uwDEST_Y(8,1)<4>       uwAVS_RESPONSE(1,0)<4;4,1>
+    mov (4) uwDEST_Y(9,1)<4>       uwAVS_RESPONSE(1,8)<4;4,1>
+    mov (4) uwDEST_Y(12,1)<4>      uwAVS_RESPONSE(1,4)<4;4,1>
+    mov (4) uwDEST_Y(13,1)<4>      uwAVS_RESPONSE(1,12)<4;4,1>
+    mov (4) uwDEST_Y(16,1)<4>      uwAVS_RESPONSE(2,0)<4;4,1>
+    mov (4) uwDEST_Y(17,1)<4>      uwAVS_RESPONSE(2,8)<4;4,1>
+    mov (4) uwDEST_Y(20,1)<4>      uwAVS_RESPONSE(2,4)<4;4,1>
+    mov (4) uwDEST_Y(21,1)<4>      uwAVS_RESPONSE(2,12)<4;4,1>
+    mov (4) uwDEST_Y(24,1)<4>      uwAVS_RESPONSE(3,0)<4;4,1>
+    mov (4) uwDEST_Y(25,1)<4>      uwAVS_RESPONSE(3,8)<4;4,1>
+    mov (4) uwDEST_Y(28,1)<4>      uwAVS_RESPONSE(3,4)<4;4,1>
+    mov (4) uwDEST_Y(29,1)<4>      uwAVS_RESPONSE(3,12)<4;4,1>
+// Move first 8x8 words of U to dest GRF (as packed): U fills word 0 of every 4-word pixel; source is response registers 4,5,8,9
+    mov (4) uwDEST_Y(0,0)<4>       uwAVS_RESPONSE(4,0)<4;4,1>
+    mov (4) uwDEST_Y(1,0)<4>       uwAVS_RESPONSE(4,8)<4;4,1>
+    mov (4) uwDEST_Y(4,0)<4>       uwAVS_RESPONSE(4,4)<4;4,1>
+    mov (4) uwDEST_Y(5,0)<4>       uwAVS_RESPONSE(4,12)<4;4,1>
+    mov (4) uwDEST_Y(8,0)<4>       uwAVS_RESPONSE(5,0)<4;4,1>
+    mov (4) uwDEST_Y(9,0)<4>       uwAVS_RESPONSE(5,8)<4;4,1>
+    mov (4) uwDEST_Y(12,0)<4>      uwAVS_RESPONSE(5,4)<4;4,1>
+    mov (4) uwDEST_Y(13,0)<4>      uwAVS_RESPONSE(5,12)<4;4,1>
+    mov (4) uwDEST_Y(16,0)<4>      uwAVS_RESPONSE(8,0)<4;4,1>
+    mov (4) uwDEST_Y(17,0)<4>      uwAVS_RESPONSE(8,8)<4;4,1>
+    mov (4) uwDEST_Y(20,0)<4>      uwAVS_RESPONSE(8,4)<4;4,1>
+    mov (4) uwDEST_Y(21,0)<4>      uwAVS_RESPONSE(8,12)<4;4,1>
+    mov (4) uwDEST_Y(24,0)<4>      uwAVS_RESPONSE(9,0)<4;4,1>
+    mov (4) uwDEST_Y(25,0)<4>      uwAVS_RESPONSE(9,8)<4;4,1>
+    mov (4) uwDEST_Y(28,0)<4>      uwAVS_RESPONSE(9,4)<4;4,1>
+    mov (4) uwDEST_Y(29,0)<4>      uwAVS_RESPONSE(9,12)<4;4,1>
+// Move first 8x8 words of V to dest GRF (as packed): V fills word 2 of every 4-word pixel; source is response registers 6,7,10,11
+    mov (4) uwDEST_Y(0,2)<4>       uwAVS_RESPONSE(6,0)<4;4,1>
+    mov (4) uwDEST_Y(1,2)<4>       uwAVS_RESPONSE(6,8)<4;4,1>
+    mov (4) uwDEST_Y(4,2)<4>       uwAVS_RESPONSE(6,4)<4;4,1>
+    mov (4) uwDEST_Y(5,2)<4>       uwAVS_RESPONSE(6,12)<4;4,1>
+    mov (4) uwDEST_Y(8,2)<4>       uwAVS_RESPONSE(7,0)<4;4,1>
+    mov (4) uwDEST_Y(9,2)<4>       uwAVS_RESPONSE(7,8)<4;4,1>
+    mov (4) uwDEST_Y(12,2)<4>      uwAVS_RESPONSE(7,4)<4;4,1>
+    mov (4) uwDEST_Y(13,2)<4>      uwAVS_RESPONSE(7,12)<4;4,1>
+    mov (4) uwDEST_Y(16,2)<4>      uwAVS_RESPONSE(10,0)<4;4,1>
+    mov (4) uwDEST_Y(17,2)<4>      uwAVS_RESPONSE(10,8)<4;4,1>
+    mov (4) uwDEST_Y(20,2)<4>      uwAVS_RESPONSE(10,4)<4;4,1>
+    mov (4) uwDEST_Y(21,2)<4>      uwAVS_RESPONSE(10,12)<4;4,1>
+    mov (4) uwDEST_Y(24,2)<4>      uwAVS_RESPONSE(11,0)<4;4,1>
+    mov (4) uwDEST_Y(25,2)<4>      uwAVS_RESPONSE(11,8)<4;4,1>
+    mov (4) uwDEST_Y(28,2)<4>      uwAVS_RESPONSE(11,4)<4;4,1>
+    mov (4) uwDEST_Y(29,2)<4>      uwAVS_RESPONSE(11,12)<4;4,1>
+// Move first 8x8 words of A to dest GRF (as packed): A is word 3 of every 4-word pixel and is filled with zero
+    mov (4) uwDEST_Y(0,3)<4>       0:uw
+    mov (4) uwDEST_Y(1,3)<4>       0:uw
+    mov (4) uwDEST_Y(4,3)<4>       0:uw
+    mov (4) uwDEST_Y(5,3)<4>       0:uw
+    mov (4) uwDEST_Y(8,3)<4>       0:uw
+    mov (4) uwDEST_Y(9,3)<4>       0:uw
+    mov (4) uwDEST_Y(12,3)<4>      0:uw
+    mov (4) uwDEST_Y(13,3)<4>      0:uw
+    mov (4) uwDEST_Y(16,3)<4>      0:uw
+    mov (4) uwDEST_Y(17,3)<4>      0:uw
+    mov (4) uwDEST_Y(20,3)<4>      0:uw
+    mov (4) uwDEST_Y(21,3)<4>      0:uw
+    mov (4) uwDEST_Y(24,3)<4>      0:uw
+    mov (4) uwDEST_Y(25,3)<4>      0:uw
+    mov (4) uwDEST_Y(28,3)<4>      0:uw
+    mov (4) uwDEST_Y(29,3)<4>      0:uw
+// Move second 8x8 words of Y to dest GRF (as packed): Y fills word 1 of every 4-word pixel; this block fills the last two of the 4 dest GRFs per output row
+    mov (4) uwDEST_Y(2,1)<4>       uwAVS_RESPONSE_2(0,0)<4;4,1>
+    mov (4) uwDEST_Y(3,1)<4>       uwAVS_RESPONSE_2(0,8)<4;4,1>
+    mov (4) uwDEST_Y(6,1)<4>       uwAVS_RESPONSE_2(0,4)<4;4,1>
+    mov (4) uwDEST_Y(7,1)<4>       uwAVS_RESPONSE_2(0,12)<4;4,1>
+    mov (4) uwDEST_Y(10,1)<4>      uwAVS_RESPONSE_2(1,0)<4;4,1>
+    mov (4) uwDEST_Y(11,1)<4>      uwAVS_RESPONSE_2(1,8)<4;4,1>
+    mov (4) uwDEST_Y(14,1)<4>      uwAVS_RESPONSE_2(1,4)<4;4,1>
+    mov (4) uwDEST_Y(15,1)<4>      uwAVS_RESPONSE_2(1,12)<4;4,1>
+    mov (4) uwDEST_Y(18,1)<4>      uwAVS_RESPONSE_2(2,0)<4;4,1>
+    mov (4) uwDEST_Y(19,1)<4>      uwAVS_RESPONSE_2(2,8)<4;4,1>
+    mov (4) uwDEST_Y(22,1)<4>      uwAVS_RESPONSE_2(2,4)<4;4,1>
+    mov (4) uwDEST_Y(23,1)<4>      uwAVS_RESPONSE_2(2,12)<4;4,1>
+    mov (4) uwDEST_Y(26,1)<4>      uwAVS_RESPONSE_2(3,0)<4;4,1>
+    mov (4) uwDEST_Y(27,1)<4>      uwAVS_RESPONSE_2(3,8)<4;4,1>
+    mov (4) uwDEST_Y(30,1)<4>      uwAVS_RESPONSE_2(3,4)<4;4,1>
+    mov (4) uwDEST_Y(31,1)<4>      uwAVS_RESPONSE_2(3,12)<4;4,1>
+// Move second 8x8 words of U to dest GRF (as packed): U fills word 0 of every 4-word pixel; source is response registers 4,5,8,9
+    mov (4) uwDEST_Y(2,0)<4>       uwAVS_RESPONSE_2(4,0)<4;4,1>
+    mov (4) uwDEST_Y(3,0)<4>       uwAVS_RESPONSE_2(4,8)<4;4,1>
+    mov (4) uwDEST_Y(6,0)<4>       uwAVS_RESPONSE_2(4,4)<4;4,1>
+    mov (4) uwDEST_Y(7,0)<4>       uwAVS_RESPONSE_2(4,12)<4;4,1>
+    mov (4) uwDEST_Y(10,0)<4>      uwAVS_RESPONSE_2(5,0)<4;4,1>
+    mov (4) uwDEST_Y(11,0)<4>      uwAVS_RESPONSE_2(5,8)<4;4,1>
+    mov (4) uwDEST_Y(14,0)<4>      uwAVS_RESPONSE_2(5,4)<4;4,1>
+    mov (4) uwDEST_Y(15,0)<4>      uwAVS_RESPONSE_2(5,12)<4;4,1>
+    mov (4) uwDEST_Y(18,0)<4>      uwAVS_RESPONSE_2(8,0)<4;4,1>
+    mov (4) uwDEST_Y(19,0)<4>      uwAVS_RESPONSE_2(8,8)<4;4,1>
+    mov (4) uwDEST_Y(22,0)<4>      uwAVS_RESPONSE_2(8,4)<4;4,1>
+    mov (4) uwDEST_Y(23,0)<4>      uwAVS_RESPONSE_2(8,12)<4;4,1>
+    mov (4) uwDEST_Y(26,0)<4>      uwAVS_RESPONSE_2(9,0)<4;4,1>
+    mov (4) uwDEST_Y(27,0)<4>      uwAVS_RESPONSE_2(9,8)<4;4,1>
+    mov (4) uwDEST_Y(30,0)<4>      uwAVS_RESPONSE_2(9,4)<4;4,1>
+    mov (4) uwDEST_Y(31,0)<4>      uwAVS_RESPONSE_2(9,12)<4;4,1>
+// Move second 8x8 words of V to dest GRF (as packed): V fills word 2 of every 4-word pixel; source is response registers 6,7,10,11
+    mov (4) uwDEST_Y(2,2)<4>       uwAVS_RESPONSE_2(6,0)<4;4,1>
+    mov (4) uwDEST_Y(3,2)<4>       uwAVS_RESPONSE_2(6,8)<4;4,1>
+    mov (4) uwDEST_Y(6,2)<4>       uwAVS_RESPONSE_2(6,4)<4;4,1>
+    mov (4) uwDEST_Y(7,2)<4>       uwAVS_RESPONSE_2(6,12)<4;4,1>
+    mov (4) uwDEST_Y(10,2)<4>      uwAVS_RESPONSE_2(7,0)<4;4,1>
+    mov (4) uwDEST_Y(11,2)<4>      uwAVS_RESPONSE_2(7,8)<4;4,1>
+    mov (4) uwDEST_Y(14,2)<4>      uwAVS_RESPONSE_2(7,4)<4;4,1>
+    mov (4) uwDEST_Y(15,2)<4>      uwAVS_RESPONSE_2(7,12)<4;4,1>
+    mov (4) uwDEST_Y(18,2)<4>      uwAVS_RESPONSE_2(10,0)<4;4,1>
+    mov (4) uwDEST_Y(19,2)<4>      uwAVS_RESPONSE_2(10,8)<4;4,1>
+    mov (4) uwDEST_Y(22,2)<4>      uwAVS_RESPONSE_2(10,4)<4;4,1>
+    mov (4) uwDEST_Y(23,2)<4>      uwAVS_RESPONSE_2(10,12)<4;4,1>
+    mov (4) uwDEST_Y(26,2)<4>      uwAVS_RESPONSE_2(11,0)<4;4,1>
+    mov (4) uwDEST_Y(27,2)<4>      uwAVS_RESPONSE_2(11,8)<4;4,1>
+    mov (4) uwDEST_Y(30,2)<4>      uwAVS_RESPONSE_2(11,4)<4;4,1>
+    mov (4) uwDEST_Y(31,2)<4>      uwAVS_RESPONSE_2(11,12)<4;4,1>
+// Move second 8x8 words of A to dest GRF (as packed): A is word 3 of every 4-word pixel and is filled with zero
+    mov (4) uwDEST_Y(2,3)<4>       0:uw
+    mov (4) uwDEST_Y(3,3)<4>       0:uw
+    mov (4) uwDEST_Y(6,3)<4>       0:uw
+    mov (4) uwDEST_Y(7,3)<4>       0:uw
+    mov (4) uwDEST_Y(10,3)<4>      0:uw
+    mov (4) uwDEST_Y(11,3)<4>      0:uw
+    mov (4) uwDEST_Y(14,3)<4>      0:uw
+    mov (4) uwDEST_Y(15,3)<4>      0:uw
+    mov (4) uwDEST_Y(18,3)<4>      0:uw
+    mov (4) uwDEST_Y(19,3)<4>      0:uw
+    mov (4) uwDEST_Y(22,3)<4>      0:uw
+    mov (4) uwDEST_Y(23,3)<4>      0:uw
+    mov (4) uwDEST_Y(26,3)<4>      0:uw
+    mov (4) uwDEST_Y(27,3)<4>      0:uw
+    mov (4) uwDEST_Y(30,3)<4>      0:uw
+    mov (4) uwDEST_Y(31,3)<4>      0:uw
+/*      This section will be used if 16-bit output is needed in planar format -vK
+     // Move 1st 8x8 words of Y to dest GRF at lower 8 words of each GRF.
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_Y(%1*2)<1>        uwAVS_RESPONSE(%1,0)<8;4,1>
+        mov (8) uwDEST_Y(%1*2+1)<1>      uwAVS_RESPONSE(%1,8)<8;4,1>
+    }
+    // Move 1st 8x8 words of U to dest GRF  (Copy high byte in a word)
+    mov (8) uwDEST_U(0)<1>           uwAVS_RESPONSE(4,0)<8;4,1>
+    mov (8) uwDEST_U(1)<1>           uwAVS_RESPONSE(4,8)<8;4,1>
+    mov (8) uwDEST_U(2)<1>           uwAVS_RESPONSE(5,0)<8;4,1>
+    mov (8) uwDEST_U(3)<1>           uwAVS_RESPONSE(5,8)<8;4,1>
+    mov (8) uwDEST_U(4)<1>           uwAVS_RESPONSE(8,0)<8;4,1>
+    mov (8) uwDEST_U(5)<1>           uwAVS_RESPONSE(8,8)<8;4,1>
+    mov (8) uwDEST_U(6)<1>           uwAVS_RESPONSE(9,0)<8;4,1>
+    mov (8) uwDEST_U(7)<1>           uwAVS_RESPONSE(9,8)<8;4,1>
+    // Move 1st 8x8 words of V to dest GRF
+    mov (8) uwDEST_V(0)<1>           uwAVS_RESPONSE(6,0)<8;4,1>
+    mov (8) uwDEST_V(1)<1>           uwAVS_RESPONSE(6,8)<8;4,1>
+    mov (8) uwDEST_V(2)<1>           uwAVS_RESPONSE(7,0)<8;4,1>
+    mov (8) uwDEST_V(3)<1>           uwAVS_RESPONSE(7,8)<8;4,1>
+    mov (8) uwDEST_V(4)<1>           uwAVS_RESPONSE(10,0)<8;4,1>
+    mov (8) uwDEST_V(5)<1>           uwAVS_RESPONSE(10,8)<8;4,1>
+    mov (8) uwDEST_V(6)<1>           uwAVS_RESPONSE(11,0)<8;4,1>
+    mov (8) uwDEST_V(7)<1>           uwAVS_RESPONSE(11,8)<8;4,1>
+    // Move 2nd 8x8 words of Y to dest GRF at higher 8 words of each GRF.
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_Y(%1*2,8)<1>      uwAVS_RESPONSE_2(%1,0)<8;4,1>
+        mov (8) uwDEST_Y(%1*2+1,8)<1>    uwAVS_RESPONSE_2(%1,8)<8;4,1>
+    }
+    // Move 2nd 8x8 words of U to dest GRF  (Copy high byte in a word)
+    mov (8) uwDEST_U(0,8)<1>         uwAVS_RESPONSE_2(4,0)<8;4,1>
+    mov (8) uwDEST_U(1,8)<1>         uwAVS_RESPONSE_2(4,8)<8;4,1>
+    mov (8) uwDEST_U(2,8)<1>         uwAVS_RESPONSE_2(5,0)<8;4,1>
+    mov (8) uwDEST_U(3,8)<1>         uwAVS_RESPONSE_2(5,8)<8;4,1>
+    mov (8) uwDEST_U(4,8)<1>         uwAVS_RESPONSE_2(8,0)<8;4,1>
+    mov (8) uwDEST_U(5,8)<1>         uwAVS_RESPONSE_2(8,8)<8;4,1>
+    mov (8) uwDEST_U(6,8)<1>         uwAVS_RESPONSE_2(9,0)<8;4,1>
+    mov (8) uwDEST_U(7,8)<1>         uwAVS_RESPONSE_2(9,8)<8;4,1>
+    // Move 2nd 8x8 words of V to dest GRF
+    mov (8) uwDEST_V(0,8)<1>         uwAVS_RESPONSE_2(6,0)<8;4,1>
+    mov (8) uwDEST_V(1,8)<1>         uwAVS_RESPONSE_2(6,8)<8;4,1>
+    mov (8) uwDEST_V(2,8)<1>         uwAVS_RESPONSE_2(7,0)<8;4,1>
+    mov (8) uwDEST_V(3,8)<1>         uwAVS_RESPONSE_2(7,8)<8;4,1>
+    mov (8) uwDEST_V(4,8)<1>         uwAVS_RESPONSE_2(10,0)<8;4,1>
+    mov (8) uwDEST_V(5,8)<1>         uwAVS_RESPONSE_2(10,8)<8;4,1>
+    mov (8) uwDEST_V(6,8)<1>         uwAVS_RESPONSE_2(11,0)<8;4,1>
+    mov (8) uwDEST_V(7,8)<1>         uwAVS_RESPONSE_2(11,8)<8;4,1>
+*/
+#else
+    // Move 1st 8x8 words of Y to dest GRF at lower 8 words of each GRF.
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_Y(%1*2)<1>        ubAVS_RESPONSE(%1,1)<16;4,2>      // Copy high byte in a word: even row, from response words 0-3 and 8-11
+        mov (8) uwDEST_Y(%1*2+1)<1>      ubAVS_RESPONSE(%1,8+1)<16;4,2>    // Copy high byte in a word: odd row, from response words 4-7 and 12-15
+    }
+    // Move 1st 8x8 words of U to dest GRF  (Copy high byte in a word); source is response registers 4,5,8,9
+    mov (8) uwDEST_U(0)<1>           ubAVS_RESPONSE(4,1)<16;4,2>
+    mov (8) uwDEST_U(1)<1>           ubAVS_RESPONSE(4,8+1)<16;4,2>
+    mov (8) uwDEST_U(2)<1>           ubAVS_RESPONSE(5,1)<16;4,2>
+    mov (8) uwDEST_U(3)<1>           ubAVS_RESPONSE(5,8+1)<16;4,2>
+    mov (8) uwDEST_U(4)<1>           ubAVS_RESPONSE(8,1)<16;4,2>
+    mov (8) uwDEST_U(5)<1>           ubAVS_RESPONSE(8,8+1)<16;4,2>
+    mov (8) uwDEST_U(6)<1>           ubAVS_RESPONSE(9,1)<16;4,2>
+    mov (8) uwDEST_U(7)<1>           ubAVS_RESPONSE(9,8+1)<16;4,2>
+    // Move 1st 8x8 words of V to dest GRF (Copy high byte in a word); source is response registers 6,7,10,11
+    mov (8) uwDEST_V(0)<1>           ubAVS_RESPONSE(6,1)<16;4,2>
+    mov (8) uwDEST_V(1)<1>           ubAVS_RESPONSE(6,8+1)<16;4,2>
+    mov (8) uwDEST_V(2)<1>           ubAVS_RESPONSE(7,1)<16;4,2>
+    mov (8) uwDEST_V(3)<1>           ubAVS_RESPONSE(7,8+1)<16;4,2>
+    mov (8) uwDEST_V(4)<1>           ubAVS_RESPONSE(10,1)<16;4,2>
+    mov (8) uwDEST_V(5)<1>           ubAVS_RESPONSE(10,8+1)<16;4,2>
+    mov (8) uwDEST_V(6)<1>           ubAVS_RESPONSE(11,1)<16;4,2>
+    mov (8) uwDEST_V(7)<1>           ubAVS_RESPONSE(11,8+1)<16;4,2>
+    // Move 2nd 8x8 words of Y to dest GRF at higher 8 words of each GRF.
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_Y(%1*2,8)<1>      ubAVS_RESPONSE_2(%1,1)<16;4,2>    // Copy high byte in a word: even row, from response words 0-3 and 8-11
+        mov (8) uwDEST_Y(%1*2+1,8)<1>    ubAVS_RESPONSE_2(%1,8+1)<16;4,2>  // Copy high byte in a word: odd row, from response words 4-7 and 12-15
+    }
+    // Move 2nd 8x8 words of U to dest GRF  (Copy high byte in a word); written to the upper 8 words of each dest register
+    mov (8) uwDEST_U(0,8)<1>         ubAVS_RESPONSE_2(4,1)<16;4,2>
+    mov (8) uwDEST_U(1,8)<1>         ubAVS_RESPONSE_2(4,8+1)<16;4,2>
+    mov (8) uwDEST_U(2,8)<1>         ubAVS_RESPONSE_2(5,1)<16;4,2>
+    mov (8) uwDEST_U(3,8)<1>         ubAVS_RESPONSE_2(5,8+1)<16;4,2>
+    mov (8) uwDEST_U(4,8)<1>         ubAVS_RESPONSE_2(8,1)<16;4,2>
+    mov (8) uwDEST_U(5,8)<1>         ubAVS_RESPONSE_2(8,8+1)<16;4,2>
+    mov (8) uwDEST_U(6,8)<1>         ubAVS_RESPONSE_2(9,1)<16;4,2>
+    mov (8) uwDEST_U(7,8)<1>         ubAVS_RESPONSE_2(9,8+1)<16;4,2>
+    // Move 2nd 8x8 words of V to dest GRF (Copy high byte in a word); written to the upper 8 words of each dest register
+    mov (8) uwDEST_V(0,8)<1>         ubAVS_RESPONSE_2(6,1)<16;4,2>
+    mov (8) uwDEST_V(1,8)<1>         ubAVS_RESPONSE_2(6,8+1)<16;4,2>
+    mov (8) uwDEST_V(2,8)<1>         ubAVS_RESPONSE_2(7,1)<16;4,2>
+    mov (8) uwDEST_V(3,8)<1>         ubAVS_RESPONSE_2(7,8+1)<16;4,2>
+    mov (8) uwDEST_V(4,8)<1>         ubAVS_RESPONSE_2(10,1)<16;4,2>
+    mov (8) uwDEST_V(5,8)<1>         ubAVS_RESPONSE_2(10,8+1)<16;4,2>
+    mov (8) uwDEST_V(6,8)<1>         ubAVS_RESPONSE_2(11,1)<16;4,2>
+    mov (8) uwDEST_V(7,8)<1>         ubAVS_RESPONSE_2(11,8+1)<16;4,2>
+#endif
+       // Re-define new # of lines
+       #undef nUV_NUM_OF_ROWS
+       #undef nY_NUM_OF_ROWS
+       #define nY_NUM_OF_ROWS      8
+       #define nUV_NUM_OF_ROWS     8

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/PL2_AVS_IEF_Unpack_8x4.asm
 ,0 → 1,45
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+/*---------- PL2_AVS_IEF_Unpack_8x4.asm ----------
+ * Unpacks AVS sampler responses into the word-typed DEST_Y / DEST_U / DEST_V
+ * registers. Every mov reads byte-typed source data at an odd byte offset
+ * (1 or 8+1) with region <16;4,2> and writes 8 words, so the high byte of
+ * each selected response word ends up in its own destination word.
+ *   Y  : ubAVS_RESPONSE(0..3)   -> low  8 words of DEST_Y(0..7)
+ *        ubAVS_RESPONSE_2(0..3) -> high 8 words of DEST_Y(0..7)
+ *   U  : offset-1 data of ubAVS_RESPONSE(4), (5), (8), (9)   -> DEST_U(0..1)
+ *   V  : offset-1 data of ubAVS_RESPONSE(6), (7), (10), (11) -> DEST_V(0..1)
+ * The offset-(8+1) data of the chroma responses is not copied here, which
+ * gives the 8x4 chroma block named in the comments below.
+ * On exit nY_NUM_OF_ROWS is redefined to 8 and nUV_NUM_OF_ROWS to 4.
+ * NOTE(review): the DEST and AVS_RESPONSE macros are defined outside this
+ * file; the (register, sub-register offset) reading of their arguments is
+ * assumed - confirm against the .inc files.
+ */
+    // Move first 8x8 words of Y to dest GRF at lower 8 words of each GRF.
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_Y(%1*2)<1>        ubAVS_RESPONSE(%1,1)<16;4,2>      // Copy high byte in a word
+        mov (8) uwDEST_Y(%1*2+1)<1>      ubAVS_RESPONSE(%1,8+1)<16;4,2>    // Copy high byte in a word
+    }
+    // Move 8x4 words of U to dest GRF  (Copy high byte in a word); only the offset-1 data of each response GRF is used
+    mov (8) uwDEST_U(0)<1>            ubAVS_RESPONSE(4,1)<16;4,2>
+    mov (8) uwDEST_U(0,8)<1>          ubAVS_RESPONSE(5,1)<16;4,2>
+    mov (8) uwDEST_U(1)<1>            ubAVS_RESPONSE(8,1)<16;4,2>
+    mov (8) uwDEST_U(1,8)<1>          ubAVS_RESPONSE(9,1)<16;4,2>
+    // Move 8x4 words of V to dest GRF (same pattern as U, from response GRFs 6, 7, 10, 11)
+    mov (8) uwDEST_V(0)<1>            ubAVS_RESPONSE(6,1)<16;4,2>
+    mov (8) uwDEST_V(0,8)<1>          ubAVS_RESPONSE(7,1)<16;4,2>
+    mov (8) uwDEST_V(1)<1>            ubAVS_RESPONSE(10,1)<16;4,2>
+    mov (8) uwDEST_V(1,8)<1>          ubAVS_RESPONSE(11,1)<16;4,2>
+    // Move 2nd 8x8 words of Y to dest GRF at higher 8 words of each GRF.
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_Y(%1*2,8)<1>      ubAVS_RESPONSE_2(%1,1)<16;4,2>    // Copy high byte in a word
+        mov (8) uwDEST_Y(%1*2+1,8)<1>    ubAVS_RESPONSE_2(%1,8+1)<16;4,2>  // Copy high byte in a word
+    }
+//------------------------------------------------------------------------------
+    // Re-define new # of lines: 8 rows of Y, 4 rows of U/V
+    #undef nUV_NUM_OF_ROWS
+    #undef nY_NUM_OF_ROWS
+    #define nY_NUM_OF_ROWS      8
+    #define nUV_NUM_OF_ROWS     4

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/PL2_AVS_IEF_Unpack_8x8.asm
 ,0 → 1,53
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+/*---------- PL2_AVS_IEF_Unpack_8x8.asm ----------
+ * Unpacks AVS sampler responses into the word-typed DEST_Y / DEST_U / DEST_V
+ * registers. Every mov reads byte-typed source data at an odd byte offset
+ * (1 or 8+1) with region <16;4,2> and writes 8 words, so the high byte of
+ * each selected response word ends up in its own destination word.
+ *   Y  : ubAVS_RESPONSE(0..3)   -> low  8 words of DEST_Y(0..7)
+ *        ubAVS_RESPONSE_2(0..3) -> high 8 words of DEST_Y(0..7)
+ *   U  : ubAVS_RESPONSE(4), (5), (8), (9)   -> DEST_U(0..3)
+ *   V  : ubAVS_RESPONSE(6), (7), (10), (11) -> DEST_V(0..3)
+ * For chroma both the offset-1 and the offset-(8+1) data of each response GRF
+ * are copied, into the low and high 8 words of one destination register.
+ * On exit nY_NUM_OF_ROWS and nUV_NUM_OF_ROWS are both redefined to 8.
+ * NOTE(review): the DEST and AVS_RESPONSE macros are defined outside this
+ * file; the (register, sub-register offset) reading of their arguments is
+ * assumed - confirm against the .inc files.
+ */
+    // Move first 8x8 words of Y to dest GRF at lower 8 words of each GRF.
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_Y(%1*2)<1>        ubAVS_RESPONSE(%1,1)<16;4,2>      // Copy high byte in a word
+        mov (8) uwDEST_Y(%1*2+1)<1>      ubAVS_RESPONSE(%1,8+1)<16;4,2>    // Copy high byte in a word
+    }
+    // Move 8x8 words of U to dest GRF  (Copy high byte in a word); two 8-word groups per dest GRF
+    mov (8) uwDEST_U(0)<1>            ubAVS_RESPONSE(4,1)<16;4,2>
+    mov (8) uwDEST_U(0,8)<1>          ubAVS_RESPONSE(4,8+1)<16;4,2>
+    mov (8) uwDEST_U(1)<1>            ubAVS_RESPONSE(5,1)<16;4,2>
+    mov (8) uwDEST_U(1,8)<1>          ubAVS_RESPONSE(5,8+1)<16;4,2>
+    mov (8) uwDEST_U(2)<1>            ubAVS_RESPONSE(8,1)<16;4,2>
+    mov (8) uwDEST_U(2,8)<1>          ubAVS_RESPONSE(8,8+1)<16;4,2>
+    mov (8) uwDEST_U(3)<1>            ubAVS_RESPONSE(9,1)<16;4,2>
+    mov (8) uwDEST_U(3,8)<1>          ubAVS_RESPONSE(9,8+1)<16;4,2>
+    // Move 8x8 words of V to dest GRF (same pattern as U, from response GRFs 6, 7, 10, 11)
+    mov (8) uwDEST_V(0)<1>            ubAVS_RESPONSE(6,1)<16;4,2>
+    mov (8) uwDEST_V(0,8)<1>          ubAVS_RESPONSE(6,8+1)<16;4,2>
+    mov (8) uwDEST_V(1)<1>            ubAVS_RESPONSE(7,1)<16;4,2>
+    mov (8) uwDEST_V(1,8)<1>          ubAVS_RESPONSE(7,8+1)<16;4,2>
+    mov (8) uwDEST_V(2)<1>            ubAVS_RESPONSE(10,1)<16;4,2>
+    mov (8) uwDEST_V(2,8)<1>          ubAVS_RESPONSE(10,8+1)<16;4,2>
+    mov (8) uwDEST_V(3)<1>            ubAVS_RESPONSE(11,1)<16;4,2>
+    mov (8) uwDEST_V(3,8)<1>          ubAVS_RESPONSE(11,8+1)<16;4,2>
+    // Move 2nd 8x8 words of Y to dest GRF at higher 8 words of each GRF.
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_Y(%1*2,8)<1>      ubAVS_RESPONSE_2(%1,1)<16;4,2>    // Copy high byte in a word
+        mov (8) uwDEST_Y(%1*2+1,8)<1>    ubAVS_RESPONSE_2(%1,8+1)<16;4,2>  // Copy high byte in a word
+    }
+//------------------------------------------------------------------------------
+    // Re-define new # of lines: 8 rows of Y, 8 rows of U/V
+    #undef nUV_NUM_OF_ROWS
+    #undef nY_NUM_OF_ROWS
+    #define nY_NUM_OF_ROWS      8
+    #define nUV_NUM_OF_ROWS     8

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/PL2_Scaling.asm
 ,0 → 1,71
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+/*---------- PL2_Scaling.asm ----------
+ * Samples a two-plane (Y + interleaved UV) source through the sampler, one
+ * 16-pixel row per loop iteration, and stores the result as bytes in
+ * DEST_Y / DEST_U / DEST_V.
+ * Steps:
+ *   1. Build the float ramp 0..15 in SAMPLER_RAMP(0..1).
+ *   2. Clear the first three dwords of the message header mirror rMSGSRC.
+ *   3. Payload u = fSRC_VID_H_ORI + fVIDEO_STEP_X * ramp (mov to acc0, then mac);
+ *      payload v = fSRC_VID_V_ORI, also kept in SCALE_COORD_Y.
+ *   4. For each of nY_NUM_OF_ROWS rows: copy the header to MSGHDR_SCALE, issue
+ *      one Y and one UV sampler send, advance v by fVIDEO_STEP_Y, rescale the
+ *      float responses with the `line` instruction and copy them to DEST_*.
+ * Finally nSRC_REGION is defined as nREGION_1.
+ * NOTE(review): SCALE_LINE_P255 / SCALE_LINE_P0_5 are presumably the multiplier
+ * and offset elements that `line` reads from one register (x * 255 + 0.5);
+ * their placement is defined in Scaling.inc - confirm there.
+ */
+#include "Scaling.inc"
+        // Build a 16-element float32 ramp 0..15 (multiplied by the X step further down)
+//      mov (8)         SAMPLER_RAMP(0)<1>              0x76543210:v
+//      add     (8)             SAMPLER_RAMP(1)<1>              SAMPLER_RAMP(0) 8.0:f
+mov (4) SAMPLER_RAMP(0)<1> 0x48403000:vf                //3, 2, 1, 0 in float vector
+mov (4) SAMPLER_RAMP(0,4)<1> 0x5C585450:vf      //7, 6, 5, 4 in float vector
+add     (8)             SAMPLER_RAMP(1)<1>              SAMPLER_RAMP(0) 8.0:f          // elements 8..15 = elements 0..7 + 8.0
+//Module: PrepareScaleCoord.asm
+        // Setup for sampler msg hdr
+    mov (2)             rMSGSRC.0<1>:ud                 0:ud                                            { NoDDClr }     // Unused fields
+    mov (1)             rMSGSRC.2<1>:ud                 0:ud                                            { NoDDChk }     // Write and offset
+        // Calculate 16 v based on the step Y and vertical origin
+        mov     (16)    mfMSGPAYLOAD(2)<1>              fSRC_VID_V_ORI<0;1,0>:f          // v for the first row, broadcast to all 16 channels
+        mov     (16)    SCALE_COORD_Y<1>:f              fSRC_VID_V_ORI<0;1,0>:f          // same value kept in SCALE_COORD_Y as the running v coordinate
+        // Calculate 16 u based on the step X and hori origin
+//      line (16)       mfMSGPAYLOAD(0)<1>              SCALE_STEP_X<0;1,0>:f           SAMPLER_RAMP(0)         // Assign to mrf directly
+        mov     (16)    acc0:f                                                  fSRC_VID_H_ORI<0;1,0>:f                                                                                 { Compr }
+        mac     (16)    mfMSGPAYLOAD(0)<1>      fVIDEO_STEP_X<0;1,0>:f  SAMPLER_RAMP(0)                 { Compr }
+        //Setup the constants for line instruction
+        mov     (1)             SCALE_LINE_P255<1>:f            255.0:f                         { NoDDClr }     //{ NoDDClr, NoDDChk }
+        mov     (1)             SCALE_LINE_P0_5<1>:f            0.5:f                           { NoDDChk }
+//------------------------------------------------------------------------------
+$for (0; <nY_NUM_OF_ROWS; 1) {
+        // Read 16 sampled pixels and store them in float32 in 8 GRFs in the order of BGRA (VYUA).
+  mov (8)       MSGHDR_SCALE.0:ud      rMSGSRC.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+        send (16)       SCALE_RESPONSE_YW(0)<1>         MSGHDR_SCALE    udDUMMY_NULL    nSMPL_ENGINE SMPLR_MSG_DSC+nSI_SRC_SIMD16_Y+nBI_CURRENT_SRC_Y
+        send (16)       SCALE_RESPONSE_UW(0)<1>         MSGHDR_SCALE    udDUMMY_NULL    nSMPL_ENGINE SMPLR_MSG_DSC+nSI_SRC_SIMD16_UV+nBI_CURRENT_SRC_UV
+        // Calculate 16 v for next line
+        add (16)        mfMSGPAYLOAD(2)<1>              SCALE_COORD_Y<8;8,1>:f          fVIDEO_STEP_Y<0;1,0>:f  // Assign to mrf directly
+        add (16)        SCALE_COORD_Y<1>:f              SCALE_COORD_Y<8;8,1>:f          fVIDEO_STEP_Y<0;1,0>:f  // Advance the running v coordinate by the same step
+        // Scale back to [0, 255], convert f to ud
+        line (16)       acc0:f          SCALE_LINE_P255<0;1,0>:f        SCALE_RESPONSE_YF(0)    { Compr }                       // Process the Y response
+        mov  (16) SCALE_RESPONSE_YD(0)<1>       acc0:f                                                                                                          { Compr }
+        line (16)       acc0:f          SCALE_LINE_P255<0;1,0>:f        SCALE_RESPONSE_UF(0)    { Compr }                       // Process UV response register 0 (copied to DEST_U below)
+        mov  (16) SCALE_RESPONSE_UD(0)<1>       acc0:f                                                                                                          { Compr }
+        line (16)       acc0:f          SCALE_LINE_P255<0;1,0>:f        SCALE_RESPONSE_UF(2)    { Compr }                       // Process UV response register 2 (copied to DEST_V below)
+        mov  (16) SCALE_RESPONSE_UD(2)<1>       acc0:f                                                                                                          { Compr }
+        mov      (16)   DEST_Y(%1)<1>                           SCALE_RESPONSE_YB(0)                                                                                    //possible error due to truncation - vK
+        mov      (16)   DEST_U(%1)<1>                           SCALE_RESPONSE_UB(0)                                                                                    //possible error due to truncation - vK
+        mov      (16)   DEST_V(%1)<1>                           SCALE_RESPONSE_UB(2)                                                                                    //possible error due to truncation - vK
+}
+        #define nSRC_REGION                             nREGION_1
+//------------------------------------------------------------------------------

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/PL3_AVS_IEF_16x8.asm
 ,0 → 1,69
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+/*---------- PL3_AVS_IEF_16x8.asm ----------
+ * Samples a three-plane source with six single-channel AVS sampler messages:
+ * Y, U and V for the first 8x8 block, then Y, U and V for the second.
+ * Before every send:
+ *   - dword 2 of the header mirror rAVS_8x8_HDR is written with a channel
+ *     mask constant (green-only for Y, red-only for U and V);
+ *   - 16 dwords starting at rAVS_8x8_HDR are copied into the message
+ *     registers (mAVS_8x8_HDR for Y, mAVS_8x8_HDR_UV for U and V).
+ * The first block's responses go to uwAVS_RESPONSE(0), (4), (8); the second
+ * block's go to uwAVS_RESPONSE_2(0), (4), (8). The existing comments give 4
+ * GRFs per response.
+ * The mask write before each V send repeats the value already written for U;
+ * per the existing comments it only separates two send instructions.
+ * PL3_AVS_IEF_Unpack_16x8.asm is included at the end to unpack the responses.
+ */
+#include "AVS_IEF.inc"
+//------------------------------------------------------------------------------
+// 2 sampler reads for 8x8 Y surface
+// 2 sampler reads for 8x8 U surface
+// 2 sampler reads for 8x8 V surface
+//------------------------------------------------------------------------------
+    // 1st 8x8 setup
+    #include "AVS_SetupFirstBlock.asm"
+    // 1st 8x8 Y sampling
+    mov  (1) rAVS_8x8_HDR.2:ud      nAVS_GREEN_CHANNEL_ONLY:ud   // Enable green channel
+    mov (16) mAVS_8x8_HDR.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(0)<1>  mAVS_8x8_HDR    udDUMMY_NULL    nSMPL_ENGINE        nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y
+    // Return Y in 4 GRFs
+    // 1st 8x8 U sampling
+    mov  (1) rAVS_8x8_HDR.2:ud      nAVS_RED_CHANNEL_ONLY:ud     // Enable red channel
+    mov (16) mAVS_8x8_HDR_UV.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(4)<1>     mAVS_8x8_HDR_UV   udDUMMY_NULL      nSMPL_ENGINE        nAVS_MSG_DSC_1CH+nSI_SRC_U+nBI_CURRENT_SRC_U
+    // Return U in 4 GRFs
+    // 1st 8x8 V sampling
+    mov  (1) rAVS_8x8_HDR.2:ud      nAVS_RED_CHANNEL_ONLY:ud     // Dummy instruction to avoid back-2-back send instructions
+    mov (16) mAVS_8x8_HDR_UV.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(8)<1>     mAVS_8x8_HDR_UV   udDUMMY_NULL      nSMPL_ENGINE        nAVS_MSG_DSC_1CH+nSI_SRC_V+nBI_CURRENT_SRC_V
+    // Return V in 4 GRFs
+    // 2nd 8x8 setup
+    #include "AVS_SetupSecondBlock.asm"
+    // 2nd 8x8 Y sampling
+    mov  (1) rAVS_8x8_HDR.2:ud      nAVS_GREEN_CHANNEL_ONLY:ud   // Enable green channel
+    mov (16) mAVS_8x8_HDR.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE_2(0)<1>   mAVS_8x8_HDR   udDUMMY_NULL      nSMPL_ENGINE        nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y
+    // Return Y in 4 GRFs
+    // 2nd 8x8 U sampling
+    mov  (1) rAVS_8x8_HDR.2:ud      nAVS_RED_CHANNEL_ONLY:ud     // Enable red channel
+    mov (16) mAVS_8x8_HDR_UV.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE_2(4)<1>     mAVS_8x8_HDR_UV   udDUMMY_NULL      nSMPL_ENGINE        nAVS_MSG_DSC_1CH+nSI_SRC_U+nBI_CURRENT_SRC_U
+    // Return U in 4 GRFs
+    mov (1) rAVS_8x8_HDR.2:ud      nAVS_RED_CHANNEL_ONLY:ud     // Dummy instruction just in order to avoid back-2-back send instructions!
+    // 2nd 8x8 V sampling
+    mov (16) mAVS_8x8_HDR_UV.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE_2(8)<1>     mAVS_8x8_HDR_UV   udDUMMY_NULL      nSMPL_ENGINE        nAVS_MSG_DSC_1CH+nSI_SRC_V+nBI_CURRENT_SRC_V
+    // Return V in 4 GRFs
+//------------------------------------------------------------------------------
+// Unpacking sampler reads to 4:4:4 internal planar
+//------------------------------------------------------------------------------
+    #include "PL3_AVS_IEF_Unpack_16x8.asm"

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/PL3_AVS_IEF_8x4.asm
 ,0 → 1,60
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+/*---------- PL3_AVS_IEF_8x4.asm ----------
+ * Samples a three-plane source with four single-channel AVS sampler messages:
+ * Y of the first 8x8 block, one U block, one V block, then Y of the second
+ * 8x8 block.
+ * Before every send, dword 2 of the header mirror rAVS_8x8_HDR is written
+ * with a channel mask constant and 16 dwords starting at rAVS_8x8_HDR are
+ * copied into the message registers.
+ * rAVS_PAYLOAD.1 is set to fVIDEO_STEP_X * 2.0 before the U send, stays at
+ * that value for the V send, and is reset to fVIDEO_STEP_X before the second
+ * Y send.
+ * Responses land in uwAVS_RESPONSE(0) (Y, block 1), (4) (U), (8) (V) and
+ * (12) (Y, block 2). PL3_AVS_IEF_Unpack_8x4.asm is included at the end to
+ * unpack them; per the existing comments only 8x4 of each chroma block is used.
+ */
+#include "AVS_IEF.inc"
+//------------------------------------------------------------------------------
+// 2 sampler reads for 8x8 Y surface
+// 1 sampler read  for 8x8 U surface
+// 1 sampler read  for 8x8 V surface
+//------------------------------------------------------------------------------
+    // 1st 8x8 setup
+    #include "AVS_SetupFirstBlock.asm"
+    // 1st 8x8 Y sampling
+    mov (1) rAVS_8x8_HDR.2:ud       nAVS_GREEN_CHANNEL_ONLY:ud   // Enable green channel
+    mov (16) mAVS_8x8_HDR.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(0)<1>   mAVS_8x8_HDR    udDUMMY_NULL    nSMPL_ENGINE        nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y
+    // Return Y in 4 GRFs
+    // 8x8 U sampling ; Only 8x4 will be used
+    mov (1)  rAVS_8x8_HDR.2:ud      nAVS_RED_CHANNEL_ONLY:ud     // Enable red channel
+    mul (1)  rAVS_PAYLOAD.1:f       fVIDEO_STEP_X:f    2.0:f    // Calculate Step X for chroma
+    mov (16) mAVS_8x8_HDR_UV.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(4)<1>   mAVS_8x8_HDR_UV   udDUMMY_NULL      nSMPL_ENGINE        nAVS_MSG_DSC_1CH+nSI_SRC_U+nBI_CURRENT_SRC_U
+    // Return U in 4 GRFs
+    // 8x8 V sampling ; Only 8x4 will be used
+    mov (1)  rAVS_8x8_HDR.2:ud      nAVS_RED_CHANNEL_ONLY:ud     // Dummy instruction just in order to avoid back-2-back send instructions!
+    mov (16) mAVS_8x8_HDR_UV.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(8)<1>   mAVS_8x8_HDR_UV   udDUMMY_NULL      nSMPL_ENGINE        nAVS_MSG_DSC_1CH+nSI_SRC_V+nBI_CURRENT_SRC_V
+    // Return V in 4 GRFs
+   // 2nd 8x8 setup
+    #include "AVS_SetupSecondBlock.asm"
+    // 2nd 8x8 Y sampling
+    mov (1)  rAVS_8x8_HDR.2:ud      nAVS_GREEN_CHANNEL_ONLY:ud   // Enable green channel
+    mov (1)  rAVS_PAYLOAD.1:f       fVIDEO_STEP_X:f             // Restore Step X for luma
+    mov (16) mAVS_8x8_HDR.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(12)<1>  mAVS_8x8_HDR   udDUMMY_NULL      nSMPL_ENGINE        nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y
+    // Return Y in 4 GRFs
+//------------------------------------------------------------------------------
+// Unpacking sampler reads to 4:2:0 internal planar
+//------------------------------------------------------------------------------
+    #include "PL3_AVS_IEF_Unpack_8x4.asm"

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/PL3_AVS_IEF_8x8.asm
 ,0 → 1,60
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+/*---------- PL3_AVS_IEF_8x8.asm ----------
+ * Samples a three-plane source with four single-channel AVS sampler messages:
+ * Y of the first 8x8 block, one 8x8 U block, one 8x8 V block, then Y of the
+ * second 8x8 block.
+ * Before every send, dword 2 of the header mirror rAVS_8x8_HDR is written
+ * with a channel mask constant and 16 dwords starting at rAVS_8x8_HDR are
+ * copied into the message registers.
+ * rAVS_PAYLOAD.1 is set to fVIDEO_STEP_X * 2.0 before the U send, stays at
+ * that value for the V send, and is reset to fVIDEO_STEP_X before the second
+ * Y send.
+ * Responses land in uwAVS_RESPONSE(0) (Y, block 1), (4) (U), (8) (V) and
+ * (12) (Y, block 2). PL3_AVS_IEF_Unpack_8x8.asm is included at the end to
+ * unpack them.
+ */
+#include "AVS_IEF.inc"
+//------------------------------------------------------------------------------
+// 2 sampler reads for 8x8 Y surface
+// 1 sampler read  for 8x8 U surface
+// 1 sampler read  for 8x8 V surface
+//------------------------------------------------------------------------------
+    // 1st 8x8 setup
+    #include "AVS_SetupFirstBlock.asm"
+    // 1st 8x8 Y sampling
+    mov (1) rAVS_8x8_HDR.2:ud       nAVS_GREEN_CHANNEL_ONLY:ud   // Enable green channel
+    mov (16) mAVS_8x8_HDR.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(0)<1>   mAVS_8x8_HDR    udDUMMY_NULL    nSMPL_ENGINE        nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y
+    // Return Y in 4 GRFs
+    // 8x8 U sampling
+    mov (1)  rAVS_8x8_HDR.2:ud      nAVS_RED_CHANNEL_ONLY:ud     // Enable red channel
+    mul (1)  rAVS_PAYLOAD.1:f       fVIDEO_STEP_X:f    2.0:f    // Calculate Step X for chroma
+    mov (16) mAVS_8x8_HDR_UV.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(4)<1>   mAVS_8x8_HDR_UV   udDUMMY_NULL      nSMPL_ENGINE        nAVS_MSG_DSC_1CH+nSI_SRC_U+nBI_CURRENT_SRC_U
+    // Return U in 4 GRFs
+    // 8x8 V sampling
+    mov (1)  rAVS_8x8_HDR.2:ud      nAVS_RED_CHANNEL_ONLY:ud     // Dummy instruction just in order to avoid back-2-back send instructions!
+    mov (16) mAVS_8x8_HDR_UV.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(8)<1>   mAVS_8x8_HDR_UV   udDUMMY_NULL      nSMPL_ENGINE        nAVS_MSG_DSC_1CH+nSI_SRC_V+nBI_CURRENT_SRC_V
+    // Return V in 4 GRFs
+    // 2nd 8x8 setup
+    #include "AVS_SetupSecondBlock.asm"
+    // 2nd 8x8 Y sampling
+    mov (1)  rAVS_8x8_HDR.2:ud      nAVS_GREEN_CHANNEL_ONLY:ud   // Enable green channel
+    mov (1)  rAVS_PAYLOAD.1:f       fVIDEO_STEP_X:f             // Restore Step X for luma
+    mov (16) mAVS_8x8_HDR.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(12)<1>  mAVS_8x8_HDR   udDUMMY_NULL      nSMPL_ENGINE        nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y
+    // Return Y in 4 GRFs
+//------------------------------------------------------------------------------
+// Unpacking sampler reads to 4:2:2 internal planar
+//------------------------------------------------------------------------------
+    #include "PL3_AVS_IEF_Unpack_8x8.asm"

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/PL3_AVS_IEF_Unpack_16x8.asm
 ,0 → 1,240
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+/*---------- PL3_AVS_IEF_Unpack_16x8.asm ----------
+ * Unpacks the six AVS responses (Y/U/V of two 8x8 blocks) into the
+ * destination registers. Two build variants:
+ *
+ * AVS_OUTPUT_16_BIT defined - packed output:
+ *   Every mov copies 4 consecutive response words to a destination with a
+ *   stride of 4 words, so each group of 4 destination words holds U (word 0),
+ *   Y (word 1), V (word 2) and A (word 3, written as 0).
+ *   Response words 0-3 / 8-11 / 4-7 / 12-15 of response GRF k go to DEST_Y
+ *   registers 8k+0 / 8k+1 / 8k+4 / 8k+5 for the first block and to
+ *   8k+2 / 8k+3 / 8k+6 / 8k+7 for the second block.
+ *   Y comes from response GRFs 0-3, U from 4-7, V from 8-11.
+ *
+ * otherwise - 8-bit planar output:
+ *   Every mov reads byte-typed data at an odd byte offset (1 or 8+1) with
+ *   region <16;4,2> and writes 8 words (high byte of each selected response
+ *   word). The first block fills the low 8 words of DEST_Y/U/V(0..7), the
+ *   second block fills the high 8 words.
+ *
+ * On exit nY_NUM_OF_ROWS and nUV_NUM_OF_ROWS are both redefined to 8.
+ * NOTE(review): the DEST and AVS_RESPONSE macros are defined outside this
+ * file; the (register, sub-register offset) reading of their arguments is
+ * assumed - confirm against the .inc files.
+ */
+#ifdef AVS_OUTPUT_16_BIT        //Output is packed in AVYU format
+// Move first 8x8 words of Y to dest GRF (as packed, word 1 of each 4-word group)
+    mov (4) uwDEST_Y(0,1)<4>       uwAVS_RESPONSE(0,0)<4;4,1>
+    mov (4) uwDEST_Y(1,1)<4>       uwAVS_RESPONSE(0,8)<4;4,1>
+    mov (4) uwDEST_Y(4,1)<4>       uwAVS_RESPONSE(0,4)<4;4,1>
+    mov (4) uwDEST_Y(5,1)<4>       uwAVS_RESPONSE(0,12)<4;4,1>
+    mov (4) uwDEST_Y(8,1)<4>       uwAVS_RESPONSE(1,0)<4;4,1>
+    mov (4) uwDEST_Y(9,1)<4>       uwAVS_RESPONSE(1,8)<4;4,1>
+    mov (4) uwDEST_Y(12,1)<4>      uwAVS_RESPONSE(1,4)<4;4,1>
+    mov (4) uwDEST_Y(13,1)<4>      uwAVS_RESPONSE(1,12)<4;4,1>
+    mov (4) uwDEST_Y(16,1)<4>      uwAVS_RESPONSE(2,0)<4;4,1>
+    mov (4) uwDEST_Y(17,1)<4>      uwAVS_RESPONSE(2,8)<4;4,1>
+    mov (4) uwDEST_Y(20,1)<4>      uwAVS_RESPONSE(2,4)<4;4,1>
+    mov (4) uwDEST_Y(21,1)<4>      uwAVS_RESPONSE(2,12)<4;4,1>
+    mov (4) uwDEST_Y(24,1)<4>      uwAVS_RESPONSE(3,0)<4;4,1>
+    mov (4) uwDEST_Y(25,1)<4>      uwAVS_RESPONSE(3,8)<4;4,1>
+    mov (4) uwDEST_Y(28,1)<4>      uwAVS_RESPONSE(3,4)<4;4,1>
+    mov (4) uwDEST_Y(29,1)<4>      uwAVS_RESPONSE(3,12)<4;4,1>
+// Move first 8x8 words of U to dest GRF (as packed, word 0 of each 4-word group)
+    mov (4) uwDEST_Y(0,0)<4>       uwAVS_RESPONSE(4,0)<4;4,1>
+    mov (4) uwDEST_Y(1,0)<4>       uwAVS_RESPONSE(4,8)<4;4,1>
+    mov (4) uwDEST_Y(4,0)<4>       uwAVS_RESPONSE(4,4)<4;4,1>
+    mov (4) uwDEST_Y(5,0)<4>       uwAVS_RESPONSE(4,12)<4;4,1>
+    mov (4) uwDEST_Y(8,0)<4>       uwAVS_RESPONSE(5,0)<4;4,1>
+    mov (4) uwDEST_Y(9,0)<4>       uwAVS_RESPONSE(5,8)<4;4,1>
+    mov (4) uwDEST_Y(12,0)<4>      uwAVS_RESPONSE(5,4)<4;4,1>
+    mov (4) uwDEST_Y(13,0)<4>      uwAVS_RESPONSE(5,12)<4;4,1>
+    mov (4) uwDEST_Y(16,0)<4>      uwAVS_RESPONSE(6,0)<4;4,1>
+    mov (4) uwDEST_Y(17,0)<4>      uwAVS_RESPONSE(6,8)<4;4,1>
+    mov (4) uwDEST_Y(20,0)<4>      uwAVS_RESPONSE(6,4)<4;4,1>
+    mov (4) uwDEST_Y(21,0)<4>      uwAVS_RESPONSE(6,12)<4;4,1>
+    mov (4) uwDEST_Y(24,0)<4>      uwAVS_RESPONSE(7,0)<4;4,1>
+    mov (4) uwDEST_Y(25,0)<4>      uwAVS_RESPONSE(7,8)<4;4,1>
+    mov (4) uwDEST_Y(28,0)<4>      uwAVS_RESPONSE(7,4)<4;4,1>
+    mov (4) uwDEST_Y(29,0)<4>      uwAVS_RESPONSE(7,12)<4;4,1>
+// Move first 8x8 words of V to dest GRF (as packed, word 2 of each 4-word group)
+    mov (4) uwDEST_Y(0,2)<4>       uwAVS_RESPONSE(8,0)<4;4,1>
+    mov (4) uwDEST_Y(1,2)<4>       uwAVS_RESPONSE(8,8)<4;4,1>
+    mov (4) uwDEST_Y(4,2)<4>       uwAVS_RESPONSE(8,4)<4;4,1>
+    mov (4) uwDEST_Y(5,2)<4>       uwAVS_RESPONSE(8,12)<4;4,1>
+    mov (4) uwDEST_Y(8,2)<4>       uwAVS_RESPONSE(9,0)<4;4,1>
+    mov (4) uwDEST_Y(9,2)<4>       uwAVS_RESPONSE(9,8)<4;4,1>
+    mov (4) uwDEST_Y(12,2)<4>      uwAVS_RESPONSE(9,4)<4;4,1>
+    mov (4) uwDEST_Y(13,2)<4>      uwAVS_RESPONSE(9,12)<4;4,1>
+    mov (4) uwDEST_Y(16,2)<4>      uwAVS_RESPONSE(10,0)<4;4,1>
+    mov (4) uwDEST_Y(17,2)<4>      uwAVS_RESPONSE(10,8)<4;4,1>
+    mov (4) uwDEST_Y(20,2)<4>      uwAVS_RESPONSE(10,4)<4;4,1>
+    mov (4) uwDEST_Y(21,2)<4>      uwAVS_RESPONSE(10,12)<4;4,1>
+    mov (4) uwDEST_Y(24,2)<4>      uwAVS_RESPONSE(11,0)<4;4,1>
+    mov (4) uwDEST_Y(25,2)<4>      uwAVS_RESPONSE(11,8)<4;4,1>
+    mov (4) uwDEST_Y(28,2)<4>      uwAVS_RESPONSE(11,4)<4;4,1>
+    mov (4) uwDEST_Y(29,2)<4>      uwAVS_RESPONSE(11,12)<4;4,1>
+// Move first 8x8 words of A to dest GRF (as packed, word 3 of each 4-word group; always 0)
+    mov (4) uwDEST_Y(0,3)<4>       0:uw
+    mov (4) uwDEST_Y(1,3)<4>       0:uw
+    mov (4) uwDEST_Y(4,3)<4>       0:uw
+    mov (4) uwDEST_Y(5,3)<4>       0:uw
+    mov (4) uwDEST_Y(8,3)<4>       0:uw
+    mov (4) uwDEST_Y(9,3)<4>       0:uw
+    mov (4) uwDEST_Y(12,3)<4>      0:uw
+    mov (4) uwDEST_Y(13,3)<4>      0:uw
+    mov (4) uwDEST_Y(16,3)<4>      0:uw
+    mov (4) uwDEST_Y(17,3)<4>      0:uw
+    mov (4) uwDEST_Y(20,3)<4>      0:uw
+    mov (4) uwDEST_Y(21,3)<4>      0:uw
+    mov (4) uwDEST_Y(24,3)<4>      0:uw
+    mov (4) uwDEST_Y(25,3)<4>      0:uw
+    mov (4) uwDEST_Y(28,3)<4>      0:uw
+    mov (4) uwDEST_Y(29,3)<4>      0:uw
+// Move second 8x8 words of Y to dest GRF (dest registers 2, 3, 6, 7, ... interleave with the first block's)
+    mov (4) uwDEST_Y(2,1)<4>       uwAVS_RESPONSE_2(0,0)<4;4,1>
+    mov (4) uwDEST_Y(3,1)<4>       uwAVS_RESPONSE_2(0,8)<4;4,1>
+    mov (4) uwDEST_Y(6,1)<4>       uwAVS_RESPONSE_2(0,4)<4;4,1>
+    mov (4) uwDEST_Y(7,1)<4>       uwAVS_RESPONSE_2(0,12)<4;4,1>
+    mov (4) uwDEST_Y(10,1)<4>      uwAVS_RESPONSE_2(1,0)<4;4,1>
+    mov (4) uwDEST_Y(11,1)<4>      uwAVS_RESPONSE_2(1,8)<4;4,1>
+    mov (4) uwDEST_Y(14,1)<4>      uwAVS_RESPONSE_2(1,4)<4;4,1>
+    mov (4) uwDEST_Y(15,1)<4>      uwAVS_RESPONSE_2(1,12)<4;4,1>
+    mov (4) uwDEST_Y(18,1)<4>      uwAVS_RESPONSE_2(2,0)<4;4,1>
+    mov (4) uwDEST_Y(19,1)<4>      uwAVS_RESPONSE_2(2,8)<4;4,1>
+    mov (4) uwDEST_Y(22,1)<4>      uwAVS_RESPONSE_2(2,4)<4;4,1>
+    mov (4) uwDEST_Y(23,1)<4>      uwAVS_RESPONSE_2(2,12)<4;4,1>
+    mov (4) uwDEST_Y(26,1)<4>      uwAVS_RESPONSE_2(3,0)<4;4,1>
+    mov (4) uwDEST_Y(27,1)<4>      uwAVS_RESPONSE_2(3,8)<4;4,1>
+    mov (4) uwDEST_Y(30,1)<4>      uwAVS_RESPONSE_2(3,4)<4;4,1>
+    mov (4) uwDEST_Y(31,1)<4>      uwAVS_RESPONSE_2(3,12)<4;4,1>
+// Move second 8x8 words of U to dest GRF
+    mov (4) uwDEST_Y(2,0)<4>       uwAVS_RESPONSE_2(4,0)<4;4,1>
+    mov (4) uwDEST_Y(3,0)<4>       uwAVS_RESPONSE_2(4,8)<4;4,1>
+    mov (4) uwDEST_Y(6,0)<4>       uwAVS_RESPONSE_2(4,4)<4;4,1>
+    mov (4) uwDEST_Y(7,0)<4>       uwAVS_RESPONSE_2(4,12)<4;4,1>
+    mov (4) uwDEST_Y(10,0)<4>      uwAVS_RESPONSE_2(5,0)<4;4,1>
+    mov (4) uwDEST_Y(11,0)<4>      uwAVS_RESPONSE_2(5,8)<4;4,1>
+    mov (4) uwDEST_Y(14,0)<4>      uwAVS_RESPONSE_2(5,4)<4;4,1>
+    mov (4) uwDEST_Y(15,0)<4>      uwAVS_RESPONSE_2(5,12)<4;4,1>
+    mov (4) uwDEST_Y(18,0)<4>      uwAVS_RESPONSE_2(6,0)<4;4,1>
+    mov (4) uwDEST_Y(19,0)<4>      uwAVS_RESPONSE_2(6,8)<4;4,1>
+    mov (4) uwDEST_Y(22,0)<4>      uwAVS_RESPONSE_2(6,4)<4;4,1>
+    mov (4) uwDEST_Y(23,0)<4>      uwAVS_RESPONSE_2(6,12)<4;4,1>
+    mov (4) uwDEST_Y(26,0)<4>      uwAVS_RESPONSE_2(7,0)<4;4,1>
+    mov (4) uwDEST_Y(27,0)<4>      uwAVS_RESPONSE_2(7,8)<4;4,1>
+    mov (4) uwDEST_Y(30,0)<4>      uwAVS_RESPONSE_2(7,4)<4;4,1>
+    mov (4) uwDEST_Y(31,0)<4>      uwAVS_RESPONSE_2(7,12)<4;4,1>
+// Move second 8x8 words of V to dest GRF
+    mov (4) uwDEST_Y(2,2)<4>       uwAVS_RESPONSE_2(8,0)<4;4,1>
+    mov (4) uwDEST_Y(3,2)<4>       uwAVS_RESPONSE_2(8,8)<4;4,1>
+    mov (4) uwDEST_Y(6,2)<4>       uwAVS_RESPONSE_2(8,4)<4;4,1>
+    mov (4) uwDEST_Y(7,2)<4>       uwAVS_RESPONSE_2(8,12)<4;4,1>
+    mov (4) uwDEST_Y(10,2)<4>      uwAVS_RESPONSE_2(9,0)<4;4,1>
+    mov (4) uwDEST_Y(11,2)<4>      uwAVS_RESPONSE_2(9,8)<4;4,1>
+    mov (4) uwDEST_Y(14,2)<4>      uwAVS_RESPONSE_2(9,4)<4;4,1>
+    mov (4) uwDEST_Y(15,2)<4>      uwAVS_RESPONSE_2(9,12)<4;4,1>
+    mov (4) uwDEST_Y(18,2)<4>      uwAVS_RESPONSE_2(10,0)<4;4,1>
+    mov (4) uwDEST_Y(19,2)<4>      uwAVS_RESPONSE_2(10,8)<4;4,1>
+    mov (4) uwDEST_Y(22,2)<4>      uwAVS_RESPONSE_2(10,4)<4;4,1>
+    mov (4) uwDEST_Y(23,2)<4>      uwAVS_RESPONSE_2(10,12)<4;4,1>
+    mov (4) uwDEST_Y(26,2)<4>      uwAVS_RESPONSE_2(11,0)<4;4,1>
+    mov (4) uwDEST_Y(27,2)<4>      uwAVS_RESPONSE_2(11,8)<4;4,1>
+    mov (4) uwDEST_Y(30,2)<4>      uwAVS_RESPONSE_2(11,4)<4;4,1>
+    mov (4) uwDEST_Y(31,2)<4>      uwAVS_RESPONSE_2(11,12)<4;4,1>
+// Move second 8x8 words of A to dest GRF (always 0)
+    mov (4) uwDEST_Y(2,3)<4>       0:uw
+    mov (4) uwDEST_Y(3,3)<4>       0:uw
+    mov (4) uwDEST_Y(6,3)<4>       0:uw
+    mov (4) uwDEST_Y(7,3)<4>       0:uw
+    mov (4) uwDEST_Y(10,3)<4>      0:uw
+    mov (4) uwDEST_Y(11,3)<4>      0:uw
+    mov (4) uwDEST_Y(14,3)<4>      0:uw
+    mov (4) uwDEST_Y(15,3)<4>      0:uw
+    mov (4) uwDEST_Y(18,3)<4>      0:uw
+    mov (4) uwDEST_Y(19,3)<4>      0:uw
+    mov (4) uwDEST_Y(22,3)<4>      0:uw
+    mov (4) uwDEST_Y(23,3)<4>      0:uw
+    mov (4) uwDEST_Y(26,3)<4>      0:uw
+    mov (4) uwDEST_Y(27,3)<4>      0:uw
+    mov (4) uwDEST_Y(30,3)<4>      0:uw
+    mov (4) uwDEST_Y(31,3)<4>      0:uw
+/*      This section will be used if 16-bit output is needed in planar format -vK
+    // Move 1st 8x8 words of Y to dest GRF at lower 8 words of each GRF.
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_Y(%1*2)<1>          uwAVS_RESPONSE(%1)<8;4,1>
+        mov (8) uwDEST_Y(%1*2+1)<1>        uwAVS_RESPONSE(%1,8)<8;4,1>
+    }
+    // Move 8x8 words of U to dest GRF
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_U(%1*2)<1>          uwAVS_RESPONSE(%1+4)<8;4,1>
+        mov (8) uwDEST_U(%1*2+1)<1>        uwAVS_RESPONSE(%1+4,8)<8;4,1>
+    }
+    // Move 8x8 words of V to dest GRF
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_V(%1*2)<1>          uwAVS_RESPONSE(%1+8)<8;4,1>
+        mov (8) uwDEST_V(%1*2+1)<1>        uwAVS_RESPONSE(%1+8,8)<8;4,1>
+    }
+    // Move 2nd 8x8 words of Y to dest GRF
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_Y(%1*2,8)<1>          uwAVS_RESPONSE_2(%1)<8;4,1>
+        mov (8) uwDEST_Y(%1*2+1,8)<1>        uwAVS_RESPONSE_2(%1,8)<8;4,1>
+    }
+    // Move 2nd 8x8 words of U to dest GRF
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_U(%1*2,8)<1>          uwAVS_RESPONSE_2(%1+4)<8;4,1>
+        mov (8) uwDEST_U(%1*2+1,8)<1>        uwAVS_RESPONSE_2(%1+4,8)<8;4,1>
+    }
+    // Move 2nd 8x8 words of V to dest GRF
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_V(%1*2,8)<1>          uwAVS_RESPONSE_2(%1+8)<8;4,1>
+        mov (8) uwDEST_V(%1*2+1,8)<1>        uwAVS_RESPONSE_2(%1+8,8)<8;4,1>
+    }
+*/
+#else /* OUTPUT_8_BIT */
+    // Move 1st 8x8 words of Y to dest GRF at lower 8 words of each GRF.
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_Y(%1*2)<1>          ubAVS_RESPONSE(%1,1)<16;4,2>        // Copy high byte in a word
+        mov (8) uwDEST_Y(%1*2+1)<1>        ubAVS_RESPONSE(%1,8+1)<16;4,2>      // Copy high byte in a word
+    }
+    // Move 8x8 words of U to dest GRF (response GRFs 4..7)
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_U(%1*2)<1>          ubAVS_RESPONSE(%1+4,1)<16;4,2>      // Copy high byte in a word
+        mov (8) uwDEST_U(%1*2+1)<1>        ubAVS_RESPONSE(%1+4,8+1)<16;4,2>    // Copy high byte in a word
+    }
+    // Move 8x8 words of V to dest GRF (response GRFs 8..11)
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_V(%1*2)<1>          ubAVS_RESPONSE(%1+8,1)<16;4,2>      // Copy high byte in a word
+        mov (8) uwDEST_V(%1*2+1)<1>        ubAVS_RESPONSE(%1+8,8+1)<16;4,2>    // Copy high byte in a word
+    }
+    // Move 2nd 8x8 words of Y to dest GRF at higher 8 words of each GRF.
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_Y(%1*2,8)<1>          ubAVS_RESPONSE_2(%1,1)<16;4,2>     // Copy high byte in a word
+        mov (8) uwDEST_Y(%1*2+1,8)<1>        ubAVS_RESPONSE_2(%1,8+1)<16;4,2>   // Copy high byte in a word
+    }
+    // Move 2nd 8x8 words of U to dest GRF (higher 8 words)
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_U(%1*2,8)<1>          ubAVS_RESPONSE_2(%1+4,1)<16;4,2>   // Copy high byte in a word
+        mov (8) uwDEST_U(%1*2+1,8)<1>        ubAVS_RESPONSE_2(%1+4,8+1)<16;4,2> // Copy high byte in a word
+    }
+    // Move 2nd 8x8 words of V to dest GRF (higher 8 words)
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_V(%1*2,8)<1>          ubAVS_RESPONSE_2(%1+8,1)<16;4,2>   // Copy high byte in a word
+        mov (8) uwDEST_V(%1*2+1,8)<1>        ubAVS_RESPONSE_2(%1+8,8+1)<16;4,2> // Copy high byte in a word
+    }
+#endif
+//------------------------------------------------------------------------------
+    // Re-define new # of lines: 8 rows of Y, 8 rows of U/V
+    #undef nUV_NUM_OF_ROWS
+    #undef nY_NUM_OF_ROWS
+    #define nY_NUM_OF_ROWS      8
+    #define nUV_NUM_OF_ROWS     8

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/PL3_AVS_IEF_Unpack_8x4.asm
 ,0 → 1,45
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+/*---------- PL3_AVS_IEF_Unpack_8x4.asm ----------
+ * Unpacks AVS sampler responses into the word-typed DEST_Y / DEST_U / DEST_V
+ * registers. Every mov reads byte-typed source data at an odd byte offset
+ * (1 or 9) with region <16;4,2> and writes 8 words, so the high byte of each
+ * selected response word ends up in its own destination word.
+ *   Y  : ubAVS_RESPONSE(0..3)   -> low  8 words of DEST_Y(0..7)
+ *        ubAVS_RESPONSE(12..15) -> high 8 words of DEST_Y(0..7)
+ *   U  : ubAVS_RESPONSE(4), (5) -> DEST_U(0..1)
+ *   V  : ubAVS_RESPONSE(8), (9) -> DEST_V(0..1)
+ * Only the first two response GRFs of each chroma plane are read, which gives
+ * the 8x4 chroma block named in the comments below.
+ * On exit nY_NUM_OF_ROWS is redefined to 8 and nUV_NUM_OF_ROWS to 4.
+ * NOTE(review): the DEST and AVS_RESPONSE macros are defined outside this
+ * file; the (register, sub-register offset) reading of their arguments is
+ * assumed - confirm against the .inc files.
+ */
+    // Move 1st 8x8 words of Y to dest GRF at lower 8 words of each GRF.
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_Y(%1*2)<1>          ubAVS_RESPONSE(%1,1)<16;4,2>        // Copy high byte in a word
+        mov (8) uwDEST_Y(%1*2+1)<1>        ubAVS_RESPONSE(%1,8+1)<16;4,2>      // Copy high byte in a word
+        }
+    // Move 8x4 words of U to dest GRF  (Copy high byte in a word); offset 9 is the same as 8+1 used for Y
+    mov (8) uwDEST_U(0)<1>            ubAVS_RESPONSE(4,1)<16;4,2>
+    mov (8) uwDEST_U(0,8)<1>          ubAVS_RESPONSE(4,9)<16;4,2>
+    mov (8) uwDEST_U(1)<1>            ubAVS_RESPONSE(5,1)<16;4,2>
+    mov (8) uwDEST_U(1,8)<1>          ubAVS_RESPONSE(5,9)<16;4,2>
+    // Move 8x4 words of V to dest GRF (same pattern as U, from response GRFs 8 and 9)
+    mov (8) uwDEST_V(0)<1>            ubAVS_RESPONSE(8,1)<16;4,2>
+    mov (8) uwDEST_V(0,8)<1>          ubAVS_RESPONSE(8,9)<16;4,2>
+    mov (8) uwDEST_V(1)<1>            ubAVS_RESPONSE(9,1)<16;4,2>
+    mov (8) uwDEST_V(1,8)<1>          ubAVS_RESPONSE(9,9)<16;4,2>
+    // Move 2nd 8x8 words of Y to dest GRF at higher 8 words of each GRF (second Y response starts at response GRF 12).
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_Y(%1*2,8)<1>        ubAVS_RESPONSE(%1+12,1)<16;4,2>     // Copy high byte in a word
+        mov (8) uwDEST_Y(%1*2+1,8)<1>      ubAVS_RESPONSE(%1+12,8+1)<16;4,2>   // Copy high byte in a word
+    }
+//------------------------------------------------------------------------------
+    // Re-define new # of lines: 8 rows of Y, 4 rows of U/V
+    #undef nUV_NUM_OF_ROWS
+    #undef nY_NUM_OF_ROWS
+    #define nY_NUM_OF_ROWS      8
+    #define nUV_NUM_OF_ROWS     4

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/PL3_AVS_IEF_Unpack_8x8.asm
 ,0 → 1,44
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+//---------- PL3_AVS_IEF_Unpack_8x8.asm ---------- Unpack AVS response bytes into word-wide planar destinations: Y (8 registers of 16 words), U and V (8 rows of 8 words each)
+    // Move 1st 8x8 words of Y (response registers 0-3) to dest GRF at lower 8 words of each GRF.
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_Y(%1*2)<1>          ubAVS_RESPONSE(%1,1)<16;4,2>        // Source starts at byte 1, stride 2: copy high byte of each word
+        mov (8) uwDEST_Y(%1*2+1)<1>        ubAVS_RESPONSE(%1,8+1)<16;4,2>      // Source starts at byte 9, stride 2: copy high byte of each word
+        }
+    // Move 8x8 words of U (response registers 4-7) to dest GRF, two 8-word rows per destination register
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_U(%1)<1>            ubAVS_RESPONSE(%1+4,1)<16;4,2>      // Lower 8 words: copy high byte of each word
+        mov (8) uwDEST_U(%1,8)<1>          ubAVS_RESPONSE(%1+4,8+1)<16;4,2>    // Upper 8 words: copy high byte of each word
+    }
+    // Move 8x8 words of V (response registers 8-11) to dest GRF, two 8-word rows per destination register
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_V(%1)<1>            ubAVS_RESPONSE(%1+8,1)<16;4,2>      // Lower 8 words: copy high byte of each word
+        mov (8) uwDEST_V(%1,8)<1>          ubAVS_RESPONSE(%1+8,8+1)<16;4,2>    // Upper 8 words: copy high byte of each word
+    }
+    // Move 2nd 8x8 words of Y (response registers 12-15) to dest GRF at higher 8 words of each GRF.
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_Y(%1*2,8)<1>        ubAVS_RESPONSE(%1+12,1)<16;4,2>     // Source starts at byte 1, stride 2: copy high byte of each word
+        mov (8) uwDEST_Y(%1*2+1,8)<1>      ubAVS_RESPONSE(%1+12,8+1)<16;4,2>   // Source starts at byte 9, stride 2: copy high byte of each word
+    }
+//------------------------------------------------------------------------------
+    // Re-define the row counts for code that follows this file: 8 Y rows, 8 U/V rows
+    #undef nUV_NUM_OF_ROWS
+    #undef nY_NUM_OF_ROWS
+    #define nY_NUM_OF_ROWS      8
+    #define nUV_NUM_OF_ROWS     8

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/PL3_Scaling.asm
 ,0 → 1,72
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+//---------- PL3_Scaling.asm ---------- Sample the V, Y and U source planes row by row through the sampler, rescale the float results and store them to DEST_V/DEST_Y/DEST_U
+#include "Scaling.inc"
+        // Build a 16-element ramp in float32: SAMPLER_RAMP(0) = 0..7, SAMPLER_RAMP(1) = SAMPLER_RAMP(0) + 8.0
+//      mov (8)         SAMPLER_RAMP(0)<1>              0x76543210:v
+//      add     (8)             SAMPLER_RAMP(1)<1>              SAMPLER_RAMP(0) 8.0:f
+mov (4) SAMPLER_RAMP(0)<1> 0x48403000:vf                { NoDDClr }//3, 2, 1, 0 in float vector
+mov (4) SAMPLER_RAMP(0,4)<1> 0x5C585450:vf      { NoDDChk }//7, 6, 5, 4 in float vector
+add     (8)             SAMPLER_RAMP(1)<1>              SAMPLER_RAMP(0) 8.0:f
+//Module: PrepareScaleCoord.asm
+        // Setup for sampler msg hdr
+    mov (2)             rMSGSRC.0<1>:ud                 0:ud                                            { NoDDClr }     // Unused fields
+    mov (1)             rMSGSRC.2<1>:ud                 0:ud                                            { NoDDChk }     // Write and offset
+        // Initialise all 16 v coordinates to the vertical origin, both in the message payload and in the running copy SCALE_COORD_Y
+        mov     (16)    mfMSGPAYLOAD(2)<1>              fSRC_VID_V_ORI<0;1,0>:f
+        mov     (16)    SCALE_COORD_Y<1>:f              fSRC_VID_V_ORI<0;1,0>:f
+        // Calculate 16 u: u[i] = fSRC_VID_H_ORI + fVIDEO_STEP_X * ramp[i] (origin preloaded in acc0, then mac)
+//      line (16)       mfMSGPAYLOAD(0)<1>              SCALE_STEP_X<0;1,0>:f           SAMPLER_RAMP(0)         // Assign to mrf directly
+        mov     (16)    acc0:f                                                  fSRC_VID_H_ORI<0;1,0>:f                                                                                 { Compr }
+        mac     (16)    mfMSGPAYLOAD(0)<1>      fVIDEO_STEP_X<0;1,0>:f  SAMPLER_RAMP(0)                 { Compr }
+        //Setup the constants for line instruction
+        mov     (1)             SCALE_LINE_P255<1>:f            255.0:f                         { NoDDClr }     //{ NoDDClr, NoDDChk }
+        mov     (1)             SCALE_LINE_P0_5<1>:f            0.5:f                           { NoDDChk }
+//------------------------------------------------------------------------------
+$for (0; <nY_NUM_OF_ROWS; 1) {
+        // Read 16 sampled pixels per plane (one send each for V, Y and U); responses are float32, channel order BGRA (VYUA).
+  mov (8)       MSGHDR_SCALE<1>:ud              rMSGSRC<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+        send (16)       SCALE_RESPONSE_VW(0)<1>         MSGHDR_SCALE    udDUMMY_NULL    nSMPL_ENGINE SMPLR_MSG_DSC+nSI_SRC_SIMD16_V+nBI_CURRENT_SRC_V
+        send (16)       SCALE_RESPONSE_YW(0)<1>         MSGHDR_SCALE    udDUMMY_NULL    nSMPL_ENGINE SMPLR_MSG_DSC+nSI_SRC_SIMD16_Y+nBI_CURRENT_SRC_Y
+        send (16)       SCALE_RESPONSE_UW(0)<1>         MSGHDR_SCALE    udDUMMY_NULL    nSMPL_ENGINE SMPLR_MSG_DSC+nSI_SRC_SIMD16_U+nBI_CURRENT_SRC_U
+        // Calculate 16 v for next line
+        add (16)        mfMSGPAYLOAD(2)<1>              SCALE_COORD_Y<8;8,1>:f          fVIDEO_STEP_Y<0;1,0>:f  // Assign to mrf directly
+        add (16)        SCALE_COORD_Y<1>:f              SCALE_COORD_Y<8;8,1>:f          fVIDEO_STEP_Y<0;1,0>:f  // Advance the running copy read by the next iteration
+        // Scale back to [0, 255], convert f to ud
+        line (16)       acc0:f          SCALE_LINE_P255<0;1,0>:f        SCALE_RESPONSE_VF(0)    { Compr }                       // Process B, V
+        mov  (16) SCALE_RESPONSE_VD(0)<1>       acc0:f                                                                                                          { Compr }
+        line (16)       acc0:f          SCALE_LINE_P255<0;1,0>:f        SCALE_RESPONSE_YF(0)    { Compr }                       // Process G, Y
+        mov  (16) SCALE_RESPONSE_YD(0)<1>       acc0:f                                                                                                          { Compr }
+        line (16)       acc0:f          SCALE_LINE_P255<0;1,0>:f        SCALE_RESPONSE_UF(0)    { Compr }                       // Process R, U
+        mov  (16) SCALE_RESPONSE_UD(0)<1>       acc0:f                                                                                                          { Compr }
+        mov      (16)   DEST_V(%1)<1>                           SCALE_RESPONSE_VB(0)                                                                                    //possible error due to truncation - vK
+        mov      (16)   DEST_Y(%1)<1>                           SCALE_RESPONSE_YB(0)                                                                                    //possible error due to truncation - vK
+        mov      (16)   DEST_U(%1)<1>                           SCALE_RESPONSE_UB(0)                                                                                    //possible error due to truncation - vK
+}
+        #define nSRC_REGION                             nREGION_1
+//------------------------------------------------------------------------------

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/PL_DNDI_ALG.asm
 ,0 → 1,85
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+#define DI_ENABLE
+    #include "DNDI.inc"
+    #ifdef DI_ONLY
+                #undef  nSMPL_RESP_LEN
+                #define nSMPL_RESP_LEN          nSMPL_RESP_LEN_DI               // set the number of response GRFs (DI-only variant)
+        #else
+                #undef  nSMPL_RESP_LEN
+                #define nSMPL_RESP_LEN          nSMPL_RESP_LEN_DNDI               // set the number of response GRFs (DN+DI variant)
+        #endif
+    #undef  nDPW_BLOCK_SIZE_HIST
+    #define nDPW_BLOCK_SIZE_HIST    nBLOCK_WIDTH_4+nBLOCK_HEIGHT_1    // HIST Block Size for Write (width 4, height 1). NOTE(review): comment previously said 4x2 - confirm
+    #undef  nDPW_BLOCK_SIZE_DN
+    #define nDPW_BLOCK_SIZE_DN      nBLOCK_WIDTH_16+nBLOCK_HEIGHT_4   // DN Block Size for Write is 16x4
+////////////////////////////////////// Run the DN Algorithm (body comes from the included file) ///////////////////////////////////////
+    #include "DNDI_Command.asm"
+////////////////////////////////////// Rearrange for Internal Planar //////////////////////////////
+////////////////////////////////////// Save the STMM Data for Next Run /////////////////////////
+    // Write STMM to memory: build header (X origin / 2, Y origin, block size), copy STMM response into the payload, send to the data port
+    shr (1)     rMSGSRC.0<1>:ud        wORIX<0;1,0>:w            1:w  NODDCLR_NODDCHK             // X origin / 2
+    mov (1)     rMSGSRC.1<1>:ud        wORIY<0;1,0>:w                 NODDCLR_NODDCHK    // Y origin
+    mov (1)     rMSGSRC.2<1>:ud        nDPW_BLOCK_SIZE_STMM:ud        NODDCHK             // block width and height (8x4)
+    mov (8)     mudMSGHDR_STMM(0)<1>   rMSGSRC.0<8;8,1>:ud               // message header
+    mov (8)     mudMSGHDR_STMM(1)<1>   udRESP(nDI_STMM_OFFSET,0)         // Move STMM to MRF
+    send (8)    dNULLREG               mMSGHDR_STMM              udDUMMY_NULL    nDATAPORT_WRITE     nDPMW_MSGDSC+nDPMW_MSG_LEN_STMM+nBI_STMM_HISTORY_OUTPUT:ud
+#ifdef DI_ONLY
+#else
+////////////////////////////////////// Save the History Data for Next Run /////////////////////////
+    #include "DI_Hist_Save.asm"
+////////////////////////////////////// Save the DN Curr Frame for Next Run ////////////////////////
+        //set the save DN parameters
+    mov (2)     rMSGSRC.0<1>:ud        wORIX<2;2,1>:w              NODDCLR             // X origin and Y origin
+    mov (1)     rMSGSRC.2<1>:ud        nDPW_BLOCK_SIZE_DN:ud       NODDCLR_NODDCHK     // block width and height (16x4)
+    mov (8)     mudMSGHDR_DN(0)<1>     rMSGSRC.0<8;8,1>:ud
+    // check top/bottom field first: branch to TOP_FIELD_FIRST when ubTFLD_FIRST == 1, otherwise fall through
+    cmp.e.f0.0 (1)  null<1>:w               ubTFLD_FIRST<0;1,0>:ub     1:w
+    (f0.0) jmpi (1) TOP_FIELD_FIRST
+BOTTOM_FIELD_FIRST:
+    $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (4)     mudMSGHDR_DN(1,%1*4)<1>     udRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*4)<4;4,1> // 2nd field luma from current frame (line 0,2)
+    }
+    $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (4)     mudMSGHDR_DN(1,%1*4+4)<1>   udRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,4)<4;4,1> // 1st field luma from current frame (line 1,3)
+    }
+    jmpi (1) SAVE_DN_CURR
+TOP_FIELD_FIRST:
+    $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (4)     mudMSGHDR_DN(1,%1*4)<1>     udRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,0)<4;4,1> // 1st field luma from current frame (line 0,2)
+    }
+    $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (4)     mudMSGHDR_DN(1,%1*4+4)<1>   udRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*4)<4;4,1> // 2nd field luma from current frame (line 1,3)
+    }
+SAVE_DN_CURR:
+    send (8)    dNULLREG    mMSGHDR_DN   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPMW_MSG_LEN_PL_DN_DI+nBI_DESTINATION_Y:ud
+#endif
+// Save Processed frames
+#include "DI_Save_PA.asm"

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/PL_DNDI_ALG_UVCopy_NV11.asm
 ,0 → 1,103
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+#define DI_ENABLE
+    #include "DNDI.inc"
+    #undef  nY_NUM_OF_ROWS
+    #define nY_NUM_OF_ROWS      8       // Number of Y rows per block (4 rows for each frame)
+    #undef  nUV_NUM_OF_ROWS
+    #define nUV_NUM_OF_ROWS     8       // Number of U/V rows per block
+    #undef  nSMPL_RESP_LEN
+    #define nSMPL_RESP_LEN          nSMPL_RESP_LEN_DNDI               // set the number of GRF
+    #undef  nDPW_BLOCK_SIZE_HIST
+    #define nDPW_BLOCK_SIZE_HIST    nBLOCK_WIDTH_4+nBLOCK_HEIGHT_1    // HIST Block Size for Write (width 4, height 1). NOTE(review): comment previously said 4x2 - confirm
+    #undef  nDPW_BLOCK_SIZE_DN
+    #define nDPW_BLOCK_SIZE_DN      nBLOCK_WIDTH_16+nBLOCK_HEIGHT_4   // DN Block Size for Write is 16x4
+    #undef  nDPR_BLOCK_SIZE_UV
+    #define nDPR_BLOCK_SIZE_UV                  nBLOCK_WIDTH_8+nBLOCK_HEIGHT_4   //  DN Block Size for UV Write/Read is 8x4
+////////////////////////////////////// Run the DN Algorithm (body comes from the included file) ///////////////////////////////////////
+    #include "DNDI_Command.asm"
+////////////////////////////////////// Rearrange for Internal Planar //////////////////////////////
+    // move the previous frame Y component to internal planar format (bytes widened to words)
+    $for (0; <nY_NUM_OF_ROWS/2; 1) {
+        mov (16) uwDEST_Y(%1,0)<1>    ubRESP(nDI_PREV_FRAME_LUMA_OFFSET,%1*16)
+    }
+    // move the previous frame U,V components to internal planar format (de-interleave: odd bytes to U, even bytes to V)
+    $for (0; <nUV_NUM_OF_ROWS/2; 1) {
+        mov (8) uwDEST_U(0,%1*8)<1>   ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2>  //U pixels
+        mov (8) uwDEST_V(0,%1*8)<1>   ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16)<16;8,2>    //V pixels
+    }
+    // move the current frame Y component to internal planar format (destination registers 4-7)
+    $for (0; <nY_NUM_OF_ROWS/2; 1) {
+        mov (16) uwDEST_Y(%1+4,0)<1>  ubRESP(nDI_CURR_FRAME_LUMA_OFFSET,%1*16)
+    }
+    // move the current frame U,V components to internal planar format (destination register 2 onward)
+    $for (0; <nUV_NUM_OF_ROWS/2; 1) {
+        mov (8) uwDEST_U(2,%1*8)<1>   ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2>  //U pixels
+        mov (8) uwDEST_V(2,%1*8)<1>   ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16)<16;8,2>    //V pixels
+    }
+////////////////////////////////////// Save the STMM Data for Next Run /////////////////////////
+    // Write STMM to memory: build header (X origin / 2, Y origin, block size), copy STMM response into the payload, send to the data port
+    shr (1)     rMSGSRC.0<1>:ud        wORIX<0;1,0>:w            1:w     // X origin / 2
+    mov (1)     rMSGSRC.1<1>:ud        wORIY<0;1,0>:w                    // Y origin
+    mov (1)     rMSGSRC.2<1>:ud        nDPW_BLOCK_SIZE_STMM:ud           // block width and height (8x4)
+    mov (8)     mudMSGHDR_STMM(0)<1>   rMSGSRC.0<8;8,1>:ud               // message header
+    mov (8)     mudMSGHDR_STMM(1)<1>   udRESP(nDI_STMM_OFFSET,0)         // Move STMM to MRF
+    send (8)    dNULLREG               mMSGHDR_STMM              udDUMMY_NULL    nDATAPORT_WRITE     nDPMW_MSGDSC+nDPMW_MSG_LEN_STMM+nBI_STMM_HISTORY_OUTPUT:ud
+////////////////////////////////////// Save the History Data for Next Run /////////////////////////
+    #include "DI_Hist_Save.asm"
+////////////////////////////////////// Save the DN Curr Frame for Next Run ////////////////////////
+    add (4)     pCF_Y_OFFSET<1>:uw          ubSRC_CF_OFFSET<4;4,1>:ub  npDN_YUV:w
+    // check top/bottom field first: branch to TOP_FIELD_FIRST when ubTFLD_FIRST == 1, otherwise fall through
+    cmp.e.f0.0 (1)  null<1>:w               ubTFLD_FIRST<0;1,0>:ub     1:w
+    (f0.0) jmpi (1) TOP_FIELD_FIRST
+BOTTOM_FIELD_FIRST:
+    $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (4)     mudMSGHDR_DN(1,%1*4)<1>     udRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*4)<4;4,1> // 2nd field luma from current frame (line 0,2)
+        mov (4)     mudMSGHDR_DN(1,%1*4+4)<1>   udRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,4)<4;4,1> // 1st field luma from current frame (line 1,3)
+    }
+    jmpi (1) SAVE_DN_CURR
+TOP_FIELD_FIRST:
+    $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (4)     mudMSGHDR_DN(1,%1*4)<1>     udRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,0)<4;4,1> // 1st field luma from current frame (line 0,2)
+        mov (4)     mudMSGHDR_DN(1,%1*4+4)<1>   udRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*4)<4;4,1> // 2nd field luma from current frame (line 1,3)
+    }
+SAVE_DN_CURR:
+    mov (2)     rMSGSRC.0<1>:ud        wORIX<2;2,1>:w               // X origin and Y origin
+    mov (1)     rMSGSRC.2<1>:ud        nDPW_BLOCK_SIZE_DN:ud        // block width and height (16x4)
+    mov (8)     mudMSGHDR_DN(0)<1>     rMSGSRC.0<8;8,1>:ud
+    send (8)    dNULLREG    mMSGHDR_DN   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPMW_MSG_LEN_PL_DN_DI+nBI_DESTINATION_Y:ud
+/////////////////////////////NV11 UV Copy/////////////////////////////////////////////////////
+                //Read UV through DATAPORT
+    add  (2) rMSGSRC.0<1>:d     wORIX<2;2,1>:w    wSRC_H_ORI_OFFSET<2;2,1>:w       // Source Y Block origin
+    asr (1)  rMSGSRC.0<1>:d     rMSGSRC.0<0;1,0>:d       1:w   // U/V block X origin is half of Y's (Y origin left unchanged)
+    mov (1)  rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_UV:ud          // U/V block width and height (8x4, as defined at the top of this file)
+    mov  (8) mudMSGHDR_DN<1>     rMSGSRC<8;8,1>:ud
+    send (8) udBOT_U_IO(0)<1>     mMSGHDR_DN    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nRESLEN_1+nBI_CURRENT_SRC_UV:ud
+                //Write UV through DATAPORT
+                mov (2)     rMSGSRC.0<1>:ud        wORIX<2;2,1>:w               // X origin and Y origin
+                asr (1)     rMSGSRC.0<1>:d    rMSGSRC.0<0;1,0>:d    1:w  // U/V block X origin is half of Y's (Y origin left unchanged)
+    mov (1)     rMSGSRC.2<1>:ud        nDPR_BLOCK_SIZE_UV:ud        // block width and height (8x4, as defined at the top of this file)
+    mov (8)     mudMSGHDR_DN(0)<1>     rMSGSRC.0<8;8,1>:ud
+    mov (8)                     mudMSGHDR_DN(1)<1>               udBOT_U_IO(0)<8;8,1>
+    send (8)    dNULLREG    mMSGHDR_DN   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nMSGLEN_1+nBI_DESTINATION_UV:ud

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/PL_DNDI_ALG_UVCopy_NV12.asm
 ,0 → 1,107
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+#define DI_ENABLE
+    #include "DNDI.inc"
+    #undef  nY_NUM_OF_ROWS
+    #define nY_NUM_OF_ROWS      8       // Number of Y rows per block (4 rows for each frame)
+    #undef  nUV_NUM_OF_ROWS
+    #define nUV_NUM_OF_ROWS     8       // Number of U/V rows per block
+    #undef  nSMPL_RESP_LEN
+    #define nSMPL_RESP_LEN          nSMPL_RESP_LEN_DNDI               // set the number of GRF
+    #undef  nDPW_BLOCK_SIZE_HIST
+    #define nDPW_BLOCK_SIZE_HIST    nBLOCK_WIDTH_4+nBLOCK_HEIGHT_1    // HIST Block Size for Write (width 4, height 1). NOTE(review): comment previously said 4x2 - confirm
+    #undef  nDPW_BLOCK_SIZE_DN
+    #define nDPW_BLOCK_SIZE_DN      nBLOCK_WIDTH_16+nBLOCK_HEIGHT_4   // DN Block Size for Write is 16x4
+    #undef  nDPR_BLOCK_SIZE_UV
+    #define nDPR_BLOCK_SIZE_UV                  nBLOCK_WIDTH_16+nBLOCK_HEIGHT_2   // DN Block Size for UV Write/Read is 16x2
+////////////////////////////////////// Run the DN Algorithm (body comes from the included file) ///////////////////////////////////////
+    #include "DNDI_COMMAND.asm"    // NOTE(review): sibling kernels include "DNDI_Command.asm"; the case differs, which matters on case-sensitive file systems - confirm the on-disk name
+////////////////////////////////////// Rearrange for Internal Planar //////////////////////////////
+    // move the previous frame Y component to internal planar format (bytes widened to words)
+    $for (0; <nY_NUM_OF_ROWS/2; 1) {
+        mov (16) uwDEST_Y(%1,0)<1>    ubRESP(nDI_PREV_FRAME_LUMA_OFFSET,%1*16)
+    }
+    // move the previous frame U,V components to internal planar format (de-interleave: odd bytes to U, even bytes to V)
+    $for (0; <nUV_NUM_OF_ROWS/2; 1) {
+        mov (8) uwDEST_U(0,%1*8)<1>   ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2>  //U pixels
+        mov (8) uwDEST_V(0,%1*8)<1>   ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16)<16;8,2>    //V pixels
+    }
+    // move the current frame Y component to internal planar format (destination registers 4-7)
+    $for (0; <nY_NUM_OF_ROWS/2; 1) {
+        mov (16) uwDEST_Y(%1+4,0)<1>  ubRESP(nDI_CURR_FRAME_LUMA_OFFSET,%1*16)
+    }
+    // move the current frame U,V components to internal planar format (destination register 2 onward)
+    $for (0; <nUV_NUM_OF_ROWS/2; 1) {
+        mov (8) uwDEST_U(2,%1*8)<1>   ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2>  //U pixels
+        mov (8) uwDEST_V(2,%1*8)<1>   ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16)<16;8,2>    //V pixels
+    }
+////////////////////////////////////// Save the STMM Data for Next Run /////////////////////////
+    // Write STMM to memory: build header (X origin / 2, Y origin, block size), copy STMM response into the payload, send to the data port
+    shr (1)     rMSGSRC.0<1>:ud        wORIX<0;1,0>:w            1:w     // X origin / 2
+    mov (1)     rMSGSRC.1<1>:ud        wORIY<0;1,0>:w                    // Y origin
+    mov (1)     rMSGSRC.2<1>:ud        nDPW_BLOCK_SIZE_STMM:ud           // block width and height (8x4)
+    mov (8)     mudMSGHDR_STMM(0)<1>   rMSGSRC.0<8;8,1>:ud               // message header
+    mov (8)     mudMSGHDR_STMM(1)<1>   udRESP(nDI_STMM_OFFSET,0)         // Move STMM to MRF
+    send (8)    dNULLREG               mMSGHDR_STMM              udDUMMY_NULL    nDATAPORT_WRITE     nDPMW_MSGDSC+nDPMW_MSG_LEN_STMM+nBI_STMM_HISTORY_OUTPUT:ud
+////////////////////////////////////// Save the History Data for Next Run /////////////////////////
+    #include "DI_Hist_Save.asm"
+////////////////////////////////////// Save the DN Curr Frame for Next Run ////////////////////////
+    add (4)     pCF_Y_OFFSET<1>:uw          ubSRC_CF_OFFSET<4;4,1>:ub  npDN_YUV:w
+    // check top/bottom field first: branch to TOP_FIELD_FIRST when ubTFLD_FIRST == 1, otherwise fall through
+    cmp.e.f0.0 (1)  null<1>:w               ubTFLD_FIRST<0;1,0>:ub     1:w
+    (f0.0) jmpi (1) TOP_FIELD_FIRST
+BOTTOM_FIELD_FIRST:
+    $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (4)     mudMSGHDR_DN(1,%1*4)<1>     udRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*4)<4;4,1> // 2nd field luma from current frame (line 0,2)
+        mov (4)     mudMSGHDR_DN(1,%1*4+4)<1>   udRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,4)<4;4,1> // 1st field luma from current frame (line 1,3)
+    }
+    jmpi (1) SAVE_DN_CURR
+TOP_FIELD_FIRST:
+    $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (4)     mudMSGHDR_DN(1,%1*4)<1>     udRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,0)<4;4,1> // 1st field luma from current frame (line 0,2)
+        mov (4)     mudMSGHDR_DN(1,%1*4+4)<1>   udRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*4)<4;4,1> // 2nd field luma from current frame (line 1,3)
+    }
+SAVE_DN_CURR:
+    $for (0; <nY_NUM_OF_ROWS/2; 1) {
+        mov (16) mubMSGHDR_DN(1, %1*16)<1>       ubRESP(nDI_CURR_FRAME_LUMA_OFFSET,%1*16) // NOTE(review): appears to rewrite the payload bytes 0..63 that the field-ordered moves above just filled - confirm this is intended
+    }
+    mov (2)     rMSGSRC.0<1>:ud        wORIX<2;2,1>:w               // X origin and Y origin
+    mov (1)     rMSGSRC.2<1>:ud        nDPW_BLOCK_SIZE_DN:ud        // block width and height (16x4)
+    mov (8)     mudMSGHDR_DN(0)<1>     rMSGSRC.0<8;8,1>:ud
+    send (8)    dNULLREG    mMSGHDR_DN   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPMW_MSG_LEN_PL_DN_DI+nBI_DESTINATION_Y:ud
+/////////////////////////////NV12 UV Copy/////////////////////////////////////////////////////
+                //Read UV through DATAPORT
+    add  (2) rMSGSRC.0<1>:d     wORIX<2;2,1>:w    wSRC_H_ORI_OFFSET<2;2,1>:w       // Source Y Block origin
+    asr (1)  rMSGSRC.1<1>:d     rMSGSRC.1<0;1,0>:d       1:w   // U/V block Y origin is half of Y's (X origin left unchanged)
+    mov (1)  rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_UV:ud          // U/V block width and height (16x2)
+    mov  (8) mudMSGHDR_DN<1>     rMSGSRC<8;8,1>:ud
+    send (8) udBOT_U_IO(0)<1>     mMSGHDR_DN    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nRESLEN_1+nBI_CURRENT_SRC_UV:ud
+                //Write UV through DATAPORT
+                mov (2)     rMSGSRC.0<1>:ud        wORIX<2;2,1>:w               // X origin and Y origin
+                asr (1)     rMSGSRC.1<1>:d         rMSGSRC.1<0;1,0>:d    1:w  // U/V block Y origin is half of Y's (X origin left unchanged)
+    mov (1)     rMSGSRC.2<1>:ud        nDPR_BLOCK_SIZE_UV:ud        // block width and height (16x2)
+    mov (8)     mudMSGHDR_DN(0)<1>     rMSGSRC.0<8;8,1>:ud
+    mov (8)                     mudMSGHDR_DN(1)<1>               udBOT_U_IO(0)<8;8,1>
+    send (8)    dNULLREG    mMSGHDR_DN   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nMSGLEN_1+nBI_DESTINATION_UV:ud

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/PL_DNDI_ALG_UVCopy_P208.asm
 ,0 → 1,101
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+#define DI_ENABLE
+    #include "DNDI.inc"
+    #undef  nY_NUM_OF_ROWS
+    #define nY_NUM_OF_ROWS      8       // Number of Y rows per block (4 rows for each frame)
+    #undef  nUV_NUM_OF_ROWS
+    #define nUV_NUM_OF_ROWS     8       // Number of U/V rows per block
+    #undef  nSMPL_RESP_LEN
+    #define nSMPL_RESP_LEN          nSMPL_RESP_LEN_DNDI               // set the number of GRF
+    #undef  nDPW_BLOCK_SIZE_HIST
+    #define nDPW_BLOCK_SIZE_HIST    nBLOCK_WIDTH_4+nBLOCK_HEIGHT_1    // HIST Block Size for Write (width 4, height 1). NOTE(review): comment previously said 4x2 - confirm
+    #undef  nDPW_BLOCK_SIZE_DN
+    #define nDPW_BLOCK_SIZE_DN      nBLOCK_WIDTH_16+nBLOCK_HEIGHT_4   // DN Block Size for Write is 16x4
+////////////////////////////////////// Run the DN Algorithm (body comes from the included file) ///////////////////////////////////////
+    #include "DNDI_Command.asm"
+////////////////////////////////////// Rearrange for Internal Planar //////////////////////////////
+    // move the previous frame Y component to internal planar format (bytes widened to words)
+    $for (0; <nY_NUM_OF_ROWS/2; 1) {
+        mov (16) uwDEST_Y(%1,0)<1>    ubRESP(nDI_PREV_FRAME_LUMA_OFFSET,%1*16)
+    }
+    // move the previous frame U,V components to internal planar format (de-interleave: odd bytes to U, even bytes to V)
+    $for (0; <nUV_NUM_OF_ROWS/2; 1) {
+        mov (8) uwDEST_U(0,%1*8)<1>   ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2>  //U pixels
+        mov (8) uwDEST_V(0,%1*8)<1>   ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16)<16;8,2>    //V pixels
+    }
+    // move the current frame Y component to internal planar format (destination registers 4-7)
+    $for (0; <nY_NUM_OF_ROWS/2; 1) {
+        mov (16) uwDEST_Y(%1+4,0)<1>  ubRESP(nDI_CURR_FRAME_LUMA_OFFSET,%1*16)
+    }
+    // move the current frame U,V components to internal planar format (destination register 2 onward)
+    $for (0; <nUV_NUM_OF_ROWS/2; 1) {
+        mov (8) uwDEST_U(2,%1*8)<1>   ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2>  //U pixels
+        mov (8) uwDEST_V(2,%1*8)<1>   ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16)<16;8,2>    //V pixels
+    }
+////////////////////////////////////// Save the STMM Data for Next Run /////////////////////////
+    // Write STMM to memory: build header (X origin / 2, Y origin, block size), copy STMM response into the payload, send to the data port
+    shr (1)     rMSGSRC.0<1>:ud        wORIX<0;1,0>:w            1:w     // X origin / 2
+    mov (1)     rMSGSRC.1<1>:ud        wORIY<0;1,0>:w                    // Y origin
+    mov (1)     rMSGSRC.2<1>:ud        nDPW_BLOCK_SIZE_STMM:ud           // block width and height (8x4)
+    mov (8)     mudMSGHDR_STMM(0)<1>   rMSGSRC.0<8;8,1>:ud               // message header
+    mov (8)     mudMSGHDR_STMM(1)<1>   udRESP(nDI_STMM_OFFSET,0)         // Move STMM to MRF
+    send (8)    dNULLREG               mMSGHDR_STMM              udDUMMY_NULL    nDATAPORT_WRITE     nDPMW_MSGDSC+nDPMW_MSG_LEN_STMM+nBI_STMM_HISTORY_OUTPUT:ud
+////////////////////////////////////// Save the History Data for Next Run /////////////////////////
+    #include "DI_Hist_Save.asm"
+////////////////////////////////////// Save the DN Curr Frame for Next Run ////////////////////////
+    add (4)     pCF_Y_OFFSET<1>:uw          ubSRC_CF_OFFSET<4;4,1>:ub  npDN_YUV:w
+    // check top/bottom field first: branch to TOP_FIELD_FIRST when ubTFLD_FIRST == 1, otherwise fall through
+    cmp.e.f0.0 (1)  null<1>:w               ubTFLD_FIRST<0;1,0>:ub     1:w
+    (f0.0) jmpi (1) TOP_FIELD_FIRST
+BOTTOM_FIELD_FIRST:
+    $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (4)     mudMSGHDR_DN(1,%1*4)<1>     udRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*4)<4;4,1> // 2nd field luma from current frame (line 0,2)
+        mov (4)     mudMSGHDR_DN(1,%1*4+4)<1>   udRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,4)<4;4,1> // 1st field luma from current frame (line 1,3)
+    }
+    jmpi (1) SAVE_DN_CURR
+TOP_FIELD_FIRST:
+    $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (4)     mudMSGHDR_DN(1,%1*4)<1>     udRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,0)<4;4,1> // 1st field luma from current frame (line 0,2)
+        mov (4)     mudMSGHDR_DN(1,%1*4+4)<1>   udRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*4)<4;4,1> // 2nd field luma from current frame (line 1,3)
+    }
+SAVE_DN_CURR:
+    mov (2)     rMSGSRC.0<1>:ud        wORIX<2;2,1>:w               // X origin and Y origin
+    mov (1)     rMSGSRC.2<1>:ud        nDPW_BLOCK_SIZE_DN:ud        // block width and height (16x4)
+    mov (8)     mudMSGHDR_DN(0)<1>     rMSGSRC.0<8;8,1>:ud
+    send (8)    dNULLREG    mMSGHDR_DN   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPMW_MSG_LEN_PL_DN_DI+nBI_DESTINATION_Y:ud
+/////////////////////////////P208 UV Copy 422/////////////////////////////////////////////////////
+                //Read UV through DATAPORT (two response registers, same origin and block size as Y)
+    add  (2) rMSGSRC.0<1>:d     wORIX<2;2,1>:w    wSRC_H_ORI_OFFSET<2;2,1>:w       // Source Y Block origin
+    mov  (1) rMSGSRC.2<1>:ud    nDPW_BLOCK_SIZE_DN:ud                                                   // Y Block width and height (16x4) (U/V block size is the same)
+    mov  (8) mudMSGHDR_DN<1>     rMSGSRC<8;8,1>:ud
+    send (8) udBOT_U_IO(0)<1>     mMSGHDR_DN    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nRESLEN_2+nBI_CURRENT_SRC_UV:ud
+                //Write UV through DATAPORT (header plus the two registers just read)
+                mov (2)     rMSGSRC.0<1>:ud        wORIX<2;2,1>:w               // X origin and Y origin
+    mov (1)     rMSGSRC.2<1>:ud        nDPW_BLOCK_SIZE_DN:ud        // block width and height (16x4)
+    mov (8)     mudMSGHDR_DN(0)<1>     rMSGSRC.0<8;8,1>:ud
+    mov (8)                     mudMSGHDR_DN(1)<1>               udBOT_U_IO(0)<8;8,1>
+    mov (8)                     mudMSGHDR_DN(2)<1>               udBOT_U_IO(1)<8;8,1>
+    send (8)    dNULLREG    mMSGHDR_DN   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPMW_MSG_LEN_PL_DN_DI+nBI_DESTINATION_UV:ud

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/PL_DNDI_ALG_UVCopy_PL3.asm
 ,0 → 1,106
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Kernel configuration: DI_ENABLE is defined before DNDI.inc is included, then the
+// block-geometry constants are overridden for this kernel with #undef/#define pairs.
+#define DI_ENABLE
+    #include "DNDI.inc"
+    #undef  nY_NUM_OF_ROWS
+    #define nY_NUM_OF_ROWS      8       // Number of Y rows per block (4 rows for each frame)
+    #undef  nUV_NUM_OF_ROWS
+    #define nUV_NUM_OF_ROWS     8       // Number of U/V rows per block
+    #undef  nSMPL_RESP_LEN
+    #define nSMPL_RESP_LEN          nSMPL_RESP_LEN_DNDI               // set the number of GRF
+    #undef  nDPW_BLOCK_SIZE_HIST
+    #define nDPW_BLOCK_SIZE_HIST    nBLOCK_WIDTH_4+nBLOCK_HEIGHT_1    // HIST Block Size for Write is 4x1 (per the macro names; comment previously said 4x2 - NOTE(review): confirm)
+    #undef  nDPW_BLOCK_SIZE_DN
+    #define nDPW_BLOCK_SIZE_DN      nBLOCK_WIDTH_16+nBLOCK_HEIGHT_4   // DN Block Size for Write is 16x4
+    #undef  nDPR_BLOCK_SIZE_UV
+    #define nDPR_BLOCK_SIZE_UV                  nBLOCK_WIDTH_8+nBLOCK_HEIGHT_2   //  DN Block Size for UV Write/Read is 8x2
+////////////////////////////////////// Run the DN Algorithm ///////////////////////////////////////
+    #include "DNDI_Command.asm"
+////////////////////////////////////// Rearrange for Internal Planar //////////////////////////////
+    // Each loop below iterates %1 over half of the configured row count (the loop bound is
+    // nY_NUM_OF_ROWS/2 or nUV_NUM_OF_ROWS/2). Previous-frame data lands at DEST_Y rows 0..,
+    // DEST_U/DEST_V register 0; current-frame data at DEST_Y rows 4.., DEST_U/DEST_V register 2.
+    // move the previous frame Y component to internal planar format
+    $for (0; <nY_NUM_OF_ROWS/2; 1) {
+        mov (16) uwDEST_Y(%1,0)<1>    ubRESP(nDI_PREV_FRAME_LUMA_OFFSET,%1*16)
+    }
+    // move the previous frame U,V components to internal planar format
+    // The <16;8,2> source region takes every second element of a 16-element row:
+    // start offset +1 selects U, start offset +0 selects V.
+    $for (0; <nUV_NUM_OF_ROWS/2; 1) {
+        mov (8) uwDEST_U(0,%1*8)<1>   ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2>  //U pixels
+        mov (8) uwDEST_V(0,%1*8)<1>   ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16)<16;8,2>    //V pixels
+    }
+    // move the current frame Y component to internal planar format
+    $for (0; <nY_NUM_OF_ROWS/2; 1) {
+        mov (16) uwDEST_Y(%1+4,0)<1>  ubRESP(nDI_CURR_FRAME_LUMA_OFFSET,%1*16)
+    }
+    // move the current frame U,V components to internal planar format
+    // Same U/V selection as for the previous frame, written two registers further on.
+    $for (0; <nUV_NUM_OF_ROWS/2; 1) {
+        mov (8) uwDEST_U(2,%1*8)<1>   ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2>  //U pixels
+        mov (8) uwDEST_V(2,%1*8)<1>   ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16)<16;8,2>    //V pixels
+    }
+////////////////////////////////////// Save the STMM Data for Next Run /////////////////////////
+    // Write STMM to memory
+    // The header is assembled in rMSGSRC (dwords 0..2 = X, Y, block size), copied to the
+    // message registers together with the STMM part of the response, then sent to the
+    // data port as a write with a null destination.
+    shr (1)     rMSGSRC.0<1>:ud        wORIX<0;1,0>:w            1:w     // X origin / 2
+    mov (1)     rMSGSRC.1<1>:ud        wORIY<0;1,0>:w                    // Y origin
+    mov (1)     rMSGSRC.2<1>:ud        nDPW_BLOCK_SIZE_STMM:ud           // block width and height (8x4)
+    mov (8)     mudMSGHDR_STMM(0)<1>   rMSGSRC.0<8;8,1>:ud               // message header
+    mov (8)     mudMSGHDR_STMM(1)<1>   udRESP(nDI_STMM_OFFSET,0)         // Move STMM to MRF
+    send (8)    dNULLREG               mMSGHDR_STMM              udDUMMY_NULL    nDATAPORT_WRITE     nDPMW_MSGDSC+nDPMW_MSG_LEN_STMM+nBI_STMM_HISTORY_OUTPUT:ud
+////////////////////////////////////// Save the History Data for Next Run /////////////////////////
+    #include "DI_Hist_Save.asm"
+////////////////////////////////////// Save the DN Curr Frame for Next Run ////////////////////////
+    // Add npDN_YUV to the four byte values at ubSRC_CF_OFFSET and store the sums as words
+    // at pCF_Y_OFFSET.
+    add (4)     pCF_Y_OFFSET<1>:uw          ubSRC_CF_OFFSET<4;4,1>:ub  npDN_YUV:w
+    // check top/bottom field first
+    // f0.0 is set when ubTFLD_FIRST == 1; in that case jump to TOP_FIELD_FIRST,
+    // otherwise fall through into BOTTOM_FIELD_FIRST.
+    cmp.e.f0.0 (1)  null<1>:w               ubTFLD_FIRST<0;1,0>:ub     1:w
+    (f0.0) jmpi (1) TOP_FIELD_FIRST
+    // Both branches fill the same payload slots of mudMSGHDR_DN(1,...): %1 steps 0,2 and
+    // %2 steps 0,1, so each iteration writes one even and one odd line. The branches differ
+    // only in which response area feeds the even lines and which feeds the odd lines.
+BOTTOM_FIELD_FIRST:
+    $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (4)     mudMSGHDR_DN(1,%1*4)<1>     udRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*4)<4;4,1> // 2nd field luma from current frame (line 0,2)
+        mov (4)     mudMSGHDR_DN(1,%1*4+4)<1>   udRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,4)<4;4,1> // 1st field luma from current frame (line 1,3)
+    }
+    jmpi (1) SAVE_DN_CURR
+TOP_FIELD_FIRST:
+    $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (4)     mudMSGHDR_DN(1,%1*4)<1>     udRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,0)<4;4,1> // 1st field luma from current frame (line 0,2)
+        mov (4)     mudMSGHDR_DN(1,%1*4+4)<1>   udRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*4)<4;4,1> // 2nd field luma from current frame (line 1,3)
+    }
+SAVE_DN_CURR:
+    // Build the write header (X/Y origin, block size) and send the payload assembled above.
+    mov (2)     rMSGSRC.0<1>:ud        wORIX<2;2,1>:w               // X origin and Y origin
+    mov (1)     rMSGSRC.2<1>:ud        nDPW_BLOCK_SIZE_DN:ud        // block width and height (16x4)
+    mov (8)     mudMSGHDR_DN(0)<1>     rMSGSRC.0<8;8,1>:ud
+    send (8)    dNULLREG    mMSGHDR_DN   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPMW_MSG_LEN_PL_DN_DI+nBI_DESTINATION_Y:ud
+/////////////////////////////IMC3 UV Copy 422/////////////////////////////////////////////////////
+    // Copies one U block and one V block from the current source surfaces to the
+    // destination surfaces through the data port: two reads sharing one header,
+    // then two writes sharing one header with the payload register refilled in between.
+                //Read UV through DATAPORT
+    add  (2) rMSGSRC.0<1>:d     wORIX<2;2,1>:w    wSRC_H_ORI_OFFSET<2;2,1>:w       // Source Y Block origin
+    asr (2)  rMSGSRC.0<1>:d     rMSGSRC.0<2;2,1>:d       1:w   // U/V block origin should be half of Y's
+    mov (1)  rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_UV:ud          // U/V block width and height (8x2)
+    mov  (8) mudMSGHDR_DN<1>     rMSGSRC<8;8,1>:ud
+    send (4) udBOT_U_IO(0)<1>       mMSGHDR_DN    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nRESLEN_1+nBI_CURRENT_SRC_U:ud
+    send (4) udBOT_V_IO(0)<1>       mMSGHDR_DN    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nRESLEN_1+nBI_CURRENT_SRC_V:ud
+                //Write UV through DATAPORT
+    // The destination origin is derived directly from wORIX/wORIY (no source offset added).
+    // A preceding "mov (2) rMSGSRC.0 <- wORIX" was removed: the asr below writes the same
+    // two dwords from wORIX itself, so that mov's result was never read.
+    asr  (2)    rMSGSRC.0<1>:d     wORIX<2;2,1>:w    1:w   // U/V block origin should be half of Y's
+    mov (1)     rMSGSRC.2<1>:ud        nDPR_BLOCK_SIZE_UV:ud        // block width and height (8x2)
+    mov (8)     mudMSGHDR_DN(0)<1>     rMSGSRC.0<8;8,1>:ud
+    mov (4)                     mudMSGHDR_DN(1)<1>               udBOT_U_IO(0)<4;4,1>    // U payload
+    send (4)    dNULLREG    mMSGHDR_DN   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nMSGLEN_1+nBI_DESTINATION_U:ud
+    mov (4)                     mudMSGHDR_DN(1)<1>               udBOT_V_IO(0)<4;4,1>    // V payload (header reused)
+    send (4)    dNULLREG    mMSGHDR_DN   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nMSGLEN_1+nBI_DESTINATION_V:ud

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/PL_DN_ALG.asm
 ,0 → 1,35
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Kernel configuration: DI_DISABLE is defined before DNDI.inc is included, then the
+// row count, response length and write block sizes are overridden for this kernel.
+#define DI_DISABLE
+#include "DNDI.inc"
+#undef  nY_NUM_OF_ROWS
+#define nY_NUM_OF_ROWS         8                                 // Number of Y rows per block
+#undef   nSMPL_RESP_LEN
+#define  nSMPL_RESP_LEN        nSMPL_RESP_LEN_DN_PL              // Set the Number of GRFs in DNDI response
+#undef   nDPW_BLOCK_SIZE_DN
+#define  nDPW_BLOCK_SIZE_DN    nBLOCK_WIDTH_16+nBLOCK_HEIGHT_8   // DN Curr Block Size for Write is 16x8
+#undef   nDPW_BLOCK_SIZE_HIST
+#define  nDPW_BLOCK_SIZE_HIST  nBLOCK_WIDTH_4+nBLOCK_HEIGHT_2    // HIST Block Size for Write is 4x2
+////////////////////////////////////// Run the DN Algorithm ///////////////////////////////////////
+// NOTE(review): PL_DNDI_ALG_UVCopy_PL3.asm spells this include "DNDI_Command.asm"; on a
+// case-sensitive file system only one spelling can match the file on disk - confirm.
+#include "DNDI_COMMAND.asm"
+////////////////////////////////////// Rearrange for Internal Planar //////////////////////////////
+// One mov per Y row: 16 elements of the response at nNODI_LUMA_OFFSET are copied into
+// DEST_Y at element offset %1*16.
+$for (0; <nY_NUM_OF_ROWS; 1) {
+    mov (16)    uwDEST_Y(0,%1*16)<1>   ubRESP(nNODI_LUMA_OFFSET,%1*16)<16;16,1>       // copy line of Y
+}
+////////////////////////////////////// Save the History Data for Next Run /////////////////////////
+#include "DNDI_Hist_Save.asm"

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/RGB_AVS_IEF_16x8.asm
 ,0 → 1,33
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+//---------- RGB_AVS_IEF_16x8.asm ----------
+#include "AVS_IEF.inc"
+//------------------------------------------------------------------------------
+// 2 sampler reads for 8x8 ARGB packed
+//------------------------------------------------------------------------------
+// Each read copies the GRF mirrors (header + payload, 16 dwords) into a message register
+// pair and issues one send to the sampler. The first read uses mAVS_8x8_HDR, the second
+// uses mAVS_8x8_HDR_2, and each has its own response area.
+    // 1st 8x8 setup
+    #include "AVS_SetupFirstBlock.asm"
+    mov (1)  rAVS_8x8_HDR.2:ud      nAVS_ALL_CHANNELS:ud   // Enable ARGB channels
+    mov (16) mAVS_8x8_HDR.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(0)<1>   mAVS_8x8_HDR    udDUMMY_NULL    nSMPL_ENGINE    nAVS_MSG_DSC_4CH+nSI_SRC_RGB+nBI_CURRENT_SRC_YUV
+    // Return ARGB in 16 GRFs
+    // 2nd 8x8 setup
+    // The channel mask is not rewritten here; presumably AVS_SetupSecondBlock.asm leaves
+    // rAVS_8x8_HDR.2 untouched so the mask set above still applies - TODO confirm.
+    #include "AVS_SetupSecondBlock.asm"
+    mov (16) mAVS_8x8_HDR_2.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE_2(0)<1> mAVS_8x8_HDR_2    udDUMMY_NULL    nSMPL_ENGINE    nAVS_MSG_DSC_4CH+nSI_SRC_RGB+nBI_CURRENT_SRC_YUV
+    // Return ARGB in 16 GRFs
+    // Return ARGB in 16 GRFs

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/RGB_AVS_IEF_Unpack_16x8.asm
 ,0 → 1,251
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+//---------- RGB_AVS_IEF_Unpack_16x8.asm ----------
+#include "AVS_IEF.inc"
+#ifdef AVS_OUTPUT_16_BIT
+// 16-bit path: interleave the per-channel sampler responses into packed 4-word pixels.
+//   * Every mov copies 4 consecutive words of one channel into DEST_Y with a destination
+//     stride of 4 words; the destination sub-register selects the channel slot:
+//     0 = R, 1 = G, 2 = B, 3 = A.
+//   * Source registers per channel: R = 0,1,8,9   G = 2,3,10,11   B = 4,5,12,13   A = 6,7,14,15.
+//   * Within a source register the word groups are consumed in the order 0, 8, 4, 12.
+//   * The first response (uwAVS_RESPONSE) fills DEST_Y rows 0,1,4,5,8,9,...; the second
+//     response (uwAVS_RESPONSE_2) fills the rows in between: 2,3,6,7,10,11,...
+// Move first 8x8 words of B to dest GRF (as packed)
+    mov (4) uwDEST_Y(0,2)<4>       uwAVS_RESPONSE(4,0)<4;4,1>
+    mov (4) uwDEST_Y(1,2)<4>       uwAVS_RESPONSE(4,8)<4;4,1>
+    mov (4) uwDEST_Y(4,2)<4>       uwAVS_RESPONSE(4,4)<4;4,1>
+    mov (4) uwDEST_Y(5,2)<4>       uwAVS_RESPONSE(4,12)<4;4,1>
+    mov (4) uwDEST_Y(8,2)<4>       uwAVS_RESPONSE(5,0)<4;4,1>
+    mov (4) uwDEST_Y(9,2)<4>       uwAVS_RESPONSE(5,8)<4;4,1>
+    mov (4) uwDEST_Y(12,2)<4>      uwAVS_RESPONSE(5,4)<4;4,1>
+    mov (4) uwDEST_Y(13,2)<4>      uwAVS_RESPONSE(5,12)<4;4,1>
+    mov (4) uwDEST_Y(16,2)<4>      uwAVS_RESPONSE(12,0)<4;4,1>
+    mov (4) uwDEST_Y(17,2)<4>      uwAVS_RESPONSE(12,8)<4;4,1>
+    mov (4) uwDEST_Y(20,2)<4>      uwAVS_RESPONSE(12,4)<4;4,1>
+    mov (4) uwDEST_Y(21,2)<4>      uwAVS_RESPONSE(12,12)<4;4,1>
+    mov (4) uwDEST_Y(24,2)<4>      uwAVS_RESPONSE(13,0)<4;4,1>
+    mov (4) uwDEST_Y(25,2)<4>      uwAVS_RESPONSE(13,8)<4;4,1>
+    mov (4) uwDEST_Y(28,2)<4>      uwAVS_RESPONSE(13,4)<4;4,1>
+    mov (4) uwDEST_Y(29,2)<4>      uwAVS_RESPONSE(13,12)<4;4,1>
+// Move first 8x8 words of G to dest GRF (as packed)
+    mov (4) uwDEST_Y(0,1)<4>       uwAVS_RESPONSE(2,0)<4;4,1>
+    mov (4) uwDEST_Y(1,1)<4>       uwAVS_RESPONSE(2,8)<4;4,1>
+    mov (4) uwDEST_Y(4,1)<4>       uwAVS_RESPONSE(2,4)<4;4,1>
+    mov (4) uwDEST_Y(5,1)<4>       uwAVS_RESPONSE(2,12)<4;4,1>
+    mov (4) uwDEST_Y(8,1)<4>       uwAVS_RESPONSE(3,0)<4;4,1>
+    mov (4) uwDEST_Y(9,1)<4>       uwAVS_RESPONSE(3,8)<4;4,1>
+    mov (4) uwDEST_Y(12,1)<4>      uwAVS_RESPONSE(3,4)<4;4,1>
+    mov (4) uwDEST_Y(13,1)<4>      uwAVS_RESPONSE(3,12)<4;4,1>
+    mov (4) uwDEST_Y(16,1)<4>      uwAVS_RESPONSE(10,0)<4;4,1>
+    mov (4) uwDEST_Y(17,1)<4>      uwAVS_RESPONSE(10,8)<4;4,1>
+    mov (4) uwDEST_Y(20,1)<4>      uwAVS_RESPONSE(10,4)<4;4,1>
+    mov (4) uwDEST_Y(21,1)<4>      uwAVS_RESPONSE(10,12)<4;4,1>
+    mov (4) uwDEST_Y(24,1)<4>      uwAVS_RESPONSE(11,0)<4;4,1>
+    mov (4) uwDEST_Y(25,1)<4>      uwAVS_RESPONSE(11,8)<4;4,1>
+    mov (4) uwDEST_Y(28,1)<4>      uwAVS_RESPONSE(11,4)<4;4,1>
+    mov (4) uwDEST_Y(29,1)<4>      uwAVS_RESPONSE(11,12)<4;4,1>
+// Move first 8x8 words of R to dest GRF (as packed)
+    mov (4) uwDEST_Y(0,0)<4>       uwAVS_RESPONSE(0,0)<4;4,1>
+    mov (4) uwDEST_Y(1,0)<4>       uwAVS_RESPONSE(0,8)<4;4,1>
+    mov (4) uwDEST_Y(4,0)<4>       uwAVS_RESPONSE(0,4)<4;4,1>
+    mov (4) uwDEST_Y(5,0)<4>       uwAVS_RESPONSE(0,12)<4;4,1>
+    mov (4) uwDEST_Y(8,0)<4>       uwAVS_RESPONSE(1,0)<4;4,1>
+    mov (4) uwDEST_Y(9,0)<4>       uwAVS_RESPONSE(1,8)<4;4,1>
+    mov (4) uwDEST_Y(12,0)<4>      uwAVS_RESPONSE(1,4)<4;4,1>
+    mov (4) uwDEST_Y(13,0)<4>      uwAVS_RESPONSE(1,12)<4;4,1>
+    mov (4) uwDEST_Y(16,0)<4>      uwAVS_RESPONSE(8,0)<4;4,1>
+    mov (4) uwDEST_Y(17,0)<4>      uwAVS_RESPONSE(8,8)<4;4,1>
+    mov (4) uwDEST_Y(20,0)<4>      uwAVS_RESPONSE(8,4)<4;4,1>
+    mov (4) uwDEST_Y(21,0)<4>      uwAVS_RESPONSE(8,12)<4;4,1>
+    mov (4) uwDEST_Y(24,0)<4>      uwAVS_RESPONSE(9,0)<4;4,1>
+    mov (4) uwDEST_Y(25,0)<4>      uwAVS_RESPONSE(9,8)<4;4,1>
+    mov (4) uwDEST_Y(28,0)<4>      uwAVS_RESPONSE(9,4)<4;4,1>
+    mov (4) uwDEST_Y(29,0)<4>      uwAVS_RESPONSE(9,12)<4;4,1>
+// Move first 8x8 words of A to dest GRF (as packed)
+    mov (4) uwDEST_Y(0,3)<4>       uwAVS_RESPONSE(6,0)<4;4,1>
+    mov (4) uwDEST_Y(1,3)<4>       uwAVS_RESPONSE(6,8)<4;4,1>
+    mov (4) uwDEST_Y(4,3)<4>       uwAVS_RESPONSE(6,4)<4;4,1>
+    mov (4) uwDEST_Y(5,3)<4>       uwAVS_RESPONSE(6,12)<4;4,1>
+    mov (4) uwDEST_Y(8,3)<4>       uwAVS_RESPONSE(7,0)<4;4,1>
+    mov (4) uwDEST_Y(9,3)<4>       uwAVS_RESPONSE(7,8)<4;4,1>
+    mov (4) uwDEST_Y(12,3)<4>      uwAVS_RESPONSE(7,4)<4;4,1>
+    mov (4) uwDEST_Y(13,3)<4>      uwAVS_RESPONSE(7,12)<4;4,1>
+    mov (4) uwDEST_Y(16,3)<4>      uwAVS_RESPONSE(14,0)<4;4,1>
+    mov (4) uwDEST_Y(17,3)<4>      uwAVS_RESPONSE(14,8)<4;4,1>
+    mov (4) uwDEST_Y(20,3)<4>      uwAVS_RESPONSE(14,4)<4;4,1>
+    mov (4) uwDEST_Y(21,3)<4>      uwAVS_RESPONSE(14,12)<4;4,1>
+    mov (4) uwDEST_Y(24,3)<4>      uwAVS_RESPONSE(15,0)<4;4,1>
+    mov (4) uwDEST_Y(25,3)<4>      uwAVS_RESPONSE(15,8)<4;4,1>
+    mov (4) uwDEST_Y(28,3)<4>      uwAVS_RESPONSE(15,4)<4;4,1>
+    mov (4) uwDEST_Y(29,3)<4>      uwAVS_RESPONSE(15,12)<4;4,1>
+// Move second 8x8 words of B to dest GRF
+    mov (4) uwDEST_Y(2,2)<4>       uwAVS_RESPONSE_2(4,0)<4;4,1>
+    mov (4) uwDEST_Y(3,2)<4>       uwAVS_RESPONSE_2(4,8)<4;4,1>
+    mov (4) uwDEST_Y(6,2)<4>       uwAVS_RESPONSE_2(4,4)<4;4,1>
+    mov (4) uwDEST_Y(7,2)<4>       uwAVS_RESPONSE_2(4,12)<4;4,1>
+    mov (4) uwDEST_Y(10,2)<4>      uwAVS_RESPONSE_2(5,0)<4;4,1>
+    mov (4) uwDEST_Y(11,2)<4>      uwAVS_RESPONSE_2(5,8)<4;4,1>
+    mov (4) uwDEST_Y(14,2)<4>      uwAVS_RESPONSE_2(5,4)<4;4,1>
+    mov (4) uwDEST_Y(15,2)<4>      uwAVS_RESPONSE_2(5,12)<4;4,1>
+    mov (4) uwDEST_Y(18,2)<4>      uwAVS_RESPONSE_2(12,0)<4;4,1>
+    mov (4) uwDEST_Y(19,2)<4>      uwAVS_RESPONSE_2(12,8)<4;4,1>
+    mov (4) uwDEST_Y(22,2)<4>      uwAVS_RESPONSE_2(12,4)<4;4,1>
+    mov (4) uwDEST_Y(23,2)<4>      uwAVS_RESPONSE_2(12,12)<4;4,1>
+    mov (4) uwDEST_Y(26,2)<4>      uwAVS_RESPONSE_2(13,0)<4;4,1>
+    mov (4) uwDEST_Y(27,2)<4>      uwAVS_RESPONSE_2(13,8)<4;4,1>
+    mov (4) uwDEST_Y(30,2)<4>      uwAVS_RESPONSE_2(13,4)<4;4,1>
+    mov (4) uwDEST_Y(31,2)<4>      uwAVS_RESPONSE_2(13,12)<4;4,1>
+// Move second 8x8 words of G to dest GRF
+    mov (4) uwDEST_Y(2,1)<4>       uwAVS_RESPONSE_2(2,0)<4;4,1>
+    mov (4) uwDEST_Y(3,1)<4>       uwAVS_RESPONSE_2(2,8)<4;4,1>
+    mov (4) uwDEST_Y(6,1)<4>       uwAVS_RESPONSE_2(2,4)<4;4,1>
+    mov (4) uwDEST_Y(7,1)<4>       uwAVS_RESPONSE_2(2,12)<4;4,1>
+    mov (4) uwDEST_Y(10,1)<4>      uwAVS_RESPONSE_2(3,0)<4;4,1>
+    mov (4) uwDEST_Y(11,1)<4>      uwAVS_RESPONSE_2(3,8)<4;4,1>
+    mov (4) uwDEST_Y(14,1)<4>      uwAVS_RESPONSE_2(3,4)<4;4,1>
+    mov (4) uwDEST_Y(15,1)<4>      uwAVS_RESPONSE_2(3,12)<4;4,1>
+    mov (4) uwDEST_Y(18,1)<4>      uwAVS_RESPONSE_2(10,0)<4;4,1>
+    mov (4) uwDEST_Y(19,1)<4>      uwAVS_RESPONSE_2(10,8)<4;4,1>
+    mov (4) uwDEST_Y(22,1)<4>      uwAVS_RESPONSE_2(10,4)<4;4,1>
+    mov (4) uwDEST_Y(23,1)<4>      uwAVS_RESPONSE_2(10,12)<4;4,1>
+    mov (4) uwDEST_Y(26,1)<4>      uwAVS_RESPONSE_2(11,0)<4;4,1>
+    mov (4) uwDEST_Y(27,1)<4>      uwAVS_RESPONSE_2(11,8)<4;4,1>
+    mov (4) uwDEST_Y(30,1)<4>      uwAVS_RESPONSE_2(11,4)<4;4,1>
+    mov (4) uwDEST_Y(31,1)<4>      uwAVS_RESPONSE_2(11,12)<4;4,1>
+// Move second 8x8 words of R to dest GRF
+    mov (4) uwDEST_Y(2,0)<4>       uwAVS_RESPONSE_2(0,0)<4;4,1>
+    mov (4) uwDEST_Y(3,0)<4>       uwAVS_RESPONSE_2(0,8)<4;4,1>
+    mov (4) uwDEST_Y(6,0)<4>       uwAVS_RESPONSE_2(0,4)<4;4,1>
+    mov (4) uwDEST_Y(7,0)<4>       uwAVS_RESPONSE_2(0,12)<4;4,1>
+    mov (4) uwDEST_Y(10,0)<4>      uwAVS_RESPONSE_2(1,0)<4;4,1>
+    mov (4) uwDEST_Y(11,0)<4>      uwAVS_RESPONSE_2(1,8)<4;4,1>
+    mov (4) uwDEST_Y(14,0)<4>      uwAVS_RESPONSE_2(1,4)<4;4,1>
+    mov (4) uwDEST_Y(15,0)<4>      uwAVS_RESPONSE_2(1,12)<4;4,1>
+    mov (4) uwDEST_Y(18,0)<4>      uwAVS_RESPONSE_2(8,0)<4;4,1>
+    mov (4) uwDEST_Y(19,0)<4>      uwAVS_RESPONSE_2(8,8)<4;4,1>
+    mov (4) uwDEST_Y(22,0)<4>      uwAVS_RESPONSE_2(8,4)<4;4,1>
+    mov (4) uwDEST_Y(23,0)<4>      uwAVS_RESPONSE_2(8,12)<4;4,1>
+    mov (4) uwDEST_Y(26,0)<4>      uwAVS_RESPONSE_2(9,0)<4;4,1>
+    mov (4) uwDEST_Y(27,0)<4>      uwAVS_RESPONSE_2(9,8)<4;4,1>
+    mov (4) uwDEST_Y(30,0)<4>      uwAVS_RESPONSE_2(9,4)<4;4,1>
+    mov (4) uwDEST_Y(31,0)<4>      uwAVS_RESPONSE_2(9,12)<4;4,1>
+// Move second 8x8 words of A to dest GRF
+    mov (4) uwDEST_Y(2,3)<4>       uwAVS_RESPONSE_2(6,0)<4;4,1>
+    mov (4) uwDEST_Y(3,3)<4>       uwAVS_RESPONSE_2(6,8)<4;4,1>
+    mov (4) uwDEST_Y(6,3)<4>       uwAVS_RESPONSE_2(6,4)<4;4,1>
+    mov (4) uwDEST_Y(7,3)<4>       uwAVS_RESPONSE_2(6,12)<4;4,1>
+    mov (4) uwDEST_Y(10,3)<4>      uwAVS_RESPONSE_2(7,0)<4;4,1>
+    mov (4) uwDEST_Y(11,3)<4>      uwAVS_RESPONSE_2(7,8)<4;4,1>
+    mov (4) uwDEST_Y(14,3)<4>      uwAVS_RESPONSE_2(7,4)<4;4,1>
+    mov (4) uwDEST_Y(15,3)<4>      uwAVS_RESPONSE_2(7,12)<4;4,1>
+    mov (4) uwDEST_Y(18,3)<4>      uwAVS_RESPONSE_2(14,0)<4;4,1>
+    mov (4) uwDEST_Y(19,3)<4>      uwAVS_RESPONSE_2(14,8)<4;4,1>
+    mov (4) uwDEST_Y(22,3)<4>      uwAVS_RESPONSE_2(14,4)<4;4,1>
+    mov (4) uwDEST_Y(23,3)<4>      uwAVS_RESPONSE_2(14,12)<4;4,1>
+    mov (4) uwDEST_Y(26,3)<4>      uwAVS_RESPONSE_2(15,0)<4;4,1>
+    mov (4) uwDEST_Y(27,3)<4>      uwAVS_RESPONSE_2(15,8)<4;4,1>
+    mov (4) uwDEST_Y(30,3)<4>      uwAVS_RESPONSE_2(15,4)<4;4,1>
+    mov (4) uwDEST_Y(31,3)<4>      uwAVS_RESPONSE_2(15,12)<4;4,1>
+#else   /* OUTPUT_8_BIT */
+// 8-bit path: keep one byte per 16-bit response word and interleave the channels into
+// packed 4-byte pixels.
+//   * The source region <16;4,2> starting at byte offset 1 reads the odd byte of words
+//     0-3 and 8-11 of a response register; starting at 8+1 it reads words 4-7 and 12-15.
+//   * Each mov writes 8 bytes with a destination stride of 4 bytes; the destination
+//     sub-register selects the channel slot: 0 = R, 1 = G, 2 = B, 3 = A.
+//   * Source registers per channel are the same as in the 16-bit path:
+//     R = 0,1,8,9   G = 2,3,10,11   B = 4,5,12,13   A = 6,7,14,15.
+//   * The first response fills the even DEST_Y rows (0,2,...,14), the second response
+//     fills the odd rows (1,3,...,15).
+// Move first 8x8 words of B to dest GRF
+    mov (8) ubDEST_Y(0,2)<4>       ubAVS_RESPONSE(4,1)<16;4,2>
+    mov (8) ubDEST_Y(2,2)<4>       ubAVS_RESPONSE(4,8+1)<16;4,2>
+    mov (8) ubDEST_Y(4,2)<4>       ubAVS_RESPONSE(5,1)<16;4,2>
+    mov (8) ubDEST_Y(6,2)<4>       ubAVS_RESPONSE(5,8+1)<16;4,2>
+    mov (8) ubDEST_Y(8,2)<4>       ubAVS_RESPONSE(12,1)<16;4,2>
+    mov (8) ubDEST_Y(10,2)<4>      ubAVS_RESPONSE(12,8+1)<16;4,2>
+    mov (8) ubDEST_Y(12,2)<4>      ubAVS_RESPONSE(13,1)<16;4,2>
+    mov (8) ubDEST_Y(14,2)<4>      ubAVS_RESPONSE(13,8+1)<16;4,2>
+// Move first 8x8 words of G to dest GRF
+    mov (8) ubDEST_Y(0,1)<4>       ubAVS_RESPONSE(2,1)<16;4,2>
+    mov (8) ubDEST_Y(2,1)<4>       ubAVS_RESPONSE(2,8+1)<16;4,2>
+    mov (8) ubDEST_Y(4,1)<4>       ubAVS_RESPONSE(3,1)<16;4,2>
+    mov (8) ubDEST_Y(6,1)<4>       ubAVS_RESPONSE(3,8+1)<16;4,2>
+    mov (8) ubDEST_Y(8,1)<4>       ubAVS_RESPONSE(10,1)<16;4,2>
+    mov (8) ubDEST_Y(10,1)<4>      ubAVS_RESPONSE(10,8+1)<16;4,2>
+    mov (8) ubDEST_Y(12,1)<4>      ubAVS_RESPONSE(11,1)<16;4,2>
+    mov (8) ubDEST_Y(14,1)<4>      ubAVS_RESPONSE(11,8+1)<16;4,2>
+// Move first 8x8 words of R to dest GRF
+    mov (8) ubDEST_Y(0,0)<4>       ubAVS_RESPONSE(0,1)<16;4,2>
+    mov (8) ubDEST_Y(2,0)<4>       ubAVS_RESPONSE(0,8+1)<16;4,2>
+    mov (8) ubDEST_Y(4,0)<4>       ubAVS_RESPONSE(1,1)<16;4,2>
+    mov (8) ubDEST_Y(6,0)<4>       ubAVS_RESPONSE(1,8+1)<16;4,2>
+    mov (8) ubDEST_Y(8,0)<4>       ubAVS_RESPONSE(8,1)<16;4,2>
+    mov (8) ubDEST_Y(10,0)<4>      ubAVS_RESPONSE(8,8+1)<16;4,2>
+    mov (8) ubDEST_Y(12,0)<4>      ubAVS_RESPONSE(9,1)<16;4,2>
+    mov (8) ubDEST_Y(14,0)<4>      ubAVS_RESPONSE(9,8+1)<16;4,2>
+// Move first 8x8 words of A to dest GRF
+    mov (8) ubDEST_Y(0,3)<4>       ubAVS_RESPONSE(6,1)<16;4,2>
+    mov (8) ubDEST_Y(2,3)<4>       ubAVS_RESPONSE(6,8+1)<16;4,2>
+    mov (8) ubDEST_Y(4,3)<4>       ubAVS_RESPONSE(7,1)<16;4,2>
+    mov (8) ubDEST_Y(6,3)<4>       ubAVS_RESPONSE(7,8+1)<16;4,2>
+    mov (8) ubDEST_Y(8,3)<4>       ubAVS_RESPONSE(14,1)<16;4,2>
+    mov (8) ubDEST_Y(10,3)<4>      ubAVS_RESPONSE(14,8+1)<16;4,2>
+    mov (8) ubDEST_Y(12,3)<4>      ubAVS_RESPONSE(15,1)<16;4,2>
+    mov (8) ubDEST_Y(14,3)<4>      ubAVS_RESPONSE(15,8+1)<16;4,2>
+// Move second 8x8 words of B to dest GRF
+    mov (8) ubDEST_Y(1,2)<4>       ubAVS_RESPONSE_2(4,1)<16;4,2>
+    mov (8) ubDEST_Y(3,2)<4>       ubAVS_RESPONSE_2(4,8+1)<16;4,2>
+    mov (8) ubDEST_Y(5,2)<4>       ubAVS_RESPONSE_2(5,1)<16;4,2>
+    mov (8) ubDEST_Y(7,2)<4>       ubAVS_RESPONSE_2(5,8+1)<16;4,2>
+    mov (8) ubDEST_Y(9,2)<4>       ubAVS_RESPONSE_2(12,1)<16;4,2>
+    mov (8) ubDEST_Y(11,2)<4>      ubAVS_RESPONSE_2(12,8+1)<16;4,2>
+    mov (8) ubDEST_Y(13,2)<4>      ubAVS_RESPONSE_2(13,1)<16;4,2>
+    mov (8) ubDEST_Y(15,2)<4>      ubAVS_RESPONSE_2(13,8+1)<16;4,2>
+// Move second 8x8 words of G to dest GRF
+    mov (8) ubDEST_Y(1,1)<4>       ubAVS_RESPONSE_2(2,1)<16;4,2>
+    mov (8) ubDEST_Y(3,1)<4>       ubAVS_RESPONSE_2(2,8+1)<16;4,2>
+    mov (8) ubDEST_Y(5,1)<4>       ubAVS_RESPONSE_2(3,1)<16;4,2>
+    mov (8) ubDEST_Y(7,1)<4>       ubAVS_RESPONSE_2(3,8+1)<16;4,2>
+    mov (8) ubDEST_Y(9,1)<4>       ubAVS_RESPONSE_2(10,1)<16;4,2>
+    mov (8) ubDEST_Y(11,1)<4>      ubAVS_RESPONSE_2(10,8+1)<16;4,2>
+    mov (8) ubDEST_Y(13,1)<4>      ubAVS_RESPONSE_2(11,1)<16;4,2>
+    mov (8) ubDEST_Y(15,1)<4>      ubAVS_RESPONSE_2(11,8+1)<16;4,2>
+// Move second 8x8 words of R to dest GRF
+    mov (8) ubDEST_Y(1,0)<4>       ubAVS_RESPONSE_2(0,1)<16;4,2>
+    mov (8) ubDEST_Y(3,0)<4>       ubAVS_RESPONSE_2(0,8+1)<16;4,2>
+    mov (8) ubDEST_Y(5,0)<4>       ubAVS_RESPONSE_2(1,1)<16;4,2>
+    mov (8) ubDEST_Y(7,0)<4>       ubAVS_RESPONSE_2(1,8+1)<16;4,2>
+    mov (8) ubDEST_Y(9,0)<4>       ubAVS_RESPONSE_2(8,1)<16;4,2>
+    mov (8) ubDEST_Y(11,0)<4>      ubAVS_RESPONSE_2(8,8+1)<16;4,2>
+    mov (8) ubDEST_Y(13,0)<4>      ubAVS_RESPONSE_2(9,1)<16;4,2>
+    mov (8) ubDEST_Y(15,0)<4>      ubAVS_RESPONSE_2(9,8+1)<16;4,2>
+// Move second 8x8 words of A to dest GRF
+    mov (8) ubDEST_Y(1,3)<4>       ubAVS_RESPONSE_2(6,1)<16;4,2>
+    mov (8) ubDEST_Y(3,3)<4>       ubAVS_RESPONSE_2(6,8+1)<16;4,2>
+    mov (8) ubDEST_Y(5,3)<4>       ubAVS_RESPONSE_2(7,1)<16;4,2>
+    mov (8) ubDEST_Y(7,3)<4>       ubAVS_RESPONSE_2(7,8+1)<16;4,2>
+    mov (8) ubDEST_Y(9,3)<4>       ubAVS_RESPONSE_2(14,1)<16;4,2>
+    mov (8) ubDEST_Y(11,3)<4>      ubAVS_RESPONSE_2(14,8+1)<16;4,2>
+    mov (8) ubDEST_Y(13,3)<4>      ubAVS_RESPONSE_2(15,1)<16;4,2>
+    mov (8) ubDEST_Y(15,3)<4>      ubAVS_RESPONSE_2(15,8+1)<16;4,2>
+#endif
+//------------------------------------------------------------------------------
+// Trailing configuration: these macros are only defined here, not used in this file;
+// presumably they are consumed by whatever is assembled after it - TODO confirm.
+    // Set to write bottom region to memory
+    #define SRC_REGION                              REGION_2
+    // Re-define new # of lines
+    // Both row counts are reset to 8 (undefined first so the redefinition is clean).
+    #undef nUV_NUM_OF_ROWS
+    #undef nY_NUM_OF_ROWS
+    #define nY_NUM_OF_ROWS      8
+    #define nUV_NUM_OF_ROWS     8

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/RGB_AVS_IEF_Unscramble_16x8.asm
 ,0 → 1,260
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+//---------- RGB_AVS_IEF_Unscramble_16x8.asm ----------
+#include "AVS_IEF.inc"
+// Per-channel destination areas, declared as unsigned words. The bases are 8 registers
+// apart (r10, r18, r26, r34), one area each for B, G, R and A.
+.declare DEST_B         Base=REG(r,10)  ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw
+.declare DEST_G         Base=REG(r,18)  ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw
+.declare DEST_R         Base=REG(r,26)  ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw
+.declare DEST_A         Base=REG(r,34)  ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw
+#ifdef AVS_OUTPUT_16_BIT
+//This portion will need to be changed if unpacking is required for Y416 kernels (in case of blending etc) - vK
+//// Move first 8x8 words of B to dest GRF (as packed)
+//    mov (4) uwDEST_Y(0,2)<4>       uwAVS_RESPONSE(4,0)<4;4,1>
+//    mov (4) uwDEST_Y(1,2)<4>       uwAVS_RESPONSE(4,8)<4;4,1>
+//    mov (4) uwDEST_Y(4,2)<4>       uwAVS_RESPONSE(4,4)<4;4,1>
+//    mov (4) uwDEST_Y(5,2)<4>       uwAVS_RESPONSE(4,12)<4;4,1>
+//    mov (4) uwDEST_Y(8,2)<4>       uwAVS_RESPONSE(5,0)<4;4,1>
+//    mov (4) uwDEST_Y(9,2)<4>       uwAVS_RESPONSE(5,8)<4;4,1>
+//    mov (4) uwDEST_Y(12,2)<4>      uwAVS_RESPONSE(5,4)<4;4,1>
+//    mov (4) uwDEST_Y(13,2)<4>      uwAVS_RESPONSE(5,12)<4;4,1>
+//    mov (4) uwDEST_Y(16,2)<4>      uwAVS_RESPONSE(12,0)<4;4,1>
+//    mov (4) uwDEST_Y(17,2)<4>      uwAVS_RESPONSE(12,8)<4;4,1>
+//    mov (4) uwDEST_Y(20,2)<4>      uwAVS_RESPONSE(12,4)<4;4,1>
+//    mov (4) uwDEST_Y(21,2)<4>      uwAVS_RESPONSE(12,12)<4;4,1>
+//    mov (4) uwDEST_Y(24,2)<4>      uwAVS_RESPONSE(13,0)<4;4,1>
+//    mov (4) uwDEST_Y(25,2)<4>      uwAVS_RESPONSE(13,8)<4;4,1>
+//    mov (4) uwDEST_Y(28,2)<4>      uwAVS_RESPONSE(13,4)<4;4,1>
+//    mov (4) uwDEST_Y(29,2)<4>      uwAVS_RESPONSE(13,12)<4;4,1>
+//
+//// Move first 8x8 words of G to dest GRF (as packed)
+//    mov (4) uwDEST_Y(0,1)<4>       uwAVS_RESPONSE(2,0)<4;4,1>
+//    mov (4) uwDEST_Y(1,1)<4>       uwAVS_RESPONSE(2,8)<4;4,1>
+//    mov (4) uwDEST_Y(4,1)<4>       uwAVS_RESPONSE(2,4)<4;4,1>
+//    mov (4) uwDEST_Y(5,1)<4>       uwAVS_RESPONSE(2,12)<4;4,1>
+//    mov (4) uwDEST_Y(8,1)<4>       uwAVS_RESPONSE(3,0)<4;4,1>
+//    mov (4) uwDEST_Y(9,1)<4>       uwAVS_RESPONSE(3,8)<4;4,1>
+//    mov (4) uwDEST_Y(12,1)<4>      uwAVS_RESPONSE(3,4)<4;4,1>
+//    mov (4) uwDEST_Y(13,1)<4>      uwAVS_RESPONSE(3,12)<4;4,1>
+//    mov (4) uwDEST_Y(16,1)<4>      uwAVS_RESPONSE(10,0)<4;4,1>
+//    mov (4) uwDEST_Y(17,1)<4>      uwAVS_RESPONSE(10,8)<4;4,1>
+//    mov (4) uwDEST_Y(20,1)<4>      uwAVS_RESPONSE(10,4)<4;4,1>
+//    mov (4) uwDEST_Y(21,1)<4>      uwAVS_RESPONSE(10,12)<4;4,1>
+//    mov (4) uwDEST_Y(24,1)<4>      uwAVS_RESPONSE(11,0)<4;4,1>
+//    mov (4) uwDEST_Y(25,1)<4>      uwAVS_RESPONSE(11,8)<4;4,1>
+//    mov (4) uwDEST_Y(28,1)<4>      uwAVS_RESPONSE(11,4)<4;4,1>
+//    mov (4) uwDEST_Y(29,1)<4>      uwAVS_RESPONSE(11,12)<4;4,1>
+//
+//// Move first 8x8 words of R to dest GRF (as packed)
+//    mov (4) uwDEST_Y(0,0)<4>       uwAVS_RESPONSE(0,0)<4;4,1>
+//    mov (4) uwDEST_Y(1,0)<4>       uwAVS_RESPONSE(0,8)<4;4,1>
+//    mov (4) uwDEST_Y(4,0)<4>       uwAVS_RESPONSE(0,4)<4;4,1>
+//    mov (4) uwDEST_Y(5,0)<4>       uwAVS_RESPONSE(0,12)<4;4,1>
+//    mov (4) uwDEST_Y(8,0)<4>       uwAVS_RESPONSE(1,0)<4;4,1>
+//    mov (4) uwDEST_Y(9,0)<4>       uwAVS_RESPONSE(1,8)<4;4,1>
+//    mov (4) uwDEST_Y(12,0)<4>      uwAVS_RESPONSE(1,4)<4;4,1>
+//    mov (4) uwDEST_Y(13,0)<4>      uwAVS_RESPONSE(1,12)<4;4,1>
+//    mov (4) uwDEST_Y(16,0)<4>      uwAVS_RESPONSE(8,0)<4;4,1>
+//    mov (4) uwDEST_Y(17,0)<4>      uwAVS_RESPONSE(8,8)<4;4,1>
+//    mov (4) uwDEST_Y(20,0)<4>      uwAVS_RESPONSE(8,4)<4;4,1>
+//    mov (4) uwDEST_Y(21,0)<4>      uwAVS_RESPONSE(8,12)<4;4,1>
+//    mov (4) uwDEST_Y(24,0)<4>      uwAVS_RESPONSE(9,0)<4;4,1>
+//    mov (4) uwDEST_Y(25,0)<4>      uwAVS_RESPONSE(9,8)<4;4,1>
+//    mov (4) uwDEST_Y(28,0)<4>      uwAVS_RESPONSE(9,4)<4;4,1>
+//    mov (4) uwDEST_Y(29,0)<4>      uwAVS_RESPONSE(9,12)<4;4,1>
+//
+//// Move first 8x8 words of A to dest GRF (as packed)
+//    mov (4) uwDEST_Y(0,3)<4>       uwAVS_RESPONSE(6,0)<4;4,1>
+//    mov (4) uwDEST_Y(1,3)<4>       uwAVS_RESPONSE(6,8)<4;4,1>
+//    mov (4) uwDEST_Y(4,3)<4>       uwAVS_RESPONSE(6,4)<4;4,1>
+//    mov (4) uwDEST_Y(5,3)<4>       uwAVS_RESPONSE(6,12)<4;4,1>
+//    mov (4) uwDEST_Y(8,3)<4>       uwAVS_RESPONSE(7,0)<4;4,1>
+//    mov (4) uwDEST_Y(9,3)<4>       uwAVS_RESPONSE(7,8)<4;4,1>
+//    mov (4) uwDEST_Y(12,3)<4>      uwAVS_RESPONSE(7,4)<4;4,1>
+//    mov (4) uwDEST_Y(13,3)<4>      uwAVS_RESPONSE(7,12)<4;4,1>
+//    mov (4) uwDEST_Y(16,3)<4>      uwAVS_RESPONSE(14,0)<4;4,1>
+//    mov (4) uwDEST_Y(17,3)<4>      uwAVS_RESPONSE(14,8)<4;4,1>
+//    mov (4) uwDEST_Y(20,3)<4>      uwAVS_RESPONSE(14,4)<4;4,1>
+//    mov (4) uwDEST_Y(21,3)<4>      uwAVS_RESPONSE(14,12)<4;4,1>
+//    mov (4) uwDEST_Y(24,3)<4>      uwAVS_RESPONSE(15,0)<4;4,1>
+//    mov (4) uwDEST_Y(25,3)<4>      uwAVS_RESPONSE(15,8)<4;4,1>
+//    mov (4) uwDEST_Y(28,3)<4>      uwAVS_RESPONSE(15,4)<4;4,1>
+//    mov (4) uwDEST_Y(29,3)<4>      uwAVS_RESPONSE(15,12)<4;4,1>
+//
+//// Move second 8x8 words of B to dest GRF
+//    mov (4) uwDEST_Y(2,2)<4>       uwAVS_RESPONSE_2(4,0)<4;4,1>
+//    mov (4) uwDEST_Y(3,2)<4>       uwAVS_RESPONSE_2(4,8)<4;4,1>
+//    mov (4) uwDEST_Y(6,2)<4>       uwAVS_RESPONSE_2(4,4)<4;4,1>
+//    mov (4) uwDEST_Y(7,2)<4>       uwAVS_RESPONSE_2(4,12)<4;4,1>
+//    mov (4) uwDEST_Y(10,2)<4>      uwAVS_RESPONSE_2(5,0)<4;4,1>
+//    mov (4) uwDEST_Y(11,2)<4>      uwAVS_RESPONSE_2(5,8)<4;4,1>
+//    mov (4) uwDEST_Y(14,2)<4>      uwAVS_RESPONSE_2(5,4)<4;4,1>
+//    mov (4) uwDEST_Y(15,2)<4>      uwAVS_RESPONSE_2(5,12)<4;4,1>
+//    mov (4) uwDEST_Y(18,2)<4>      uwAVS_RESPONSE_2(12,0)<4;4,1>
+//    mov (4) uwDEST_Y(19,2)<4>      uwAVS_RESPONSE_2(12,8)<4;4,1>
+//    mov (4) uwDEST_Y(22,2)<4>      uwAVS_RESPONSE_2(12,4)<4;4,1>
+//    mov (4) uwDEST_Y(23,2)<4>      uwAVS_RESPONSE_2(12,12)<4;4,1>
+//    mov (4) uwDEST_Y(26,2)<4>      uwAVS_RESPONSE_2(13,0)<4;4,1>
+//    mov (4) uwDEST_Y(27,2)<4>      uwAVS_RESPONSE_2(13,8)<4;4,1>
+//    mov (4) uwDEST_Y(30,2)<4>      uwAVS_RESPONSE_2(13,4)<4;4,1>
+//    mov (4) uwDEST_Y(31,2)<4>      uwAVS_RESPONSE_2(13,12)<4;4,1>
+//
+//// Move second 8x8 words of G to dest GRF
+//    mov (4) uwDEST_Y(2,1)<4>       uwAVS_RESPONSE_2(2,0)<4;4,1>
+//    mov (4) uwDEST_Y(3,1)<4>       uwAVS_RESPONSE_2(2,8)<4;4,1>
+//    mov (4) uwDEST_Y(6,1)<4>       uwAVS_RESPONSE_2(2,4)<4;4,1>
+//    mov (4) uwDEST_Y(7,1)<4>       uwAVS_RESPONSE_2(2,12)<4;4,1>
+//    mov (4) uwDEST_Y(10,1)<4>      uwAVS_RESPONSE_2(3,0)<4;4,1>
+//    mov (4) uwDEST_Y(11,1)<4>      uwAVS_RESPONSE_2(3,8)<4;4,1>
+//    mov (4) uwDEST_Y(14,1)<4>      uwAVS_RESPONSE_2(3,4)<4;4,1>
+//    mov (4) uwDEST_Y(15,1)<4>      uwAVS_RESPONSE_2(3,12)<4;4,1>
+//    mov (4) uwDEST_Y(18,1)<4>      uwAVS_RESPONSE_2(10,0)<4;4,1>
+//    mov (4) uwDEST_Y(19,1)<4>      uwAVS_RESPONSE_2(10,8)<4;4,1>
+//    mov (4) uwDEST_Y(22,1)<4>      uwAVS_RESPONSE_2(10,4)<4;4,1>
+//    mov (4) uwDEST_Y(23,1)<4>      uwAVS_RESPONSE_2(10,12)<4;4,1>
+//    mov (4) uwDEST_Y(26,1)<4>      uwAVS_RESPONSE_2(11,0)<4;4,1>
+//    mov (4) uwDEST_Y(27,1)<4>      uwAVS_RESPONSE_2(11,8)<4;4,1>
+//    mov (4) uwDEST_Y(30,1)<4>      uwAVS_RESPONSE_2(11,4)<4;4,1>
+//    mov (4) uwDEST_Y(31,1)<4>      uwAVS_RESPONSE_2(11,12)<4;4,1>
+//
+//// Move second 8x8 words of R to dest GRF
+//    mov (4) uwDEST_Y(2,0)<4>       uwAVS_RESPONSE_2(0,0)<4;4,1>
+//    mov (4) uwDEST_Y(3,0)<4>       uwAVS_RESPONSE_2(0,8)<4;4,1>
+//    mov (4) uwDEST_Y(6,0)<4>       uwAVS_RESPONSE_2(0,4)<4;4,1>
+//    mov (4) uwDEST_Y(7,0)<4>       uwAVS_RESPONSE_2(0,12)<4;4,1>
+//    mov (4) uwDEST_Y(10,0)<4>      uwAVS_RESPONSE_2(1,0)<4;4,1>
+//    mov (4) uwDEST_Y(11,0)<4>      uwAVS_RESPONSE_2(1,8)<4;4,1>
+//    mov (4) uwDEST_Y(14,0)<4>      uwAVS_RESPONSE_2(1,4)<4;4,1>
+//    mov (4) uwDEST_Y(15,0)<4>      uwAVS_RESPONSE_2(1,12)<4;4,1>
+//    mov (4) uwDEST_Y(18,0)<4>      uwAVS_RESPONSE_2(8,0)<4;4,1>
+//    mov (4) uwDEST_Y(19,0)<4>      uwAVS_RESPONSE_2(8,8)<4;4,1>
+//    mov (4) uwDEST_Y(22,0)<4>      uwAVS_RESPONSE_2(8,4)<4;4,1>
+//    mov (4) uwDEST_Y(23,0)<4>      uwAVS_RESPONSE_2(8,12)<4;4,1>
+//    mov (4) uwDEST_Y(26,0)<4>      uwAVS_RESPONSE_2(9,0)<4;4,1>
+//    mov (4) uwDEST_Y(27,0)<4>      uwAVS_RESPONSE_2(9,8)<4;4,1>
+//    mov (4) uwDEST_Y(30,0)<4>      uwAVS_RESPONSE_2(9,4)<4;4,1>
+//    mov (4) uwDEST_Y(31,0)<4>      uwAVS_RESPONSE_2(9,12)<4;4,1>
+//
+//// Move second 8x8 words of A to dest GRF
+//    mov (4) uwDEST_Y(2,3)<4>       uwAVS_RESPONSE_2(6,0)<4;4,1>
+//    mov (4) uwDEST_Y(3,3)<4>       uwAVS_RESPONSE_2(6,8)<4;4,1>
+//    mov (4) uwDEST_Y(6,3)<4>       uwAVS_RESPONSE_2(6,4)<4;4,1>
+//    mov (4) uwDEST_Y(7,3)<4>       uwAVS_RESPONSE_2(6,12)<4;4,1>
+//    mov (4) uwDEST_Y(10,3)<4>      uwAVS_RESPONSE_2(7,0)<4;4,1>
+//    mov (4) uwDEST_Y(11,3)<4>      uwAVS_RESPONSE_2(7,8)<4;4,1>
+//    mov (4) uwDEST_Y(14,3)<4>      uwAVS_RESPONSE_2(7,4)<4;4,1>
+//    mov (4) uwDEST_Y(15,3)<4>      uwAVS_RESPONSE_2(7,12)<4;4,1>
+//    mov (4) uwDEST_Y(18,3)<4>      uwAVS_RESPONSE_2(14,0)<4;4,1>
+//    mov (4) uwDEST_Y(19,3)<4>      uwAVS_RESPONSE_2(14,8)<4;4,1>
+//    mov (4) uwDEST_Y(22,3)<4>      uwAVS_RESPONSE_2(14,4)<4;4,1>
+//    mov (4) uwDEST_Y(23,3)<4>      uwAVS_RESPONSE_2(14,12)<4;4,1>
+//    mov (4) uwDEST_Y(26,3)<4>      uwAVS_RESPONSE_2(15,0)<4;4,1>
+//    mov (4) uwDEST_Y(27,3)<4>      uwAVS_RESPONSE_2(15,8)<4;4,1>
+//    mov (4) uwDEST_Y(30,3)<4>      uwAVS_RESPONSE_2(15,4)<4;4,1>
+//    mov (4) uwDEST_Y(31,3)<4>      uwAVS_RESPONSE_2(15,12)<4;4,1>
+#else   /* OUTPUT_8_BIT */
+// 8-bit output path: repack the two AVS response blocks (ubAVS_RESPONSE and
+// ubAVS_RESPONSE_2) into the per-channel destinations DEST_B/G/R/A.
+// Every mov copies 8 byte-typed source elements. The source region <16;4,2>
+// starts at an odd byte offset (1 or 8+1), takes 4 bytes spaced 2 bytes apart,
+// then steps 16 bytes on and takes 4 more.
+// NOTE(review): the odd start offset presumably selects the upper byte of each
+// 16-bit sample in the response -- confirm against the AVS writeback layout.
+// Response registers read per channel: B = 4,5,12,13   G = 2,3,10,11
+//                                      R = 0,1,8,9     A = 6,7,14,15
+//
+// First 8x8 block, B channel: DEST_B(0..7) <- ubAVS_RESPONSE regs 4,5,12,13
+    mov (8) DEST_B(0)<1>                                ubAVS_RESPONSE(4,1)<16;4,2>
+    mov (8) DEST_B(1)<1>                                ubAVS_RESPONSE(4,8+1)<16;4,2>
+    mov (8) DEST_B(2)<1>                                ubAVS_RESPONSE(5,1)<16;4,2>
+    mov (8) DEST_B(3)<1>                                ubAVS_RESPONSE(5,8+1)<16;4,2>
+    mov (8) DEST_B(4)<1>                                ubAVS_RESPONSE(12,1)<16;4,2>
+    mov (8) DEST_B(5)<1>                                ubAVS_RESPONSE(12,8+1)<16;4,2>
+    mov (8) DEST_B(6)<1>                                ubAVS_RESPONSE(13,1)<16;4,2>
+    mov (8) DEST_B(7)<1>                                ubAVS_RESPONSE(13,8+1)<16;4,2>
+// First 8x8 block, G channel: DEST_G(0..7) <- ubAVS_RESPONSE regs 2,3,10,11
+    mov (8) DEST_G(0)<1>                                ubAVS_RESPONSE(2,1)<16;4,2>
+    mov (8) DEST_G(1)<1>                                ubAVS_RESPONSE(2,8+1)<16;4,2>
+    mov (8) DEST_G(2)<1>                                ubAVS_RESPONSE(3,1)<16;4,2>
+    mov (8) DEST_G(3)<1>                                ubAVS_RESPONSE(3,8+1)<16;4,2>
+    mov (8) DEST_G(4)<1>                                ubAVS_RESPONSE(10,1)<16;4,2>
+    mov (8) DEST_G(5)<1>                                ubAVS_RESPONSE(10,8+1)<16;4,2>
+    mov (8) DEST_G(6)<1>                                ubAVS_RESPONSE(11,1)<16;4,2>
+    mov (8) DEST_G(7)<1>                                ubAVS_RESPONSE(11,8+1)<16;4,2>
+// First 8x8 block, R channel: DEST_R(0..7) <- ubAVS_RESPONSE regs 0,1,8,9
+    mov (8) DEST_R(0)<1>                                ubAVS_RESPONSE(0,1)<16;4,2>
+    mov (8) DEST_R(1)<1>                                ubAVS_RESPONSE(0,8+1)<16;4,2>
+    mov (8) DEST_R(2)<1>                                ubAVS_RESPONSE(1,1)<16;4,2>
+    mov (8) DEST_R(3)<1>                                ubAVS_RESPONSE(1,8+1)<16;4,2>
+    mov (8) DEST_R(4)<1>                                ubAVS_RESPONSE(8,1)<16;4,2>
+    mov (8) DEST_R(5)<1>                                ubAVS_RESPONSE(8,8+1)<16;4,2>
+    mov (8) DEST_R(6)<1>                                ubAVS_RESPONSE(9,1)<16;4,2>
+    mov (8) DEST_R(7)<1>                                ubAVS_RESPONSE(9,8+1)<16;4,2>
+// First 8x8 block, A channel: DEST_A(0..7) <- ubAVS_RESPONSE regs 6,7,14,15
+    mov (8) DEST_A(0)<1>                                ubAVS_RESPONSE(6,1)<16;4,2>
+    mov (8) DEST_A(1)<1>                                ubAVS_RESPONSE(6,8+1)<16;4,2>
+    mov (8) DEST_A(2)<1>                                ubAVS_RESPONSE(7,1)<16;4,2>
+    mov (8) DEST_A(3)<1>                                ubAVS_RESPONSE(7,8+1)<16;4,2>
+    mov (8) DEST_A(4)<1>                                ubAVS_RESPONSE(14,1)<16;4,2>
+    mov (8) DEST_A(5)<1>                                ubAVS_RESPONSE(14,8+1)<16;4,2>
+    mov (8) DEST_A(6)<1>                                ubAVS_RESPONSE(15,1)<16;4,2>
+    mov (8) DEST_A(7)<1>                                ubAVS_RESPONSE(15,8+1)<16;4,2>
+// The second 8x8 block uses the same source layout but reads ubAVS_RESPONSE_2 and
+// writes each destination register at element offset 8 (the ",8" in DEST_x(n,8)).
+// Second 8x8 block, B channel: DEST_B(0..7,8) <- ubAVS_RESPONSE_2 regs 4,5,12,13
+    mov (8) DEST_B(0,8)<1>                      ubAVS_RESPONSE_2(4,1)<16;4,2>
+    mov (8) DEST_B(1,8)<1>                      ubAVS_RESPONSE_2(4,8+1)<16;4,2>
+    mov (8) DEST_B(2,8)<1>                      ubAVS_RESPONSE_2(5,1)<16;4,2>
+    mov (8) DEST_B(3,8)<1>                      ubAVS_RESPONSE_2(5,8+1)<16;4,2>
+    mov (8) DEST_B(4,8)<1>                      ubAVS_RESPONSE_2(12,1)<16;4,2>
+    mov (8) DEST_B(5,8)<1>                      ubAVS_RESPONSE_2(12,8+1)<16;4,2>
+    mov (8) DEST_B(6,8)<1>                      ubAVS_RESPONSE_2(13,1)<16;4,2>
+    mov (8) DEST_B(7,8)<1>                      ubAVS_RESPONSE_2(13,8+1)<16;4,2>
+// Second 8x8 block, G channel: DEST_G(0..7,8) <- ubAVS_RESPONSE_2 regs 2,3,10,11
+    mov (8) DEST_G(0,8)<1>                      ubAVS_RESPONSE_2(2,1)<16;4,2>
+    mov (8) DEST_G(1,8)<1>                      ubAVS_RESPONSE_2(2,8+1)<16;4,2>
+    mov (8) DEST_G(2,8)<1>                      ubAVS_RESPONSE_2(3,1)<16;4,2>
+    mov (8) DEST_G(3,8)<1>                      ubAVS_RESPONSE_2(3,8+1)<16;4,2>
+    mov (8) DEST_G(4,8)<1>                      ubAVS_RESPONSE_2(10,1)<16;4,2>
+    mov (8) DEST_G(5,8)<1>                      ubAVS_RESPONSE_2(10,8+1)<16;4,2>
+    mov (8) DEST_G(6,8)<1>                      ubAVS_RESPONSE_2(11,1)<16;4,2>
+    mov (8) DEST_G(7,8)<1>                      ubAVS_RESPONSE_2(11,8+1)<16;4,2>
+// Second 8x8 block, R channel: DEST_R(0..7,8) <- ubAVS_RESPONSE_2 regs 0,1,8,9
+    mov (8) DEST_R(0,8)<1>                      ubAVS_RESPONSE_2(0,1)<16;4,2>
+    mov (8) DEST_R(1,8)<1>                      ubAVS_RESPONSE_2(0,8+1)<16;4,2>
+    mov (8) DEST_R(2,8)<1>                      ubAVS_RESPONSE_2(1,1)<16;4,2>
+    mov (8) DEST_R(3,8)<1>                      ubAVS_RESPONSE_2(1,8+1)<16;4,2>
+    mov (8) DEST_R(4,8)<1>                      ubAVS_RESPONSE_2(8,1)<16;4,2>
+    mov (8) DEST_R(5,8)<1>                      ubAVS_RESPONSE_2(8,8+1)<16;4,2>
+    mov (8) DEST_R(6,8)<1>                      ubAVS_RESPONSE_2(9,1)<16;4,2>
+    mov (8) DEST_R(7,8)<1>                      ubAVS_RESPONSE_2(9,8+1)<16;4,2>
+// Second 8x8 block, A channel: DEST_A(0..7,8) <- ubAVS_RESPONSE_2 regs 6,7,14,15
+    mov (8) DEST_A(0,8)<1>                      ubAVS_RESPONSE_2(6,1)<16;4,2>
+    mov (8) DEST_A(1,8)<1>                      ubAVS_RESPONSE_2(6,8+1)<16;4,2>
+    mov (8) DEST_A(2,8)<1>                      ubAVS_RESPONSE_2(7,1)<16;4,2>
+    mov (8) DEST_A(3,8)<1>                      ubAVS_RESPONSE_2(7,8+1)<16;4,2>
+    mov (8) DEST_A(4,8)<1>                      ubAVS_RESPONSE_2(14,1)<16;4,2>
+    mov (8) DEST_A(5,8)<1>                      ubAVS_RESPONSE_2(14,8+1)<16;4,2>
+    mov (8) DEST_A(6,8)<1>                      ubAVS_RESPONSE_2(15,1)<16;4,2>
+    mov (8) DEST_A(7,8)<1>                      ubAVS_RESPONSE_2(15,8+1)<16;4,2>
+#endif
+//------------------------------------------------------------------------------
+    // Select the bottom region (REGION_1) as the source for the write to memory.
+    // NOTE(review): SRC_REGION is defined here without a preceding #undef, unlike
+    // the row counts below; presumably an earlier include has already undefined it
+    // -- confirm, otherwise this is a macro redefinition.
+    #define SRC_REGION                              REGION_1
+    // Replace the row counts for Y and UV with 8 rows each.
+    #undef nUV_NUM_OF_ROWS
+    #undef nY_NUM_OF_ROWS
+    #define nY_NUM_OF_ROWS      8
+    #define nUV_NUM_OF_ROWS     8

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/RGB_Scaling.asm
 ,0 → 1,72
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+//---------- RGB_Scaling.asm ----------
+// Setup for the RGB scaling kernel: builds the per-pixel index ramp, clears the
+// sampler message header mirror, seeds the u/v sample coordinates in the message
+// payload, and loads the constants used later to rescale sampled floats.
+#include "Scaling.inc"
+        // Build a 16-element float32 ramp 0..15 in SAMPLER_RAMP(0) and SAMPLER_RAMP(1).
+        // The two commented-out lines are the earlier integer-vector form of the same ramp.
+//      mov (8)         SAMPLER_RAMP(0)<1>              0x76543210:v
+//      add     (8)             SAMPLER_RAMP(1)<1>              SAMPLER_RAMP(0) 8.0:f
+mov (4) SAMPLER_RAMP(0)<1> 0x48403000:vf                //3, 2, 1, 0 in float vector
+mov (4) SAMPLER_RAMP(0,4)<1> 0x5C585450:vf      //7, 6, 5, 4 in float vector
+add     (8)             SAMPLER_RAMP(1)<1>              SAMPLER_RAMP(0) 8.0:f        // 8..15 = (0..7) + 8.0
+//Module: PrepareScaleCoord.asm
+        // Setup for sampler msg hdr: zero dwords 0..2 of the header mirror.
+        // The two writes hit the same register and are paired NoDDClr / NoDDChk.
+    mov (2)             rMSGSRC.0<1>:ud                 0:ud                                            { NoDDClr }     // Unused fields
+    mov (1)             rMSGSRC.2<1>:ud                 0:ud                                            { NoDDChk }     // Write and offset
+        // Seed all 16 v coordinates with the vertical origin, both in the message
+        // payload and in the SCALE_COORD_Y copy that is advanced once per row.
+        mov     (16)    mfMSGPAYLOAD(2)<1>              fSRC_VID_V_ORI<0;1,0>:f
+        mov     (16)    SCALE_COORD_Y<1>:f              fSRC_VID_V_ORI<0;1,0>:f
+        // Calculate 16 u based on the X step and the horizontal origin:
+        // acc0 is preloaded with the origin, then mac writes step_x * ramp + acc0,
+        // i.e. u[i] = origin + i * step_x for i = 0..15.
+//      line (16)       mfMSGPAYLOAD(0)<1>              SCALE_STEP_X<0;1,0>:f           SAMPLER_RAMP(0)         // Assign to mrf directly
+        mov     (16)    acc0:f                                                  fSRC_VID_H_ORI<0;1,0>:f                                                                                 { Compr }
+        mac     (16)    mfMSGPAYLOAD(0)<1>      fVIDEO_STEP_X<0;1,0>:f  SAMPLER_RAMP(0)                 { Compr }
+        // Set up the constants for the 'line' instruction (255.0 and 0.5).
+        // NOTE(review): Scaling.inc places both constants in r43, which is also the
+        // second GRF of SAMPLER_RAMP; these writes must therefore stay after the mac
+        // above, which is the last reader of the ramp in this file.
+        mov     (1)             SCALE_LINE_P255<1>:f            255.0:f                         { NoDDClr }     //{ NoDDClr, NoDDChk }
+        mov     (1)             SCALE_LINE_P0_5<1>:f            0.5:f                           { NoDDChk }
+//------------------------------------------------------------------------------
+// Per-row loop, repeated nY_NUM_OF_ROWS times: sample 16 pixels, advance v to the
+// next row, rescale the sampled floats to integers and store one row per channel.
+// %1 in the body is presumably replaced by the loop counter -- confirm with the
+// preprocessor used for $for.
+$for (0; <nY_NUM_OF_ROWS; 1) {
+        // Read 16 sampled pixels and store them in float32 in 8 GRFs in the order of BGRA (VYUA).
+        // NOTE(review): the stores at the end of this loop send response GRFs 0/2/4/6
+        // to DEST_R/G/B/A respectively, which reads as RGBA order rather than BGRA --
+        // confirm which of the two is correct.
+  mov (8)       MSGHDR_SCALE.0:ud      rMSGSRC.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+        send (16)       SCALE_RESPONSE_YW(0)<1>         MSGHDR_SCALE    udDUMMY_NULL    nSMPL_ENGINE SMPLR_MSG_DSC+nSI_SRC_SIMD16_RGB+nBI_CURRENT_SRC_RGB
+        // Calculate 16 v for next line: v += step_y, written to the message payload
+        // and to the SCALE_COORD_Y copy that both adds read from.
+        add (16)        mfMSGPAYLOAD(2)<1>              SCALE_COORD_Y<8;8,1>:f          fVIDEO_STEP_Y<0;1,0>:f  // Assign to mrf directly
+        add (16)        SCALE_COORD_Y<1>:f              SCALE_COORD_Y<8;8,1>:f          fVIDEO_STEP_Y<0;1,0>:f  // Keep the SCALE_COORD_Y copy in step with the payload
+        // Scale back to [0, 255], convert f to ud.
+        // Each pair below runs 'line' with the 255.0 / 0.5 constants into acc0, then
+        // moves acc0 (float) into the ud view of the same response GRFs, converting in place.
+        line (16)       acc0:f          SCALE_LINE_P255<0;1,0>:f        SCALE_RESPONSE_YF(0)    { Compr }                       // Response GRFs 0-1 (stored to DEST_R below)
+        mov  (16) SCALE_RESPONSE_YD(0)<1>       acc0:f                                                                                                          { Compr }
+        line (16)       acc0:f          SCALE_LINE_P255<0;1,0>:f        SCALE_RESPONSE_YF(2)    { Compr }                       // Response GRFs 2-3 (stored to DEST_G below)
+        mov  (16) SCALE_RESPONSE_YD(2)<1>       acc0:f                                                                                                          { Compr }
+        line (16)       acc0:f          SCALE_LINE_P255<0;1,0>:f        SCALE_RESPONSE_YF(4)    { Compr }                       // Response GRFs 4-5 (stored to DEST_B below)
+        mov  (16) SCALE_RESPONSE_YD(4)<1>       acc0:f                                                                                                          { Compr }
+//#if defined(SAVE_ARGB)        //Only needed if Alpha value is written to the destination
+        line (16)       acc0:f          SCALE_LINE_P255<0;1,0>:f        SCALE_RESPONSE_YF(6)    { Compr }                       // Response GRFs 6-7 (stored to DEST_A below)
+        mov  (16) SCALE_RESPONSE_YD(6)<1>       acc0:f                                                                                                          { Compr }
+//#endif
+        // Store one row per channel. The ub view of the response is declared with a
+        // stride of 4 bytes, so each element read is one byte out of every dword.
+        mov      (16)   DEST_R(%1)<1>                           SCALE_RESPONSE_YB(0)                                                                                    //possible error due to truncation - vK
+        mov      (16)   DEST_G(%1)<1>                           SCALE_RESPONSE_YB(2)                                                                                    //possible error due to truncation - vK
+        mov      (16)   DEST_B(%1)<1>                           SCALE_RESPONSE_YB(4)                                                                                    //possible error due to truncation - vK
+        mov      (16)   DEST_A(%1)<1>                           SCALE_RESPONSE_YB(6)                                                                                    //possible error due to truncation - vK
+}

 /drivers/video/i965/shaders/post_processing/gen5_6/Core_Kernels/Scaling.inc
 ,0 → 1,75
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// File name: Scaling.inc
+// Register map and constants shared by the scaling kernels: destination channel
+// registers, typed views of the sampler response, the coordinate ramp, the 'line'
+// constants, the sampler message descriptor and the row counts.
+#ifndef _SCALING_INC_
+#define _SCALING_INC_
+// Local variables----------------------------------------------------------------------------------
+#define MSGHDR_SCALE            m1              // Message Payload Header (Uses m2, m3, m4, m5 implicitly)
+//--------------------------------------------------------------------------------------------------
+//r10.0 thru r33.0; Primary surface read from sampler (16x8)
+#define DEST_Y                  uwTOP_Y
+#define DEST_U                  uwTOP_U
+#define DEST_V                  uwTOP_V
+//r10.0 thru r41.0: four channel destinations, 8 GRFs apart, declared as unsigned words
+.declare DEST_B         Base=REG(r,10)  ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw
+.declare DEST_G         Base=REG(r,18)  ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw
+.declare DEST_R         Base=REG(r,26)  ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw
+.declare DEST_A         Base=REG(r,34)  ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw
+//r56.0 thru r79.0
+// Sampler response views. Each of the Y/U/V bases (nBOT_Y, nBOT_U, nBOT_V) is
+// declared four times over the same registers: float (F), unsigned word (W),
+// unsigned dword (D) and unsigned byte (B).
+.declare        SCALE_RESPONSE_YF       Base=REG(r,nBOT_Y)      ElementSize=4 SrcRegion=REGION(8,1) Type=f
+.declare        SCALE_RESPONSE_UF       Base=REG(r,nBOT_U)      ElementSize=4 SrcRegion=REGION(8,1) Type=f
+.declare        SCALE_RESPONSE_VF       Base=REG(r,nBOT_V)      ElementSize=4 SrcRegion=REGION(8,1) Type=f
+.declare        SCALE_RESPONSE_YW       Base=REG(r,nBOT_Y)      ElementSize=2 SrcRegion=REGION(16,1) Type=uw
+.declare        SCALE_RESPONSE_UW       Base=REG(r,nBOT_U)      ElementSize=2 SrcRegion=REGION(16,1) Type=uw
+.declare        SCALE_RESPONSE_VW       Base=REG(r,nBOT_V)      ElementSize=2 SrcRegion=REGION(16,1) Type=uw
+.declare        SCALE_RESPONSE_YD       Base=REG(r,nBOT_Y)      ElementSize=4 SrcRegion=REGION(8,1) Type=ud
+.declare        SCALE_RESPONSE_UD       Base=REG(r,nBOT_U)      ElementSize=4 SrcRegion=REGION(8,1) Type=ud
+.declare        SCALE_RESPONSE_VD       Base=REG(r,nBOT_V)      ElementSize=4 SrcRegion=REGION(8,1) Type=ud
+// The byte views use REGION(8,4): a stride of 4 bytes, i.e. one byte per dword.
+.declare        SCALE_RESPONSE_YB       Base=REG(r,nBOT_Y)      ElementSize=1 SrcRegion=REGION(8,4) Type=ub
+.declare        SCALE_RESPONSE_UB       Base=REG(r,nBOT_U)      ElementSize=1 SrcRegion=REGION(8,4) Type=ub
+.declare        SCALE_RESPONSE_VB       Base=REG(r,nBOT_V)      ElementSize=1 SrcRegion=REGION(8,4) Type=ub
+.declare        SAMPLER_RAMP    Base=REG(r,42) ElementSize=4 SrcRegion=<8;8,1> Type=f   // 2 GRFs, 16 elements
+//#define       SCALE_STEP_X    REG2(r,43,0)
+//#define       SCALE_COORD_X   REG2(r,43,3)
+// NOTE(review): both constants below live in r43, which is also the second GRF of
+// SAMPLER_RAMP (r42-r43). Presumably REG2(r,43,n) addresses element n of r43, so
+// writing them overwrites ramp values -- users must be done with the ramp first.
+// P0_5 sits three elements after P255.
+#define SCALE_LINE_P255                 REG2(r,43,4)    // = 255.0      Used in 'line' inst to multiply 255, add 0.5, and round to int.
+#define SCALE_LINE_P0_5                 REG2(r,43,7)    // = 0.5
+//r44.0 thru r45.0
+#define SCALE_COORD_Y           REG(r,44)       //2GRF
+// Send Message [DevILK]                                Message Descriptor
+//  MBZ MsgL=5 MsgR=8                            H MBZ   SIMD     MsgType   SmplrIndx BindTab
+//  000 0 101 0 1000                             1  0     10     0000         0000    00000000
+//    0     A    8                                     A             0             0     0     0
+//     MsgL=1+2*2(u,v)=5 MsgR=8
+// The bit groups above concatenate to the hex value below. The sampler index and
+// binding table index fields are left zero here; code using this descriptor adds them in.
+#define SMPLR_MSG_DSC           0x0A8A0000      // ILK Sampler Message Descriptor
+// Re-define new number of lines: both Y and UV row counts become 8.
+#undef nY_NUM_OF_ROWS
+#undef nUV_NUM_OF_ROWS
+#define nY_NUM_OF_ROWS      8
+#define nUV_NUM_OF_ROWS     8
+#endif  //_SCALING_INC_

Subversion Repositories Kolibri OS

Compare Revisions

Regard whitespace Rev 3768 → Rev 3769