Go to most recent revision | Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
4680 | right-hear | 1 | /* |
2 | * ARM specific render optims live here |
||
3 | */ |
||
4 | |||
5 | #include "fitz.h" |
||
6 | |||
7 | typedef unsigned char byte; |
||
8 | |||
9 | /* always surround cpu specific code with an ARCH_XXX guard (ARCH_ARM here) */ |
||
10 | #ifdef ARCH_ARM |
||
11 | |||
12 | /* from imagescalearm.s */ |
||
13 | extern void fz_srow4_arm(byte *src, byte *dst, int w, int denom); |
||
14 | extern void fz_scol4_arm(byte *src, byte *dst, int w, int denom); |
||
15 | |||
16 | static void |
||
17 | path_w4i1o4_arm(byte * restrict rgba, byte * restrict src, byte cov, int len, byte * restrict dst) |
||
18 | { |
||
19 | /* The ARM code here is a hand coded implementation of the optimized C version. */ |
||
20 | |||
21 | if (len <= 0) |
||
22 | return; |
||
23 | |||
24 | asm volatile( |
||
25 | "ldr %0, [%0] @ %0 = rgba \n" |
||
26 | "mov r11,#0 \n" |
||
27 | "mov r8, #0xFF00 \n" |
||
28 | "mov r14,%0,lsr #24 @ r14= alpha \n" |
||
29 | "orr %0, %0, #0xFF000000 @ %0 = rgba |= 0xFF000000 \n" |
||
30 | "orr r8, r8, r8, LSL #16 @ r8 = 0xFF00FF00 \n" |
||
31 | "adds r14,r14,r14,LSR #7 @ r14 = alpha += alpha>>7 \n" |
||
32 | "beq 9f @ if (alpha == 0) bale \n" |
||
33 | "and r6, %0, r8 @ r6 = ga<<8 \n" |
||
34 | "bic %0, %0, r8 @ %0 = rb \n" |
||
35 | "mov r6, r6, LSR #8 @ r6 = ga \n" |
||
36 | "cmp r14,#256 @ if (alpha == 256) \n" |
||
37 | "beq 4f @ no-alpha loop \n" |
||
38 | "B 2f @ enter the loop \n" |
||
39 | "1: @ Loop used for when coverage*alpha == 0 \n" |
||
40 | "subs %3, %3, #1 @ len-- \n" |
||
41 | "ble 9f \n" |
||
42 | "2: \n" |
||
43 | "ldrb r12,[%1] @ r12= *src \n" |
||
44 | "ldr r9, [%4], #4 @ r9 = drb = *dst32++ \n" |
||
45 | "strb r11,[%1], #1 @ r11= *src++ = 0 \n" |
||
46 | "add %2, r12, %2 @ %2 = cov += r12 \n" |
||
47 | "ands %2, %2, #255 @ %2 = cov &= 255 \n" |
||
48 | "beq 1b @ if coverage == 0 loop back \n" |
||
49 | "add r10,%2, %2, LSR #7 @ r10= ca = cov+(cov>>7) \n" |
||
50 | "mul r10,r14,r10 @ r10= ca *= alpha \n" |
||
51 | "and r7, r8, r9 @ r7 = dga = drb & MASK \n" |
||
52 | "mov r10,r10,LSR #8 @ r10= ca >>= 8 \n" |
||
53 | "and r9, r8, r9, LSL #8 @ r9 = drb = (drb<<8) & MASK \n" |
||
54 | "sub r12,r6, r7, LSR #8 @ r12= cga = ga - (dga>>8) \n" |
||
55 | "sub r5, %0, r9, LSR #8 @ r5 = crb = rb - (drb>>8) \n" |
||
56 | "mla r7, r12,r10,r7 @ r7 = dga += cga * ca \n" |
||
57 | "subs %3, %3, #1 @ len-- \n" |
||
58 | "mla r9, r5, r10,r9 @ r9 = drb += crb * ca \n" |
||
59 | "and r7, r8, r7 @ r7 = dga &= MASK \n" |
||
60 | "and r9, r8, r9 @ r9 = drb &= MASK \n" |
||
61 | "orr r9, r7, r9, LSR #8 @ r9 = drb = dga | (drb>>8) \n" |
||
62 | "str r9, [%4, #-4] @ dst32[-1] = r9 \n" |
||
63 | "bgt 2b \n" |
||
64 | "b 9f \n" |
||
65 | "@ --- Solid alpha loop --------------------------------------- \n" |
||
66 | "3: @ Loop used when coverage == 256 \n" |
||
67 | "orr r9, %0, r6, LSL #8 @ r9 = rgba \n" |
||
68 | "str r9, [%4, #-4] @ dst32[-1] = r9 \n" |
||
69 | "4: @ Loop used for when coverage*alpha == 0 \n" |
||
70 | "subs %3, %3, #1 @ len-- \n" |
||
71 | "ble 9f \n" |
||
72 | "5: \n" |
||
73 | "ldrb r12,[%1] @ r12= *src \n" |
||
74 | "ldr r9, [%4], #4 @ r9 = drb = *dst32++ \n" |
||
75 | "strb r11,[%1], #1 @ r11= *src++ = 0 \n" |
||
76 | "add %2, r12, %2 @ %2 = cov += r12 \n" |
||
77 | "ands %2, %2, #255 @ %2 = cov &= 255 \n" |
||
78 | "beq 4b @ if coverage == 0 loop back \n" |
||
79 | "cmp %2, #255 @ if coverage == solid \n" |
||
80 | "beq 3b @ loop back \n" |
||
81 | "add r10,%2, %2, LSR #7 @ r10= ca = cov+(cov>>7) \n" |
||
82 | "and r7, r8, r9 @ r7 = dga = drb & MASK \n" |
||
83 | "and r9, r8, r9, LSL #8 @ r9 = dga = (drb<<8) & MASK \n" |
||
84 | "sub r12,r6, r7, LSR #8 @ r12= cga = ga - (dga>>8) \n" |
||
85 | "sub r5, %0, r9, LSR #8 @ r5 = crb = rb - (drb>>8) \n" |
||
86 | "mla r7, r12,r10,r7 @ r7 = dga += cga * ca \n" |
||
87 | "subs %3, %3, #1 @ len-- \n" |
||
88 | "mla r9, r5, r10,r9 @ r9 = drb += crb * ca \n" |
||
89 | "and r7, r8, r7 @ r7 = dga &= MASK \n" |
||
90 | "and r9, r8, r9 @ r9 = drb &= MASK \n" |
||
91 | "orr r9, r7, r9, LSR #8 @ r9 = drb = dga | (drb>>8) \n" |
||
92 | "str r9, [%4, #-4] @ dst32[-1] = r9 \n" |
||
93 | "bgt 5b \n" |
||
94 | "9: @ End \n" |
||
95 | : |
||
96 | "+r" (rgba), |
||
97 | "+r" (src), |
||
98 | "+r" (cov), |
||
99 | "+r" (len), |
||
100 | "+r" (dst) |
||
101 | : |
||
102 | : |
||
103 | "r5","r6","r7","r8","r9","r10","r11","r12","r14","memory","cc" |
||
104 | ); |
||
105 | } |
||
106 | |||
107 | static void load_tile8_arm(byte * restrict src, int sw, byte * restrict dst, int dw, int w, int h, int pad) |
||
108 | { |
||
109 | if ((h == 0) || (w == 0)) |
||
110 | return; |
||
111 | |||
112 | switch (pad) |
||
113 | { |
||
114 | case 0: |
||
115 | while (h--) |
||
116 | { |
||
117 | memcpy(dst, src, w); |
||
118 | src += sw; |
||
119 | dst += dw; |
||
120 | } |
||
121 | break; |
||
122 | |||
123 | case 1: |
||
124 | sw -= w; |
||
125 | dw -= w<<1; |
||
126 | asm volatile( |
||
127 | "MOV r11,#255 \n" |
||
128 | "1: \n" |
||
129 | "MOV r5, %[w] @ r5 = x = w \n" |
||
130 | "2: \n" |
||
131 | "LDRB r4, [%[src]], #1 @ r4 = *src++ \n" |
||
132 | "SUBS r5, r5, #1 \n" |
||
133 | "STRB r4, [%[dst]], #1 @ *dst++ = r4 \n" |
||
134 | "STRB r11,[%[dst]], #1 @ *dst++ = 255 \n" |
||
135 | "BGT 2b \n" |
||
136 | "ADD %[src],%[src],%[sw] @ src += sw \n" |
||
137 | "ADD %[dst],%[dst],%[dw] @ dst += dw \n" |
||
138 | "SUBS %[h],%[h],#1 \n" |
||
139 | "BGT 1b \n" |
||
140 | : |
||
141 | [src] "+r" (src), |
||
142 | [sw] "+r" (sw), |
||
143 | [dst] "+r" (dst), |
||
144 | [dw] "+r" (dw), |
||
145 | [h] "+r" (h), |
||
146 | [w] "+r" (w) |
||
147 | : |
||
148 | : |
||
149 | "r4","r5","r11","memory","cc" |
||
150 | ); |
||
151 | break; |
||
152 | |||
153 | case 3: |
||
154 | sw -= w; |
||
155 | asm volatile( |
||
156 | "MOV r11,#255 \n" |
||
157 | "1: \n" |
||
158 | "MOV r5, %[w] @ r5 = x = w \n" |
||
159 | "MOV r8, %[dst] @ r8 = dp = dst \n" |
||
160 | "2: \n" |
||
161 | "LDRB r4, [%[src]], #1 @ r4 = *src++ \n" |
||
162 | "LDRB r6, [%[src]], #1 @ r6 = *src++ \n" |
||
163 | "LDRB r7, [%[src]], #1 @ r7 = *src++ \n" |
||
164 | "SUBS r5, r5, #3 \n" |
||
165 | "STRB r4, [r8], #1 @ *dp++ = r4 \n" |
||
166 | "STRB r6, [r8], #1 @ *dp++ = r6 \n" |
||
167 | "STRB r7, [r8], #1 @ *dp++ = r7 \n" |
||
168 | "STRB r11,[r8], #1 @ *dp++ = 255 \n" |
||
169 | "BGT 2b \n" |
||
170 | "ADD %[src],%[src],%[sw] @ src += sw \n" |
||
171 | "ADD %[dst],%[dst],%[dw] @ dst += dw \n" |
||
172 | "SUBS %[h],%[h],#1 \n" |
||
173 | "BGT 1b \n" |
||
174 | : |
||
175 | [src] "+r" (src), |
||
176 | [sw] "+r" (sw), |
||
177 | [dst] "+r" (dst), |
||
178 | [dw] "+r" (dw), |
||
179 | [h] "+r" (h), |
||
180 | [w] "+r" (w) |
||
181 | : |
||
182 | : |
||
183 | "r4","r5","r6","r7","r8","r11","memory","cc" |
||
184 | ); |
||
185 | break; |
||
186 | |||
187 | default: |
||
188 | sw -= w; |
||
189 | asm volatile( |
||
190 | "mov r9,#255 \n" |
||
191 | "1: \n" |
||
192 | "mov r7, %[dst] @ r7 = dp = dst \n" |
||
193 | "mov r8, #1 @ r8 = tpad = 1 \n" |
||
194 | "mov r14,%[w] @ r11= x = w \n" |
||
195 | "2: \n" |
||
196 | "ldrb r10,[%[src]],#1 \n" |
||
197 | "subs r8, r8, #1 \n" |
||
198 | "moveq r8, %[pad] \n" |
||
199 | "streqb r9, [r7], #1 \n" |
||
200 | "strb r10,[r7], #1 \n" |
||
201 | "subs r14,r14, #1 \n" |
||
202 | "bgt 2b \n" |
||
203 | "add %[src],%[src],%[sw] \n" |
||
204 | "add %[dst],%[dst],%[dw] \n" |
||
205 | "subs %[h], %[h], #1 \n" |
||
206 | "bgt 1b \n" |
||
207 | : |
||
208 | [src] "+r" (src), |
||
209 | [sw] "+r" (sw), |
||
210 | [dst] "+r" (dst), |
||
211 | [dw] "+r" (dw), |
||
212 | [h] "+r" (h), |
||
213 | [w] "+r" (w), |
||
214 | [pad] "+r" (pad) |
||
215 | : |
||
216 | : |
||
217 | "r7","r8","r9","r10","r14","memory","cc" |
||
218 | ); |
||
219 | break; |
||
220 | } |
||
221 | } |
||
222 | |||
223 | void |
||
224 | fz_accelerate_arch(void) |
||
225 | { |
||
226 | fz_path_w4i1o4 = path_w4i1o4_arm; |
||
227 | fz_loadtile8 = load_tile8_arm; |
||
228 | fz_srow4 = fz_srow4_arm; |
||
229 | fz_scol4 = fz_scol4_arm; |
||
230 | } |
||
231 | |||
232 | #endif |