1 /**
2   Blitter for painting angular gradients.
3 
4   Copyright: Chris Jones
5   License: Boost Software License, Version 1.0
6   Authors: Chris Jones
7 */
8 
9 module dg2d.angularblit;
10 
11 import dg2d.rasterizer;
12 import dg2d.gradient;
13 import dg2d.misc;
14 import dg2d.blitex;
15 
16 /**
17    Angular gradient blitter struct.
18 
19    ---
20    auto ablit = AngularBlit(m_pixels,m_stride,m_height);
21    ablit.setPaint(grad, wr, RepeatMode.Mirror, 4.0f);
   ablit.setCoords(x0,y0,x1,y1,x2,y2);
23    rasterizer.rasterize(ablit.getBlitFunc);
24    ---
25 */
26 
27 struct AngularBlit
28 {
29     /** Construct an Angular blitter.
30     pixels - pointer to a 32 bpp pixel buffer
31     stride - buffer width in pixels
    height - buffer height in pixels
33 
34     note: buffer must be 16 byte aligned, stride must be multiple of 4
35     */
36 
37     this(uint* pixels, int stride, int height)
38     {
39         assert(((cast(uint)pixels) & 15) == 0); // must be 16 byte aligned
40         assert((stride & 3) == 0);              // stride must be 16 byte aligned
41         assert(height > 0);
42         this.pixels = pixels;
43         this.stride = stride;
44         this.height = height;
45     }
46 
47     /** set the gradient, winding rule and repeat mode. "numRepeats" sets how many times
48     the gradient repeats in 360 degrees.
49     */
50 
51     void setPaint(Gradient grad, WindingRule wrule, RepeatMode rmode, float numRepeats)
52     {
53         assert(grad !is null);
54         assert(isPow2(grad.lookupLength));
55         gradient = grad;
56         windingRule = wrule;
57         repeatMode = rmode;
58         this.numRepeats = numRepeats;
59     }
60 
    /** Specify the orientation in terms of an ellipse, for that we need 3 points...
    (x0,y0) is the center of the ellipse
    (x1,y1) is radius at 0 degrees
    (x2,y2) is radius at 90 degrees
    The radii don't need to be at right angles, so it can handle an ellipse that has been
    through any affine transform.
    */
68 
69     void setCoords(float x0, float y0, float x1, float y1, float x2, float y2)
70     {
71         xctr = x0;
72         yctr = y0;
73         float w0 = x1-x0;
74         float h0 = y1-y0;
75         float w1 = x2-x0;
76         float h1 = y2-y0;
77         float q = w1*h0 - w0*h1;
78         if (abs(q) < 0.1) q = (q < 0) ? -0.1 : 0.1;
79         xstep0 = -h1 / q;
80         ystep0 = w1 / q;
81         xstep1 = h0 / q;
82         ystep1 = -w0 / q;
83     }
84 
    /** Specify the orientation in terms of a circle, for that we need two points,
    (x0,y0) is the center of the circle
    (x1,y1) is radius at 0 degrees
    */
89 
90     void setCoords(float x0, float y0, float x1, float y1)
91     {
92         setCoords(x0,y0,x1,y1,x0-y1+y0,y0+x1-x0);
93     }
94 
95     /** returns a BlitFunc for use by the rasterizer */
96 
97     BlitFunc getBlitFunc() return
98     {
99         if (windingRule == WindingRule.NonZero)
100         {
101             switch(repeatMode)
102             {
103                 case RepeatMode.Pad: return &angular_blit!(WindingRule.NonZero,RepeatMode.Pad);
104                 case RepeatMode.Repeat: return &angular_blit!(WindingRule.NonZero,RepeatMode.Repeat);
105                 case RepeatMode.Mirror: return &angular_blit!(WindingRule.NonZero,RepeatMode.Mirror);
106                 default: assert(0);
107             }
108         }
109         else
110         {
111             switch(repeatMode)
112             {
113                 case RepeatMode.Pad: return &angular_blit!(WindingRule.EvenOdd,RepeatMode.Pad);
114                 case RepeatMode.Repeat: return &angular_blit!(WindingRule.EvenOdd,RepeatMode.Repeat);
115                 case RepeatMode.Mirror: return &angular_blit!(WindingRule.EvenOdd,RepeatMode.Mirror);
116                 default: assert(0);
117             }
118         }
119     }
120 
121 private:
122 
123     void angular_blit(WindingRule wr, RepeatMode mode)(int* delta, DMWord* mask, int x0, int x1, int y)
124     {
125         assert(x0 >= 0);
126         assert(x1 <= stride);
127         assert(y >= 0);
128         assert(y < height);
129         assert((x0 & 3) == 0);
130         assert((x1 & 3) == 0);
131 
132         // main blit variables
133 
134         int bpos = x0 / 4;
135         int endbit = x1 / 4;
136         uint* dest = &pixels[y*stride];
137         __m128i xmWinding = 0;
138         uint* lut = gradient.getLookup.ptr;
139         __m128i lutmsk = gradient.lookupLength - 1;
140         __m128i lutmsk2 = gradient.lookupLength*2 - 1;
141         __m128 lutscale = gradient.lookupLength * numRepeats;
142 
143         // XMM constants
144 
145         immutable __m128i XMZERO = 0;
146         immutable __m128i XMFFFF = 0xFFFFFFFF;
147 
148         // paint variables
149 
150         float t0 = (bpos*4-xctr)*xstep0 + (y-yctr)*ystep0;
151         __m128 xmT0 = _mm_mul_ps(_mm_set1_ps(xstep0), _mm_setr_ps(0.0f,1.0f,2.0f,3.0f));
152         xmT0 = _mm_add_ps(xmT0, _mm_set1_ps(t0));
153         __m128 xmStep0 = _mm_set1_ps(xstep0*4);
154 
155         float t1 = (bpos*4-xctr)*xstep1 + (y-yctr)*ystep1;
156         __m128 xmT1 = _mm_mul_ps(_mm_set1_ps(xstep1), _mm_setr_ps(0.0f,1.0f,2.0f,3.0f));
157         xmT1 = _mm_add_ps(xmT1, _mm_set1_ps(t1));
158         __m128 xmStep1 = _mm_set1_ps(xstep1*4);
159 
160         // main loop
161 
162         while (bpos < endbit)
163         {
164             int nsb = nextSetBit(mask, bpos, endbit);
165 
166             // do we have a span of unchanging coverage?
167 
168             if (bpos < nsb)
169             {
170                 // Calc coverage of first pixel
171 
172                 int cover = calcCoverage!wr(xmWinding[3]+delta[bpos*4]);
173 
174                 // We can skip the span
175 
176                 if (cover < 0x100)
177                 {
178                     __m128 xskip = _mm_set1_ps(nsb-bpos);
179                     xmT0 = _mm_add_ps(xmT0, _mm_mul_ps(xskip,xmStep0));
180                     xmT1 = _mm_add_ps(xmT1, _mm_mul_ps(xskip,xmStep1));
181                     bpos = nsb;
182                 }
183 
184                 // Or fill span with soid color
185 
186                 else if (gradient.isOpaque && (cover > 0xFF00))
187                 {
188                     uint* ptr = &dest[bpos*4];
189                     uint* end = ptr + ((nsb-bpos)*4);
190 
191                     while (ptr < end)
192                     {
193                         __m128 grad = gradOfSorts(xmT0,xmT1);
194                         __m128 poly = polyAprox(grad);
195                         poly = fixupQuadrant(poly,xmT0,xmT1)*lutscale;
196                         __m128i ipos = _mm_cvtps_epi32(poly);
197 
198                         xmT0 = xmT0 + xmStep0;
199                         xmT1 = xmT1 + xmStep1;
200 
201                         ipos = calcRepeatModeIDX!mode(ipos, lutmsk, lutmsk2);
202 
203                         ptr[0] = lut[ipos.array[0]];
204                         ptr[1] = lut[ipos.array[1]];
205                         ptr[2] = lut[ipos.array[2]];
206                         ptr[3] = lut[ipos.array[3]];
207 
208                         ptr+=4;                        
209                     }
210 
211                     bpos = nsb;
212                 }
213 
214                 // Or fill span with transparent color
215 
216                 else
217                 {
218                     __m128i xmcover = _mm_set1_epi16 (cast(ushort) cover);
219 
220                     uint* ptr = &dest[bpos*4];
221                     uint* end = &dest[nsb*4];
222 
223                     while (ptr < end)
224                     {
225                         __m128 grad = gradOfSorts(xmT0,xmT1);
226 
227                         // load destination pixels
228 
229                         __m128i d0 = _mm_load_si128(cast(__m128i*)ptr);
230                         __m128i d1 = _mm_unpackhi_epi8(d0,d0);
231                         d0 = _mm_unpacklo_epi8(d0,d0);
232 
233                         // evauluate angle
234 
235                         __m128 poly = polyAprox(grad);
236                         poly = fixupQuadrant(poly,xmT0,xmT1)*lutscale;
237                         __m128i ipos = _mm_cvtps_epi32(poly);
238 
239                         xmT0 = xmT0 + xmStep0;
240                         xmT1 = xmT1 + xmStep1;
241 
242                         ipos = calcRepeatModeIDX!mode(ipos, lutmsk, lutmsk2);
243 
244                         // load grad colours and alpha
245 
246                         __m128i c0 = _mm_loadu_si32 (&lut[ipos.array[0]]);
247                         __m128i tmpc0 = _mm_loadu_si32 (&lut[ipos.array[1]]);
248                         c0 = _mm_unpacklo_epi32 (c0, tmpc0);
249                         c0 = _mm_unpacklo_epi8 (c0, c0);
250 
251                         __m128i a0 = _mm_mulhi_epu16(c0,xmcover);
252                        
253                         __m128i c1 = _mm_loadu_si32 (&lut[ipos.array[2]]);
254                         __m128i tmpc1 = _mm_loadu_si32 (&lut[ipos.array[3]]);
255                         c1 = _mm_unpacklo_epi32 (c1, tmpc1);
256                         c1 = _mm_unpacklo_epi8 (c1, c1);
257 
258                         __m128i a1 = _mm_mulhi_epu16(c1,xmcover);
259 
260                         // unpack alpha
261 
262                         a0 = _mm_shufflelo_epi16!255(a0);
263                         a0 = _mm_shufflehi_epi16!255(a0);
264                         a1 = _mm_shufflelo_epi16!255(a1);
265                         a1 = _mm_shufflehi_epi16!255(a1);
266 
267                        // alpha*source + dest - alpha*dest
268 
269                         c0 = _mm_mulhi_epu16 (c0,a0);
270                         c1 = _mm_mulhi_epu16 (c1,a1);
271                         c0 = _mm_add_epi16 (c0,d0);
272                         c1 = _mm_add_epi16 (c1,d1);
273                         d0 = _mm_mulhi_epu16 (d0,a0);
274                         d1 = _mm_mulhi_epu16 (d1,a1);
275                         c0 =  _mm_sub_epi16 (c0,d0);
276                         c1 =  _mm_sub_epi16 (c1,d1);
277                         c0 = _mm_srli_epi16 (c0,8);
278                         c1 = _mm_srli_epi16 (c1,8);
279 
280                         d0 = _mm_packus_epi16 (c0,c1);
281 
282                         _mm_store_si128 (cast(__m128i*)ptr,d0);
283                         
284                         ptr+=4;
285                     }
286 
287                     bpos = nsb;
288                 }
289             }
290 
291             // At this point we need to integrate scandelta
292 
293             uint* ptr = &dest[bpos*4];
294             uint* end = &dest[endbit*4];
295             int* dlptr = &delta[bpos*4];
296 
297             while (bpos < endbit)
298             {
299                 __m128 grad = gradOfSorts(xmT0,xmT1);
300 
301                 // Integrate delta values
302 
303                 __m128i idv = _mm_load_si128(cast(__m128i*)dlptr);
304                 idv = _mm_add_epi32(idv, _mm_slli_si128!4(idv)); 
305                 idv = _mm_add_epi32(idv, _mm_slli_si128!8(idv)); 
306                 idv = _mm_add_epi32(idv, xmWinding); 
307                 xmWinding = _mm_shuffle_epi32!255(idv);  
308                 _mm_store_si128(cast(__m128i*)dlptr,XMZERO);
309 
310                 // eval angle
311 
312                 __m128 poly = polyAprox(grad);
313                 poly = fixupQuadrant(poly,xmT0,xmT1)*lutscale;
314 
315                 // calculate coverage from winding
316 
317                 __m128i xmcover = calcCoverage32!wr(idv);
318 
319                 // convert grad pos to integer
320 
321                 __m128i ipos = _mm_cvtps_epi32(poly);
322 
323                 // Load destination pixels
324 
325                 __m128i d0 = _mm_load_si128(cast(__m128i*)ptr);
326                 __m128i d1 = _mm_unpackhi_epi8(d0,d0);
327                 d0 = _mm_unpacklo_epi8(d0,d0);
328 
329                 xmT0 = xmT0 + xmStep0;
330                 xmT1 = xmT1 + xmStep1;
331 
332                 ipos = calcRepeatModeIDX!mode(ipos, lutmsk, lutmsk2);
333 
334                 // load grad colors
335 
336                 __m128i c0 = _mm_loadu_si32 (&lut[ipos.array[0]]);
337                 __m128i tmpc0 = _mm_loadu_si32 (&lut[ipos.array[1]]);
338                 c0 = _mm_unpacklo_epi32 (c0, tmpc0);
339                 c0 = _mm_unpacklo_epi8 (c0, c0);
340 
341                 __m128i a0 = _mm_unpacklo_epi32(xmcover,xmcover);
342                 a0 = _mm_mulhi_epu16(a0, c0);
343 
344                 __m128i c1 = _mm_loadu_si32 (&lut[ipos.array[2]]);
345                 __m128i tmpc1 = _mm_loadu_si32 (&lut[ipos.array[3]]);
346                 c1 = _mm_unpacklo_epi32 (c1, tmpc1);
347                 c1 = _mm_unpacklo_epi8 (c1, c1);
348 
349                 __m128i a1 = _mm_unpackhi_epi32(xmcover,xmcover);
350                 a1 = _mm_mulhi_epu16(a1, c1);
351 
352                 // unpack alpha
353 
354                 a0 = _mm_shufflelo_epi16!255(a0);
355                 a0 = _mm_shufflehi_epi16!255(a0);
356                 a1 = _mm_shufflelo_epi16!255(a1);
357                 a1 = _mm_shufflehi_epi16!255(a1);
358 
359                 // alpha*source + dest - alpha*dest
360 
361                 c0 = _mm_mulhi_epu16 (c0,a0);
362                 c1 = _mm_mulhi_epu16 (c1,a1);
363                 c0 = _mm_add_epi16 (c0,d0);
364                 c1 = _mm_add_epi16 (c1,d1);
365                 d0 = _mm_mulhi_epu16 (d0,a0);
366                 d1 = _mm_mulhi_epu16 (d1,a1);
367                 c0 =  _mm_sub_epi16 (c0, d0);
368                 c1 =  _mm_sub_epi16 (c1, d1);
369                 c0 = _mm_srli_epi16 (c0,8);
370                 c1 = _mm_srli_epi16 (c1,8);
371 
372                 d0 = _mm_packus_epi16 (c0,c1);
373 
374                 _mm_store_si128 (cast(__m128i*)ptr,d0);
375                 
376                 bpos++;
377                 ptr+=4;
378                 dlptr+=4;
379 
380                 if (((cast(ulong*)dlptr)[0] | (cast(ulong*)dlptr)[1]) == 0)  break;
381             }
382         }
383     }
384 
385     // Member variables
386 
    uint* pixels;               // destination buffer, indexed as pixels[y*stride + x]
    int stride;                 // buffer width in pixels, asserted to be a multiple of 4
    int height;                 // buffer height in pixels
    float xctr,yctr;            // centre point, set by setCoords
    float xstep0,ystep0;        // change of the first gradient coordinate per pixel in x / y
    float xstep1,ystep1;        // change of the second gradient coordinate per pixel in x / y
    Gradient gradient;          // source of the colour lookup table, set by setPaint
    WindingRule windingRule;    // selects the blit function in getBlitFunc
    RepeatMode repeatMode;      // selects the blit function in getBlitFunc
    float numRepeats;           // scales the lookup index together with lookupLength
397 }
398 
/*
   Helpers for a fast atan2 approximation.
   These should be inlined by LDC.
   They are split up into 3 separate parts because it is faster to spread them
   out in the calling code, which breaks up the instruction dependency somewhat.
*/
405 
406 private:
407 
// lower bound for the divisor in gradOfSorts, avoids a divide by zero
immutable __m128 MINSUM = 0.001;
// added by fixupQuadrant wherever t0 is negative
immutable __m128 FQTWO = 0.5;

/*
   First part of the angle evaluation: returns (|x|-|y|) / (|x|+|y|) for each
   lane, with the divisor kept at or above MINSUM.
*/
__m128 gradOfSorts(__m128 x, __m128 y)
{
    // strip the sign bits to get the absolute values
    __m128 ax = _mm_and_ps(x, cast(__m128) XMABSMASK);
    __m128 ay = _mm_and_ps(y, cast(__m128) XMABSMASK);

    __m128 denom = _mm_max_ps(_mm_add_ps(ax,ay), MINSUM);
    __m128 numer = _mm_sub_ps(ax,ay);
    return numer / denom;
}
420 
// coefficients of the odd cubic evaluated by polyAprox
immutable __m128 PCOEF0  = 0.125f;
immutable __m128 PCOEF1  = 0.154761366f;
immutable __m128 PCOEF3  = 0.0305494905f;

/*
   Second part of the angle evaluation: evaluates
   PCOEF0 - PCOEF1*g + PCOEF3*g*g*g for each lane.
*/
__m128 polyAprox(__m128 g)
{
    __m128 linear = PCOEF1*g;
    __m128 cubic = (PCOEF3*g)*(g*g);
    return (PCOEF0 - linear) + cubic;
}
432 
433 // lots of casts here due to mixing of int4 and float4
434 
/*
   Third part of the angle evaluation: negates pos in every lane where t0 and
   t1 have different signs, then adds FQTWO (0.5) in every lane where t0 is
   negative.
*/
__m128 fixupQuadrant(__m128 pos, __m128 t0, __m128 t1)
{
    __m128i bits0 = cast(__m128i) t0;
    __m128i bits1 = cast(__m128i) t1;

    // sign bit is set where exactly one of t0 / t1 is negative; xor it into
    // pos to flip the sign there
    __m128i flip = (bits0 ^ bits1) & XMSIGNMASK;
    __m128 signedPos = cast(__m128) (cast(__m128i) pos ^ flip);

    // arithmetic shift smears the sign bit of t0 over the whole lane, which
    // makes an all ones / all zeros mask to select FQTWO with
    __m128i negMask = _mm_srai_epi32(bits0,31);
    __m128 offset = cast(__m128) (negMask & cast(__m128i) FQTWO);

    return signedPos + offset;
}