1 /**
2   Blitter for painting biradial gradients.
3 
4   Copyright Chris Jones 2020.
5   Distributed under the Boost Software License, Version 1.0.
6   See accompanying file Licence.txt or copy at...
7   https://www.boost.org/LICENSE_1_0.txt
8 */
9 
10 module dg2d.biradialblit;
11 
12 import dg2d.rasterizer;
13 import dg2d.gradient;
14 import dg2d.misc;
15 import dg2d.blitex;
16 
17 /**
18    Biradial gradient blitter struct.
19 
20    ---
   auto blit = BiradialBlit(m_pixels,m_stride,m_height);
   blit.setPaint(grad, wr, RepeatMode.Pad);
   blit.setCoords(x0,y0,r0,x1,y1,r1);
24    m_rasterizer.rasterize(blit.getBlitFunc);
25    ---
26 */
27 
struct BiradialBlit
{
    /** Construct a Biradial blitter.

    Params:
    pixels = pointer to a 32 bpp pixel buffer
    stride = buffer width in pixels
    height = buffer height in pixels

    Note: buffer must be 16 byte aligned, stride must be a multiple of 4
    */

    this(uint* pixels, int stride, int height)
    {
        // Check the alignment on the full-width address (size_t) so the
        // pointer is not truncated on 64 bit targets.
        assert(((cast(size_t)pixels) & 15) == 0); // must be 16 byte aligned
        assert((stride & 3) == 0);                // multiple of 4 pixels (16 bytes)
        assert(height > 0);
        this.pixels = pixels;
        this.stride = stride;
        this.height = height;
    }

    /**
      Set the paint options.
      Params:
      gradient = colour gradient to use, its lookup length must be a power of 2
      wrule  = winding rule
      rmode  = repeat mode, Pad, Repeat or Mirror modes are supported.

      Notes: If the focus circle is not fully enclosed by the main circle there will be
        areas that are undefined in terms of where they map to on the gradient
        axis. These areas are filled with the end colour from the gradient.
    */

    void setPaint(Gradient gradient, WindingRule wrule, RepeatMode rmode)
    {
        assert(gradient !is null);
        // the blitter masks lookup indices with (lookupLength - 1)
        assert(isPow2(gradient.lookupLength));
        this.gradient = gradient;
        this.windingRule = wrule;
        this.repeatMode = rmode;
    }

    /**
      Set the focus and main circles.
      Params:
      x0 = focus circle centre x
      y0 = focus circle centre y
      r0 = focus circle radius
      x1 = main circle centre x
      y1 = main circle centre y
      r1 = main circle radius

      Stores the focus circle plus the deltas to the main circle, and records
      whether the squared distance between the centres is less than the
      squared difference of the radii (isEnclosed), which selects the blit
      variant returned by getBlitFunc.
    */

    void setCoords(float x0, float y0, float r0, float x1, float y1, float r1)
    {
        dx = x1-x0;
        dy = y1-y0;
        dr = r1-r0;
        fx = x0; // note use fx,fy as focus x,y to avoid clashing with x0 in blit method
        fy = y0;
        fr = r0;
        isEnclosed = (sqr(dx)+sqr(dy)) < sqr(dr);
    }

    /**
      Get the BlitFunc for use by the rasterizer.

      Picks the biradial_blit instantiation matching the current isEnclosed
      flag, winding rule and repeat mode. Call after setPaint and setCoords,
      the selection is made from the state at the time of the call.
    */

    BlitFunc getBlitFunc() return
    {
        if (isEnclosed)
        {
            if (windingRule == WindingRule.NonZero)
            {
                switch(repeatMode)
                {
                    case RepeatMode.Pad: return &biradial_blit!(WindingRule.NonZero,RepeatMode.Pad,true);
                    case RepeatMode.Repeat: return &biradial_blit!(WindingRule.NonZero,RepeatMode.Repeat,true);
                    case RepeatMode.Mirror: return &biradial_blit!(WindingRule.NonZero,RepeatMode.Mirror,true);
                    default: assert(0);
                }
            }
            else
            {
                switch(repeatMode)
                {
                    case RepeatMode.Pad: return &biradial_blit!(WindingRule.EvenOdd,RepeatMode.Pad,true);
                    case RepeatMode.Repeat: return &biradial_blit!(WindingRule.EvenOdd,RepeatMode.Repeat,true);
                    case RepeatMode.Mirror: return &biradial_blit!(WindingRule.EvenOdd,RepeatMode.Mirror,true);
                    default: assert(0);
                }
            }
        }
        else
        {
            if (windingRule == WindingRule.NonZero)
            {
                switch(repeatMode)
                {
                    case RepeatMode.Pad: return &biradial_blit!(WindingRule.NonZero,RepeatMode.Pad,false);
                    case RepeatMode.Repeat: return &biradial_blit!(WindingRule.NonZero,RepeatMode.Repeat,false);
                    case RepeatMode.Mirror: return &biradial_blit!(WindingRule.NonZero,RepeatMode.Mirror,false);
                    default: assert(0);
                }
            }
            else
            {
                switch(repeatMode)
                {
                    case RepeatMode.Pad: return &biradial_blit!(WindingRule.EvenOdd,RepeatMode.Pad,false);
                    case RepeatMode.Repeat: return &biradial_blit!(WindingRule.EvenOdd,RepeatMode.Repeat,false);
                    case RepeatMode.Mirror: return &biradial_blit!(WindingRule.EvenOdd,RepeatMode.Mirror,false);
                    default: assert(0);
                }
            }
        }
    }

private:

    /*
      Blit one scanline, processing pixels in groups of 4.

      Template params:
      wr   = winding rule used to turn winding values into coverage
      mode = repeat mode applied to the gradient lookup index
      isEnclosed = compile time copy of the isEnclosed field (it shadows the
                   field inside this method). When false a pad mask is
                   generated so undefined areas use the last lookup entry.

      Params:
      delta = per pixel winding deltas for the scanline. They are summed
              along the line here and zeroed once consumed.
      mask  = passed to nextSetBit to find the next 4 pixel group that needs
              integrating (presumably a bitmask of groups with non zero
              deltas, confirm against the rasterizer)
      x0,x1 = start and end x of the span, both must be multiples of 4
      y     = scanline index

      For each pixel the gradient position t is found by solving
      A*t^2 + B*t + C = 0 where A,B,C are coefA, xmcoefB and xmc below.
    */

    void biradial_blit(WindingRule wr, RepeatMode mode, bool isEnclosed)
                    (int* delta, DMWord* mask, int x0, int x1, int y)
    {
        assert(x0 >= 0);
        assert(x1 <= stride);
        assert(y >= 0);
        assert(y < height);
        assert((x0 & 3) == 0);
        assert((x1 & 3) == 0);

        // main blit variables

        int bpos = x0 / 4;      // current position in 4 pixel groups
        int endbit = x1 / 4;    // end position in 4 pixel groups
        uint* dest = &pixels[y*stride];
        __m128i xmWinding = 0;  // running winding value carried between groups
        uint* lut = gradient.getLookup.ptr;
        __m128i lutmsk = gradient.lookupLength - 1;
        __m128i lutmsk2 = gradient.lookupLength*2 - 1;
        __m128 xmgradlen = _mm_set1_ps(gradient.lookupLength);

        // XMM constants

        immutable __m128i XMZERO = 0;

        // paint variables

        // note that variable names have changed from doc in the notes folder...
        // fx,fy,fr are the focus circle instead of (x0,y0,r0).
        // (x0,y) is the point of interest instead of (x,y)

        // NOTE(review): coefA is zero when sqr(dx)+sqr(dy) == sqr(dr), which
        // makes xmq0 below infinite. Confirm callers avoid that configuration.

        float coefA = sqr(dx) + sqr(dy) - sqr(dr);
        float coefB = 2*fx*dx - 2*dx*x0 + 2*dy*fy - 2*dy*y - 2*fr*dr;
        float coefC = sqr(y) + sqr(fx) + sqr(fy) - 2*fy*y - sqr(fr);

        // coefB is linear in x, so it is stepped by -2*dx per pixel, that is
        // -2*dx*4 per group of 4. The x dependant part of C is rebuilt from
        // xmposx for every group.

        __m128 xmstepB = _mm_set1_ps(-2*dx*4);
        __m128 xmseqx = _mm_setr_ps(0.0f,1.0f,2.0f,3.0f);
        __m128 xmposx = _mm_set1_ps(x0) + xmseqx;
        __m128 xmstepx = _mm_set1_ps(4.0f);
        __m128 xm2x0 = _mm_set1_ps(2*fx);
        __m128 xmcoefB = _mm_set1_ps(coefB) + xmseqx * _mm_set1_ps(-2*dx);
        __m128 xmcoefC = _mm_set1_ps(coefC);

        __m128 xmq0 = _mm_set1_ps(-0.5/coefA); // -1/(2A)
        __m128 xmq1 = _mm_set1_ps(4*coefA);    // 4A, for the discriminant

        // main loop

        while (bpos < endbit)
        {
            int nsb = nextSetBit(mask, bpos, endbit);

            // do we have a span of unchanging coverage?

            if (bpos < nsb)
            {
                // Calc coverage of first pixel

                int cover = calcCoverage!wr(xmWinding[3]+delta[bpos*4]);

                // We can skip the span, but the per pixel paint variables
                // still have to be advanced past it

                if (cover < 0x100)
                {
                    __m128 xskip = _mm_set1_ps(nsb-bpos);
                    xmcoefB = xmcoefB + _mm_mul_ps(xskip,xmstepB);
                    xmposx = xmposx + _mm_mul_ps(xskip,xmstepx);
                    bpos = nsb;
                }

                // Or fill span with solid color

                else if (gradient.isOpaque && (cover > 0xFF00))
                {
                    uint* ptr = &dest[bpos*4];
                    uint* end = ptr + ((nsb-bpos)*4);

                    while (ptr < end)
                    {
                        // t = -(sqrt(B*B - 4*A*C) + B) / (2*A)

                        __m128 xmc = xmposx*(xmposx-xm2x0) + xmcoefC;
                        __m128 xmdiscr = xmcoefB*xmcoefB - xmq1*xmc;
                        __m128 xmsqrtd = _mm_sqrt_ps(xmdiscr);
                        __m128 xmt = xmq0 * (xmsqrtd + xmcoefB);

                        // Generate pad mask if needed, used to control the colour in
                        // undefined / out of bounds areas. A lane is all ones when
                        // the sign bit of xmdiscr or of xmt2 is set.

                        static if (isEnclosed == false)
                        {
                            __m128 xmt2 = (xmsqrtd - xmcoefB);
                            __m128i padMask = _mm_or_si128(cast(__m128i) xmdiscr, cast(__m128i) xmt2);
                            padMask = _mm_srai_epi32(padMask,31);
                        }

                        xmcoefB += xmstepB;
                        xmposx += xmstepx;

                        __m128i ipos = _mm_cvtps_epi32 (xmt * xmgradlen);
                        ipos = calcRepeatModeIDX!mode(ipos, lutmsk, lutmsk2);

                        // set ipos to max for undefined areas

                        static if (isEnclosed == false)
                        {
                            ipos = _mm_or_si128(ipos, padMask & lutmsk);
                        }

                        // gather the four gradient colours and write them out

                        __m128i tmp;
                        tmp[0] = lut[ipos.array[0]];
                        tmp[1] = lut[ipos.array[1]];
                        tmp[2] = lut[ipos.array[2]];
                        tmp[3] = lut[ipos.array[3]];

                        _mm_store_si128 (cast(__m128i*)ptr,tmp);

                        ptr+=4;
                    }

                    bpos = nsb;
                }

                // Or fill span with transparent color

                else
                {
                    __m128i xmcover = _mm_set1_epi16 (cast(ushort) cover);

                    uint* ptr = &dest[bpos*4];
                    uint* end = &dest[nsb*4];

                    while (ptr < end)
                    {

                        __m128 xmc = xmposx*(xmposx-xm2x0) + xmcoefC;
                        __m128 xmdiscr = xmcoefB*xmcoefB - xmq1*xmc;
                        __m128 xmsqrtd = _mm_sqrt_ps(xmdiscr);
                        __m128 xmt = xmq0 * (xmsqrtd + xmcoefB);

                        // Generate pad mask if needed, used to control the colour in
                        // any undefined areas

                        static if (isEnclosed == false)
                        {
                            __m128 xmt2 = (xmsqrtd - xmcoefB);
                            __m128i padMask = _mm_or_si128(cast(__m128i) xmdiscr, cast(__m128i) xmt2);
                            padMask = _mm_srai_epi32(padMask,31);
                        }

                        xmcoefB += xmstepB;
                        xmposx += xmstepx;

                        __m128i ipos = _mm_cvtps_epi32 (xmt * xmgradlen);
                        ipos = calcRepeatModeIDX!mode(ipos, lutmsk, lutmsk2);

                        // set ipos to max for undefined areas

                        static if (isEnclosed == false)
                        {
                            ipos = _mm_or_si128(ipos, padMask & lutmsk);
                        }

                        // load destination pixels, widened to 16 bits per channel

                        __m128i d0 = _mm_load_si128(cast(__m128i*)ptr);
                        __m128i d1 = _mm_unpackhi_epi8(d0,d0);
                        d0 = _mm_unpacklo_epi8(d0,d0);

                        // load grad colours and alpha

                        __m128i c0 = _mm_loadu_si32 (&lut[ipos.array[0]]);
                        __m128i tmpc0 = _mm_loadu_si32 (&lut[ipos.array[1]]);
                        c0 = _mm_unpacklo_epi32 (c0, tmpc0);
                        c0 = _mm_unpacklo_epi8 (c0, c0);

                        __m128i a0 = _mm_mulhi_epu16(c0,xmcover);

                        __m128i c1 = _mm_loadu_si32 (&lut[ipos.array[2]]);
                        __m128i tmpc1 = _mm_loadu_si32 (&lut[ipos.array[3]]);
                        c1 = _mm_unpacklo_epi32 (c1, tmpc1);
                        c1 = _mm_unpacklo_epi8 (c1, c1);

                        __m128i a1 = _mm_mulhi_epu16(c1,xmcover);

                        // unpack alpha, broadcast the top channel of each pixel

                        a0 = _mm_shufflelo_epi16!255(a0);
                        a0 = _mm_shufflehi_epi16!255(a0);
                        a1 = _mm_shufflelo_epi16!255(a1);
                        a1 = _mm_shufflehi_epi16!255(a1);

                        // alpha*source + dest - alpha*dest

                        c0 = _mm_mulhi_epu16 (c0,a0);
                        c1 = _mm_mulhi_epu16 (c1,a1);
                        c0 = _mm_add_epi16 (c0,d0);
                        c1 = _mm_add_epi16 (c1,d1);
                        d0 = _mm_mulhi_epu16 (d0,a0);
                        d1 = _mm_mulhi_epu16 (d1,a1);
                        c0 =  _mm_sub_epi16 (c0,d0);
                        c1 =  _mm_sub_epi16 (c1,d1);
                        c0 = _mm_srli_epi16 (c0,8);
                        c1 = _mm_srli_epi16 (c1,8);

                        d0 = _mm_packus_epi16 (c0,c1);

                        _mm_store_si128 (cast(__m128i*)ptr,d0);

                        ptr+=4;
                    }

                    bpos = nsb;
                }
            }

            // At this point we need to integrate scandelta

            uint* ptr = &dest[bpos*4];
            int* dlptr = &delta[bpos*4];

            while (bpos < endbit)
            {
                __m128 xmc = xmposx*(xmposx-xm2x0) + xmcoefC;
                __m128 xmdiscr = xmcoefB*xmcoefB - xmq1*xmc;
                __m128 xmsqrtd = _mm_sqrt_ps(xmdiscr);
                __m128 xmt = xmq0 * (xmsqrtd + xmcoefB);

                // Generate pad mask if needed, used to control the colour in
                // any undefined areas

                static if (isEnclosed == false)
                {
                    __m128 xmt2 = (xmsqrtd - xmcoefB);
                    __m128i padMask = _mm_or_si128(cast(__m128i) xmdiscr, cast(__m128i) xmt2);
                    padMask = _mm_srai_epi32(padMask,31);
                }

                xmcoefB += xmstepB;
                xmposx += xmstepx;

                __m128i ipos = _mm_cvtps_epi32 (xmt * xmgradlen);
                ipos = calcRepeatModeIDX!mode(ipos, lutmsk, lutmsk2);

                // set ipos to max for undefined areas

                static if (isEnclosed == false)
                {
                    ipos = _mm_or_si128(ipos, padMask & lutmsk);
                }

                // Integrate delta values: prefix sum of the 4 deltas plus the
                // winding carried in, the last lane becomes the new carry and
                // the consumed deltas are zeroed.

                __m128i idv = _mm_load_si128(cast(__m128i*)dlptr);
                idv = _mm_add_epi32(idv, _mm_slli_si128!4(idv));
                idv = _mm_add_epi32(idv, _mm_slli_si128!8(idv));
                idv = _mm_add_epi32(idv, xmWinding);
                xmWinding = _mm_shuffle_epi32!255(idv);
                _mm_store_si128(cast(__m128i*)dlptr,XMZERO);

                // calculate coverage from winding

                __m128i xmcover = calcCoverage32!wr(idv);

                // Load destination pixels

                __m128i d0 = _mm_load_si128(cast(__m128i*)ptr);
                __m128i d1 = _mm_unpackhi_epi8(d0,d0);
                d0 = _mm_unpacklo_epi8(d0,d0);

                // load grad colors

                __m128i c0 = _mm_loadu_si32 (&lut[ipos.array[0]]);
                __m128i tmpc0 = _mm_loadu_si32 (&lut[ipos.array[1]]);
                c0 = _mm_unpacklo_epi32 (c0, tmpc0);
                c0 = _mm_unpacklo_epi8 (c0, c0);

                __m128i a0 = _mm_unpacklo_epi32(xmcover,xmcover);
                a0 = _mm_mulhi_epu16(a0, c0);

                __m128i c1 = _mm_loadu_si32 (&lut[ipos.array[2]]);
                __m128i tmpc1 = _mm_loadu_si32 (&lut[ipos.array[3]]);
                c1 = _mm_unpacklo_epi32 (c1, tmpc1);
                c1 = _mm_unpacklo_epi8 (c1, c1);

                __m128i a1 = _mm_unpackhi_epi32(xmcover,xmcover);
                a1 = _mm_mulhi_epu16(a1, c1);

                // unpack alpha

                a0 = _mm_shufflelo_epi16!255(a0);
                a0 = _mm_shufflehi_epi16!255(a0);
                a1 = _mm_shufflelo_epi16!255(a1);
                a1 = _mm_shufflehi_epi16!255(a1);

                // alpha*source + dest - alpha*dest

                c0 = _mm_mulhi_epu16 (c0,a0);
                c1 = _mm_mulhi_epu16 (c1,a1);
                c0 = _mm_add_epi16 (c0,d0);
                c1 = _mm_add_epi16 (c1,d1);
                d0 = _mm_mulhi_epu16 (d0,a0);
                d1 = _mm_mulhi_epu16 (d1,a1);
                c0 =  _mm_sub_epi16 (c0, d0);
                c1 =  _mm_sub_epi16 (c1, d1);
                c0 = _mm_srli_epi16 (c0,8);
                c1 = _mm_srli_epi16 (c1,8);

                d0 = _mm_packus_epi16 (c0,c1);

                _mm_store_si128 (cast(__m128i*)ptr,d0);

                bpos++;
                ptr+=4;
                dlptr+=4;

                // Stop integrating once the next 4 deltas are all zero and
                // go back to the span handling in the outer loop.

                if (((cast(ulong*)dlptr)[0] | (cast(ulong*)dlptr)[1]) == 0)  break;
            }
        }
    }

private:

    uint* pixels;            // destination buffer, 32 bpp
    int stride;              // buffer width in pixels
    int height;              // buffer height in pixels
    float fx,fy,fr;          // focus circle centre and radius
    float dx,dy,dr;          // main circle minus focus circle
    Gradient gradient;
    WindingRule windingRule;
    RepeatMode repeatMode;
    bool isEnclosed;         // set by setCoords, selects the blit variant
}
470