1 /**
2   Blitter for painting linear gradients.
3 
4   Copyright Chris Jones 2020.
5   Distributed under the Boost Software License, Version 1.0.
6   See accompanying file Licence.txt or copy at...
7   https://www.boost.org/LICENSE_1_0.txt
8 */
9 
10 module dg2d.linearblit;
11 
12 import dg2d.rasterizer;
13 import dg2d.gradient;
14 import dg2d.misc;
15 import dg2d.blitex;
16 
/*
   Linear gradient blitter struct.

   You set up the properties and pass the BlitFunc to the rasterizer.

   ---
   auto lblit = LinearBlit(m_pixels,m_stride,m_height);
   lblit.setPaint(grad, wr, RepeatMode.Mirror);
   lblit.setCoords(x0,y0,x1,y1);
   m_rasterizer.rasterize(lblit.getBlitFunc);
   ---
*/
29 
30 
struct LinearBlit
{
    /** Construct a linear gradient blitter.
    pixels - pointer to a 32 bpp pixel buffer
    stride - buffer width in pixels
    height - buffer height in pixels

    note: buffer must be 16 byte aligned, stride must be multiple of 4
    */

    this(uint* pixels, int stride, int height)
    {
        // Use size_t so the pointer is not truncated on 64 bit targets
        // (matches the alignment check done in linear_blit).
        assert(((cast(size_t)pixels) & 15) == 0); // must be 16 byte aligned
        assert((stride & 3) == 0);                // multiple of 4 pixels, i.e. 16 bytes
        assert(height > 0);
        this.pixels = pixels;
        this.stride = stride;
        this.height = height;
    }

    /** Set the gradient, winding rule and repeat mode.

    grad  - gradient to paint with, must not be null and its lookup
            length must be a power of two
    wrule - winding rule used to turn winding values into coverage
    rmode - how the gradient behaves outside its start / end points

    note: call this before setCoords, which reads the lookup length
    */

    void setPaint(Gradient grad, WindingRule wrule, RepeatMode rmode)
    {
        assert(grad !is null);
        assert(isPow2(grad.lookupLength));
        gradient = grad;
        windingRule = wrule;
        repeatMode = rmode;
    }

    /** Set the coordinates for the start and end point of the linear gradient.

    (x0,y0) - start point, maps to lookup position 0
    (x1,y1) - end point, maps to lookup position gradient.lookupLength

    The per pixel steps are lookupLength * (w,h) / (w*w + h*h), so that the
    lookup position of a pixel is the dot product of its offset from the
    start point with (xstep,ystep).

    note: setPaint must have been called first
    */

    void setCoords(float x0, float y0, float x1, float y1)
    {
        // The step scale depends on the gradient's lookup length, so a
        // gradient must already be set.
        assert(gradient !is null);

        xctr = x0;
        yctr = y0;
        float w = x1-x0;
        float h = y1-y0;
        float hsq = w*w + h*h;
        if (hsq < 0.1) hsq = 0.1; // avoid div by zero
        xstep = gradient.lookupLength * w / hsq;
        ystep = gradient.lookupLength * h / hsq;
    }

    /** Returns a BlitFunc for use by the rasterizer.

    Picks the linear_blit instantiation that matches the winding rule and
    repeat mode given to setPaint. Any winding rule other than NonZero is
    treated as EvenOdd; an unknown repeat mode is a hard failure.
    */

    BlitFunc getBlitFunc() return
    {
        if (windingRule == WindingRule.NonZero)
        {
            switch(repeatMode)
            {
                case RepeatMode.Pad: return &linear_blit!(WindingRule.NonZero,RepeatMode.Pad);
                case RepeatMode.Repeat: return &linear_blit!(WindingRule.NonZero,RepeatMode.Repeat);
                case RepeatMode.Mirror: return &linear_blit!(WindingRule.NonZero,RepeatMode.Mirror);
                default: assert(0);
            }
        }
        else
        {
            switch(repeatMode)
            {
                case RepeatMode.Pad: return &linear_blit!(WindingRule.EvenOdd,RepeatMode.Pad);
                case RepeatMode.Repeat: return &linear_blit!(WindingRule.EvenOdd,RepeatMode.Repeat);
                case RepeatMode.Mirror: return &linear_blit!(WindingRule.EvenOdd,RepeatMode.Mirror);
                default: assert(0);
            }
        }
    }

private:

    /*
       Blit one scanline of the gradient.

       delta - per pixel winding deltas for the scanline, 16 byte aligned.
               They are integrated (running sum) here and each group of
               four is zeroed once it has been consumed.
       mask  - handed to nextSetBit to find the next group of 4 pixels that
               needs its deltas integrated (presumably one bit per group
               with non-zero deltas, confirm against the rasterizer).
       x0,x1 - pixel range to paint, both multiples of 4, 0 <= x0, x1 <= stride
       y     - scanline index, 0 <= y < height

       Work is done in groups of 4 pixels. bpos / endbit are group indices,
       so the pixel index is bpos*4.
    */

    void linear_blit(WindingRule wr, RepeatMode mode)(int* delta, DMWord* mask, int x0, int x1, int y)
    {
        assert( ( cast(size_t)delta & 15 ) == 0 );
        assert(x0 >= 0);
        assert(x1 <= stride);
        assert(y >= 0);
        assert(y < height);
        assert((x0 & 3) == 0);
        assert((x1 & 3) == 0);

        // main blit variables

        int bpos = x0 / 4;
        int endbit = x1 / 4;
        uint* dest = &pixels[y*stride];
        __m128i xmWinding = 0;   // lane 3 carries the running winding value
        uint* lut = gradient.getLookup.ptr;
        __m128i lutmsk = gradient.lookupLength - 1;
        __m128i lutmsk2 = gradient.lookupLength*2 - 1;

        // XMM constants

        immutable __m128i XMZERO = 0;

        // paint variables
        //
        // t0 is the lookup position of the first pixel, xmT0 holds the
        // positions of the current 4 pixels and xmStep0 advances them by
        // one group (4 pixels).

        float t0 = (bpos*4-xctr)*xstep + (y-yctr)*ystep;
        __m128 xmT0 = _mm_mul_ps(_mm_set1_ps(xstep), _mm_setr_ps(0.0f,1.0f,2.0f,3.0f));
        xmT0 = _mm_add_ps(xmT0, _mm_set1_ps(t0));
        __m128 xmStep0 = _mm_set1_ps(xstep*4);

        // main loop

        while (bpos < endbit)
        {
            int nsb = nextSetBit(mask, bpos, endbit);

            // do we have a span of unchanging coverage?

            if (bpos < nsb)
            {
                // Calc coverage of first pixel, it is the same for the
                // whole span

                int cover = calcCoverage!wr(xmWinding[3]+delta[bpos*4]);

                // We can skip the span, just advance the gradient position

                if (cover < 0x100)
                {
                    __m128 xskip = _mm_set1_ps(nsb-bpos);
                    xmT0 = _mm_add_ps(xmT0, _mm_mul_ps(xskip,xmStep0));
                    bpos = nsb;
                }

                // Or fill span with solid color, the gradient is opaque and
                // the coverage is full so the lookup values are copied
                // straight to the destination

                else if (gradient.isOpaque && (cover > 0xFF00))
                {
                    uint* ptr = &dest[bpos*4];
                    uint* end = ptr + ((nsb-bpos)*4);

                    while (ptr < end)
                    {
                        __m128i ipos = _mm_cvtps_epi32 (xmT0);
                        xmT0 = xmT0 + xmStep0;

                        ipos = calcRepeatModeIDX!mode(ipos, lutmsk, lutmsk2);

                        ptr[0] = lut[ipos.array[0]];
                        ptr[1] = lut[ipos.array[1]];
                        ptr[2] = lut[ipos.array[2]];
                        ptr[3] = lut[ipos.array[3]];

                        ptr+=4;
                    }

                    bpos = nsb;
                }

                // Or fill span with transparent color, blending the
                // gradient into the destination with constant coverage

                else
                {
                    __m128i xmcover = _mm_set1_epi16 (cast(ushort) cover);

                    uint* ptr = &dest[bpos*4];
                    uint* end = &dest[nsb*4];

                    while (ptr < end)
                    {
                        __m128i ipos = _mm_cvtps_epi32 (xmT0);
                        xmT0 = xmT0 + xmStep0;

                        ipos = calcRepeatModeIDX!mode(ipos, lutmsk, lutmsk2);

                        // load destination pixels, each byte is widened to
                        // 16 bits by duplicating it (d0 = pixels 0-1,
                        // d1 = pixels 2-3)

                        __m128i d0 = _mm_load_si128(cast(__m128i*)ptr);
                        __m128i d1 = _mm_unpackhi_epi8(d0,d0);
                        d0 = _mm_unpacklo_epi8(d0,d0);

                        // load grad colors and alpha, widened the same way
                        // (c0 = pixels 0-1, c1 = pixels 2-3), and scale
                        // them by the coverage

                        __m128i c0 = _mm_loadu_si32 (&lut[ipos.array[0]]);
                        __m128i tmpc0 = _mm_loadu_si32 (&lut[ipos.array[1]]);
                        c0 = _mm_unpacklo_epi32 (c0, tmpc0);
                        c0 = _mm_unpacklo_epi8 (c0, c0);

                        __m128i a0 = _mm_mulhi_epu16(c0,xmcover);

                        __m128i c1 = _mm_loadu_si32 (&lut[ipos.array[2]]);
                        __m128i tmpc1 = _mm_loadu_si32 (&lut[ipos.array[3]]);
                        c1 = _mm_unpacklo_epi32 (c1, tmpc1);
                        c1 = _mm_unpacklo_epi8 (c1, c1);

                        __m128i a1 = _mm_mulhi_epu16(c1,xmcover);

                        // unpack alpha, broadcast word 3 of each pixel (the
                        // top byte of the 32 bit color) to all 4 channels

                        a0 = _mm_shufflelo_epi16!255(a0);
                        a0 = _mm_shufflehi_epi16!255(a0);
                        a1 = _mm_shufflelo_epi16!255(a1);
                        a1 = _mm_shufflehi_epi16!255(a1);

                        // alpha*source + dest - alpha*dest

                        c0 = _mm_mulhi_epu16 (c0,a0);
                        c1 = _mm_mulhi_epu16 (c1,a1);
                        c0 = _mm_add_epi16 (c0,d0);
                        c1 = _mm_add_epi16 (c1,d1);
                        d0 = _mm_mulhi_epu16 (d0,a0);
                        d1 = _mm_mulhi_epu16 (d1,a1);
                        c0 =  _mm_sub_epi16 (c0,d0);
                        c1 =  _mm_sub_epi16 (c1,d1);
                        c0 = _mm_srli_epi16 (c0,8);
                        c1 = _mm_srli_epi16 (c1,8);

                        // narrow back to 8 bits per channel and store

                        d0 = _mm_packus_epi16 (c0,c1);

                        _mm_store_si128 (cast(__m128i*)ptr,d0);

                        ptr+=4;
                    }

                    bpos = nsb;
                }
            }

            // At this point we need to integrate scandelta

            uint* ptr = &dest[bpos*4];
            int* dlptr = &delta[bpos*4];

            while (bpos < endbit)
            {
                // Integrate delta values, a prefix sum over the 4 lanes plus
                // the winding carried over from the previous group. Lane 3
                // of the result becomes the new carry, and the consumed
                // deltas are zeroed.

                __m128i idv = _mm_load_si128(cast(__m128i*)dlptr);
                idv = _mm_add_epi32(idv, _mm_slli_si128!4(idv));
                idv = _mm_add_epi32(idv, _mm_slli_si128!8(idv));
                idv = _mm_add_epi32(idv, xmWinding);
                xmWinding = _mm_shuffle_epi32!255(idv);
                _mm_store_si128(cast(__m128i*)dlptr,XMZERO);

                // calculate coverage from winding

                __m128i xmcover = calcCoverage32!wr(idv);

                // convert grad pos to integer

                __m128i ipos = _mm_cvtps_epi32 (xmT0);
                xmT0 = xmT0 + xmStep0;

                ipos = calcRepeatModeIDX!mode(ipos, lutmsk, lutmsk2);

                // Load destination pixels, each byte is widened to 16 bits
                // by duplicating it (d0 = pixels 0-1, d1 = pixels 2-3)

                __m128i d0 = _mm_load_si128(cast(__m128i*)ptr);
                __m128i d1 = _mm_unpackhi_epi8(d0,d0);
                d0 = _mm_unpacklo_epi8(d0,d0);

                // load grad colors and alpha, and scale them by the per
                // pixel coverage (a0 uses coverage lanes 0-1, a1 lanes 2-3)

                __m128i c0 = _mm_loadu_si32 (&lut[ipos.array[0]]);
                __m128i tmpc0 = _mm_loadu_si32 (&lut[ipos.array[1]]);
                c0 = _mm_unpacklo_epi32 (c0, tmpc0);
                c0 = _mm_unpacklo_epi8 (c0, c0);

                __m128i a0 = _mm_unpacklo_epi32(xmcover,xmcover);
                a0 = _mm_mulhi_epu16(a0, c0);

                __m128i c1 = _mm_loadu_si32 (&lut[ipos.array[2]]);
                __m128i tmpc1 = _mm_loadu_si32 (&lut[ipos.array[3]]);
                c1 = _mm_unpacklo_epi32 (c1, tmpc1);
                c1 = _mm_unpacklo_epi8 (c1, c1);

                __m128i a1 = _mm_unpackhi_epi32(xmcover,xmcover);
                a1 = _mm_mulhi_epu16(a1, c1);

                // unpack alpha, broadcast word 3 of each pixel (the top
                // byte of the 32 bit color) to all 4 channels

                a0 = _mm_shufflelo_epi16!255(a0);
                a0 = _mm_shufflehi_epi16!255(a0);
                a1 = _mm_shufflelo_epi16!255(a1);
                a1 = _mm_shufflehi_epi16!255(a1);

                // alpha*source + dest - alpha*dest

                c0 = _mm_mulhi_epu16 (c0,a0);
                c1 = _mm_mulhi_epu16 (c1,a1);
                c0 = _mm_add_epi16 (c0,d0);
                c1 = _mm_add_epi16 (c1,d1);
                d0 = _mm_mulhi_epu16 (d0,a0);
                d1 = _mm_mulhi_epu16 (d1,a1);
                c0 =  _mm_sub_epi16 (c0, d0);
                c1 =  _mm_sub_epi16 (c1, d1);
                c0 = _mm_srli_epi16 (c0,8);
                c1 = _mm_srli_epi16 (c1,8);

                // narrow back to 8 bits per channel and store

                d0 = _mm_packus_epi16 (c0,c1);

                _mm_store_si128 (cast(__m128i*)ptr,d0);

                bpos++;
                ptr+=4;
                dlptr+=4;

                // Stop integrating once the next 4 deltas are all zero, the
                // outer loop then looks for a constant coverage span again.
                // NOTE(review): this reads 16 bytes at dlptr even when bpos
                // has reached endbit; presumably the rasterizer's delta
                // buffer is padded past x1 - confirm.

                if (((cast(ulong*)dlptr)[0] | (cast(ulong*)dlptr)[1]) == 0)  break;
            }
        }
    }

    // Member variables

    uint* pixels;               // destination buffer, 32 bpp
    int stride;                 // buffer width in pixels
    int height;                 // buffer height in pixels
    float xctr,yctr,xstep,ystep; // gradient start point and per pixel lookup steps
    Gradient gradient;
    WindingRule windingRule;
    RepeatMode repeatMode;
}
349 
350