/**
  Blitter for painting angular gradients.

  Copyright: Chris Jones
  License: Boost Software License, Version 1.0
  Authors: Chris Jones
*/

module dg2d.angularblit;

import dg2d.rasterizer;
import dg2d.gradient;
import dg2d.misc;
import dg2d.blitex;

/**
  Angular gradient blitter struct.

  ---
  auto ablit = AngularBlit(m_pixels,m_stride,m_height);
  ablit.setPaint(grad, wr, RepeatMode.Mirror, 4.0f);
  ablit.setCoords(x0,y0,x1,y1,x2,y2);
  rasterizer.rasterize(ablit.getBlitFunc);
  ---
*/

struct AngularBlit
{
    /** Construct an Angular blitter.
    pixels - pointer to a 32 bpp pixel buffer
    stride - buffer width in pixels
    height - buffer height in pixels

    note: buffer must be 16 byte aligned, stride must be multiple of 4
    */

    this(uint* pixels, int stride, int height)
    {
        // Cast through size_t so the full pointer value is inspected; a cast
        // to uint would truncate the address on 64 bit targets.
        assert(((cast(size_t)pixels) & 15) == 0); // must be 16 byte aligned
        assert((stride & 3) == 0);                // stride must be 16 byte aligned
        assert(height > 0);
        this.pixels = pixels;
        this.stride = stride;
        this.height = height;
    }

    /** Set the gradient, winding rule and repeat mode. "numRepeats" sets how many times
    the gradient repeats in 360 degrees.

    grad must be non-null and its lookup table length must be a power of two.
    */

    void setPaint(Gradient grad, WindingRule wrule, RepeatMode rmode, float numRepeats)
    {
        assert(grad !is null);
        assert(isPow2(grad.lookupLength));
        gradient = grad;
        windingRule = wrule;
        repeatMode = rmode;
        this.numRepeats = numRepeats;
    }

    /** Specify the orientation in terms of an ellipse, for that we need 3 points...
    (x0,y0) is the center of the ellipse
    (x1,y1) is radius at 0 degrees
    (x2,y2) is radius at 90 degrees
    The radii don't need to be at right angles, so it can handle an ellipse that has
    been through any affine transform.
    */

    void setCoords(float x0, float y0, float x1, float y1, float x2, float y2)
    {
        xctr = x0;
        yctr = y0;
        float w0 = x1-x0;
        float h0 = y1-y0;
        float w1 = x2-x0;
        float h1 = y2-y0;
        float q = w1*h0 - w0*h1;
        // Keep the divisor away from zero (degenerate / collinear radii),
        // preserving its sign.
        if (abs(q) < 0.1) q = (q < 0) ? -0.1 : 0.1;
        xstep0 = -h1 / q;
        ystep0 = w1 / q;
        xstep1 = h0 / q;
        ystep1 = -w0 / q;
    }

    /** Specify the orientation in terms of a circle, for that we need two points,
    (x0,y0) is the center of the circle
    (x1,y1) is radius at 0 degrees

    The second radius is derived from the first as
    (x0-(y1-y0), y0+(x1-x0)) and passed to the three point overload.
    */

    void setCoords(float x0, float y0, float x1, float y1)
    {
        setCoords(x0,y0,x1,y1,x0-y1+y0,y0+x1-x0);
    }

    /** Returns a BlitFunc for use by the rasterizer. The function returned is the
    angular_blit instantiation matching the winding rule and repeat mode given
    to setPaint. Any winding rule other than NonZero selects the EvenOdd
    instantiation; an unhandled repeat mode hits assert(0).
    */

    BlitFunc getBlitFunc() return
    {
        if (windingRule == WindingRule.NonZero)
        {
            switch(repeatMode)
            {
                case RepeatMode.Pad:    return &angular_blit!(WindingRule.NonZero,RepeatMode.Pad);
                case RepeatMode.Repeat: return &angular_blit!(WindingRule.NonZero,RepeatMode.Repeat);
                case RepeatMode.Mirror: return &angular_blit!(WindingRule.NonZero,RepeatMode.Mirror);
                default: assert(0);
            }
        }
        else
        {
            switch(repeatMode)
            {
                case RepeatMode.Pad:    return &angular_blit!(WindingRule.EvenOdd,RepeatMode.Pad);
                case RepeatMode.Repeat: return &angular_blit!(WindingRule.EvenOdd,RepeatMode.Repeat);
                case RepeatMode.Mirror: return &angular_blit!(WindingRule.EvenOdd,RepeatMode.Mirror);
                default: assert(0);
            }
        }
    }

private:

    /** Blit one scanline.

    delta - per pixel winding deltas for the scanline; integrated here and
            zeroed as they are consumed
    mask  - passed to nextSetBit to find the next block that needs delta
            integration (NOTE(review): presumably one bit per 4 pixel block;
            confirm against the rasterizer)
    x0,x1 - horizontal extent in pixels, both must be multiples of 4 and lie
            within [0,stride]
    y     - scanline index, must lie within [0,height)

    Pixels are processed in blocks of four; bpos / endbit are block indices.
    */

    void angular_blit(WindingRule wr, RepeatMode mode)(int* delta, DMWord* mask, int x0, int x1, int y)
    {
        assert(x0 >= 0);
        assert(x1 <= stride);
        assert(y >= 0);
        assert(y < height);
        assert((x0 & 3) == 0);
        assert((x1 & 3) == 0);

        // main blit variables

        int bpos = x0 / 4;
        int endbit = x1 / 4;
        uint* dest = &pixels[y*stride];
        __m128i xmWinding = 0;
        uint* lut = gradient.getLookup.ptr;
        __m128i lutmsk = gradient.lookupLength - 1;
        __m128i lutmsk2 = gradient.lookupLength*2 - 1;
        __m128 lutscale = gradient.lookupLength * numRepeats;

        // XMM constants

        immutable __m128i XMZERO = 0;

        // paint variables
        // xmT0 / xmT1 hold the two gradient space coordinates for four
        // adjacent pixels; xmStep0 / xmStep1 advance them by one block.

        float t0 = (bpos*4-xctr)*xstep0 + (y-yctr)*ystep0;
        __m128 xmT0 = _mm_mul_ps(_mm_set1_ps(xstep0), _mm_setr_ps(0.0f,1.0f,2.0f,3.0f));
        xmT0 = _mm_add_ps(xmT0, _mm_set1_ps(t0));
        __m128 xmStep0 = _mm_set1_ps(xstep0*4);

        float t1 = (bpos*4-xctr)*xstep1 + (y-yctr)*ystep1;
        __m128 xmT1 = _mm_mul_ps(_mm_set1_ps(xstep1), _mm_setr_ps(0.0f,1.0f,2.0f,3.0f));
        xmT1 = _mm_add_ps(xmT1, _mm_set1_ps(t1));
        __m128 xmStep1 = _mm_set1_ps(xstep1*4);

        // main loop

        while (bpos < endbit)
        {
            int nsb = nextSetBit(mask, bpos, endbit);

            // do we have a span of unchanging coverage?

            if (bpos < nsb)
            {
                // Calc coverage of first pixel

                int cover = calcCoverage!wr(xmWinding[3]+delta[bpos*4]);

                // We can skip the span, but the gradient coordinates still
                // have to be advanced past the skipped blocks

                if (cover < 0x100)
                {
                    __m128 xskip = _mm_set1_ps(nsb-bpos);
                    xmT0 = _mm_add_ps(xmT0, _mm_mul_ps(xskip,xmStep0));
                    xmT1 = _mm_add_ps(xmT1, _mm_mul_ps(xskip,xmStep1));
                    bpos = nsb;
                }

                // Or fill span with solid color, gradient colours are
                // written directly with no blending

                else if (gradient.isOpaque && (cover > 0xFF00))
                {
                    uint* ptr = &dest[bpos*4];
                    uint* end = ptr + ((nsb-bpos)*4);

                    while (ptr < end)
                    {
                        __m128 grad = gradOfSorts(xmT0,xmT1);
                        __m128 poly = polyAprox(grad);
                        poly = fixupQuadrant(poly,xmT0,xmT1)*lutscale;
                        __m128i ipos = _mm_cvtps_epi32(poly);

                        xmT0 = xmT0 + xmStep0;
                        xmT1 = xmT1 + xmStep1;

                        ipos = calcRepeatModeIDX!mode(ipos, lutmsk, lutmsk2);

                        ptr[0] = lut[ipos.array[0]];
                        ptr[1] = lut[ipos.array[1]];
                        ptr[2] = lut[ipos.array[2]];
                        ptr[3] = lut[ipos.array[3]];

                        ptr+=4;
                    }

                    bpos = nsb;
                }

                // Or fill span with transparent color, the same coverage
                // value is used for every pixel in the span

                else
                {
                    __m128i xmcover = _mm_set1_epi16 (cast(ushort) cover);

                    uint* ptr = &dest[bpos*4];
                    uint* end = &dest[nsb*4];

                    while (ptr < end)
                    {
                        __m128 grad = gradOfSorts(xmT0,xmT1);

                        // load destination pixels

                        __m128i d0 = _mm_load_si128(cast(__m128i*)ptr);
                        __m128i d1 = _mm_unpackhi_epi8(d0,d0);
                        d0 = _mm_unpacklo_epi8(d0,d0);

                        // evaluate angle

                        __m128 poly = polyAprox(grad);
                        poly = fixupQuadrant(poly,xmT0,xmT1)*lutscale;
                        __m128i ipos = _mm_cvtps_epi32(poly);

                        xmT0 = xmT0 + xmStep0;
                        xmT1 = xmT1 + xmStep1;

                        ipos = calcRepeatModeIDX!mode(ipos, lutmsk, lutmsk2);

                        // load grad colours and alpha

                        __m128i c0 = _mm_loadu_si32 (&lut[ipos.array[0]]);
                        __m128i tmpc0 = _mm_loadu_si32 (&lut[ipos.array[1]]);
                        c0 = _mm_unpacklo_epi32 (c0, tmpc0);
                        c0 = _mm_unpacklo_epi8 (c0, c0);

                        __m128i a0 = _mm_mulhi_epu16(c0,xmcover);

                        __m128i c1 = _mm_loadu_si32 (&lut[ipos.array[2]]);
                        __m128i tmpc1 = _mm_loadu_si32 (&lut[ipos.array[3]]);
                        c1 = _mm_unpacklo_epi32 (c1, tmpc1);
                        c1 = _mm_unpacklo_epi8 (c1, c1);

                        __m128i a1 = _mm_mulhi_epu16(c1,xmcover);

                        // unpack alpha, broadcast 16 bit lane 3 of each
                        // pixel to all four of its channels

                        a0 = _mm_shufflelo_epi16!255(a0);
                        a0 = _mm_shufflehi_epi16!255(a0);
                        a1 = _mm_shufflelo_epi16!255(a1);
                        a1 = _mm_shufflehi_epi16!255(a1);

                        // alpha*source + dest - alpha*dest

                        c0 = _mm_mulhi_epu16 (c0,a0);
                        c1 = _mm_mulhi_epu16 (c1,a1);
                        c0 = _mm_add_epi16 (c0,d0);
                        c1 = _mm_add_epi16 (c1,d1);
                        d0 = _mm_mulhi_epu16 (d0,a0);
                        d1 = _mm_mulhi_epu16 (d1,a1);
                        c0 = _mm_sub_epi16 (c0,d0);
                        c1 = _mm_sub_epi16 (c1,d1);
                        c0 = _mm_srli_epi16 (c0,8);
                        c1 = _mm_srli_epi16 (c1,8);

                        d0 = _mm_packus_epi16 (c0,c1);

                        _mm_store_si128 (cast(__m128i*)ptr,d0);

                        ptr+=4;
                    }

                    bpos = nsb;
                }
            }

            // At this point we need to integrate scandelta

            uint* ptr = &dest[bpos*4];
            uint* end = &dest[endbit*4];
            int* dlptr = &delta[bpos*4];

            while (bpos < endbit)
            {
                __m128 grad = gradOfSorts(xmT0,xmT1);

                // Integrate delta values: running sum across the four lanes
                // plus the winding carried over from the previous block.
                // Lane 3 of the result becomes the new carry and the
                // consumed deltas are cleared.

                __m128i idv = _mm_load_si128(cast(__m128i*)dlptr);
                idv = _mm_add_epi32(idv, _mm_slli_si128!4(idv));
                idv = _mm_add_epi32(idv, _mm_slli_si128!8(idv));
                idv = _mm_add_epi32(idv, xmWinding);
                xmWinding = _mm_shuffle_epi32!255(idv);
                _mm_store_si128(cast(__m128i*)dlptr,XMZERO);

                // eval angle

                __m128 poly = polyAprox(grad);
                poly = fixupQuadrant(poly,xmT0,xmT1)*lutscale;

                // calculate coverage from winding

                __m128i xmcover = calcCoverage32!wr(idv);

                // convert grad pos to integer

                __m128i ipos = _mm_cvtps_epi32(poly);

                // Load destination pixels

                __m128i d0 = _mm_load_si128(cast(__m128i*)ptr);
                __m128i d1 = _mm_unpackhi_epi8(d0,d0);
                d0 = _mm_unpacklo_epi8(d0,d0);

                xmT0 = xmT0 + xmStep0;
                xmT1 = xmT1 + xmStep1;

                ipos = calcRepeatModeIDX!mode(ipos, lutmsk, lutmsk2);

                // load grad colors

                __m128i c0 = _mm_loadu_si32 (&lut[ipos.array[0]]);
                __m128i tmpc0 = _mm_loadu_si32 (&lut[ipos.array[1]]);
                c0 = _mm_unpacklo_epi32 (c0, tmpc0);
                c0 = _mm_unpacklo_epi8 (c0, c0);

                __m128i a0 = _mm_unpacklo_epi32(xmcover,xmcover);
                a0 = _mm_mulhi_epu16(a0, c0);

                __m128i c1 = _mm_loadu_si32 (&lut[ipos.array[2]]);
                __m128i tmpc1 = _mm_loadu_si32 (&lut[ipos.array[3]]);
                c1 = _mm_unpacklo_epi32 (c1, tmpc1);
                c1 = _mm_unpacklo_epi8 (c1, c1);

                __m128i a1 = _mm_unpackhi_epi32(xmcover,xmcover);
                a1 = _mm_mulhi_epu16(a1, c1);

                // unpack alpha

                a0 = _mm_shufflelo_epi16!255(a0);
                a0 = _mm_shufflehi_epi16!255(a0);
                a1 = _mm_shufflelo_epi16!255(a1);
                a1 = _mm_shufflehi_epi16!255(a1);

                // alpha*source + dest - alpha*dest

                c0 = _mm_mulhi_epu16 (c0,a0);
                c1 = _mm_mulhi_epu16 (c1,a1);
                c0 = _mm_add_epi16 (c0,d0);
                c1 = _mm_add_epi16 (c1,d1);
                d0 = _mm_mulhi_epu16 (d0,a0);
                d1 = _mm_mulhi_epu16 (d1,a1);
                c0 = _mm_sub_epi16 (c0, d0);
                c1 = _mm_sub_epi16 (c1, d1);
                c0 = _mm_srli_epi16 (c0,8);
                c1 = _mm_srli_epi16 (c1,8);

                d0 = _mm_packus_epi16 (c0,c1);

                _mm_store_si128 (cast(__m128i*)ptr,d0);

                bpos++;
                ptr+=4;
                dlptr+=4;

                // Next four deltas are all zero, so coverage is constant
                // again; go back to span detection in the outer loop.
                // NOTE(review): this reads delta[endbit*4 .. endbit*4+3]
                // when bpos has reached endbit, so it presumably relies on
                // the delta buffer having at least four ints of padding
                // past x1 - confirm against the rasterizer.

                if (((cast(ulong*)dlptr)[0] | (cast(ulong*)dlptr)[1]) == 0) break;
            }
        }
    }

    // Member variables

    uint* pixels;               // destination buffer, 32 bpp
    int stride;                 // buffer width in pixels
    int height;                 // buffer height in pixels
    float xctr,yctr;            // gradient center, set by setCoords
    float xstep0,ystep0;        // per pixel steps of the first gradient coordinate
    float xstep1,ystep1;        // per pixel steps of the second gradient coordinate
    Gradient gradient;
    WindingRule windingRule;
    RepeatMode repeatMode;
    float numRepeats;           // gradient repeats per 360 degrees
}

/*
  helpers for fast atan2
  these should be inlined by ldc
  split up into 3 separate parts because its faster to spread them out
  in the calling code. Breaks up the instruction dependency somewhat.
*/

private:

immutable __m128 MINSUM = 0.001;
immutable __m128 FQTWO = 0.5;

// Returns (|x| - |y|) / max(|x| + |y|, MINSUM) per lane. The lower bound on
// the sum avoids a division by zero when both inputs are zero.

__m128 gradOfSorts(__m128 x, __m128 y)
{
    __m128 absx = _mm_and_ps(x, cast(__m128) XMABSMASK);
    __m128 absy = _mm_and_ps(y, cast(__m128) XMABSMASK);
    __m128 sum = _mm_add_ps(absx,absy);
    __m128 diff = _mm_sub_ps(absx,absy);
    sum = _mm_max_ps(sum,MINSUM);
    return diff / sum;
}

immutable __m128 PCOEF0 = 0.125f;
immutable __m128 PCOEF1 = 0.154761366f;
immutable __m128 PCOEF3 = 0.0305494905f;

// Evaluates PCOEF0 - PCOEF1*g + PCOEF3*g^3 per lane. For g in [-1,1] (the
// range produced by gradOfSorts) the result runs from roughly 0.25 down to
// roughly 0.

__m128 polyAprox(__m128 g)
{
    __m128 sqr = g*g;
    __m128 p3 = PCOEF3*g;
    __m128 p1 = PCOEF1*g;
    return PCOEF0 - p1 + p3*sqr;
}

// lots of casts here due to mixing of int4 and float4
//
// Flips the sign of pos in lanes where t0 and t1 have differing sign bits
// (assuming XMSIGNMASK, defined elsewhere, selects only the sign bit), then
// adds FQTWO (0.5) in lanes where t0 is negative.

__m128 fixupQuadrant(__m128 pos, __m128 t0, __m128 t1)
{
    pos = cast(__m128) (cast(__m128i) pos ^ ((cast(__m128i) t0 ^ cast(__m128i) t1) & XMSIGNMASK));
    return pos + cast(__m128) (_mm_srai_epi32(cast(__m128i)t0,31) & cast(__m128i) FQTWO);
}