MagickCore 6.9.13
Loading...
Searching...
No Matches
accelerate-kernels-private.h
1/*
2 Copyright 1999 ImageMagick Studio LLC, a non-profit organization
3 dedicated to making software imaging solutions freely available.
4
5 You may not use this file except in compliance with the License. You may
6 obtain a copy of the License at
7
8 https://imagemagick.org/script/license.php
9
10 Unless required by applicable law or agreed to in writing, software
11 distributed under the License is distributed on an "AS IS" BASIS,
12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 See the License for the specific language governing permissions and
14 limitations under the License.
15
16 MagickCore private methods for accelerated functions.
17*/
18
19#ifndef MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
20#define MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
21
22#if defined(__cplusplus) || defined(c_plusplus)
23extern "C" {
24#endif
25
26#if defined(MAGICKCORE_OPENCL_SUPPORT)
27
28/*
29 Define declarations.
30*/
31#define OPENCL_DEFINE(VAR,...) "\n #""define " #VAR " " #__VA_ARGS__ " \n"
32#define OPENCL_ELIF(...) "\n #""elif " #__VA_ARGS__ " \n"
33#define OPENCL_ELSE() "\n #""else " " \n"
34#define OPENCL_ENDIF() "\n #""endif " " \n"
35#define OPENCL_IF(...) "\n #""if " #__VA_ARGS__ " \n"
36#define STRINGIFY(...) #__VA_ARGS__ "\n"
37
38const char* accelerateKernels =
39
40/*
41 Define declarations.
42*/
43 OPENCL_DEFINE(GetPixelAlpha(pixel),(QuantumRange-(pixel).w))
44 OPENCL_DEFINE(SigmaUniform, (attenuate*0.015625f))
45 OPENCL_DEFINE(SigmaGaussian, (attenuate*0.015625f))
46 OPENCL_DEFINE(SigmaImpulse, (attenuate*0.1f))
47 OPENCL_DEFINE(SigmaLaplacian, (attenuate*0.0390625f))
48 OPENCL_DEFINE(SigmaMultiplicativeGaussian, (attenuate*0.5f))
49 OPENCL_DEFINE(SigmaPoisson, (attenuate*12.5f))
50 OPENCL_DEFINE(SigmaRandom, (attenuate))
51 OPENCL_DEFINE(TauGaussian, (attenuate*0.078125f))
52 OPENCL_DEFINE(MagickMax(x, y), (((x) > (y)) ? (x) : (y)))
53 OPENCL_DEFINE(MagickMin(x, y), (((x) < (y)) ? (x) : (y)))
54
55/*
56 Typedef declarations.
57*/
58 STRINGIFY(
59 typedef enum
60 {
61 UndefinedColorspace,
62 RGBColorspace, /* Linear RGB colorspace */
63 GRAYColorspace, /* greyscale (non-linear) image (faked 1 channel) */
64 TransparentColorspace,
65 OHTAColorspace,
66 LabColorspace,
67 XYZColorspace,
68 YCbCrColorspace,
69 YCCColorspace,
70 YIQColorspace,
71 YPbPrColorspace,
72 YUVColorspace,
73 CMYKColorspace, /* negared linear RGB with black separated */
74 sRGBColorspace, /* Default: non-lienar sRGB colorspace */
75 HSBColorspace,
76 HSLColorspace,
77 HWBColorspace,
78 Rec601LumaColorspace,
79 Rec601YCbCrColorspace,
80 Rec709LumaColorspace,
81 Rec709YCbCrColorspace,
82 LogColorspace,
83 CMYColorspace, /* negated linear RGB colorspace */
84 LuvColorspace,
85 HCLColorspace,
86 LCHColorspace, /* alias for LCHuv */
87 LMSColorspace,
88 LCHabColorspace, /* Cylindrical (Polar) Lab */
89 LCHuvColorspace, /* Cylindrical (Polar) Luv */
90 scRGBColorspace,
91 HSIColorspace,
92 HSVColorspace, /* alias for HSB */
93 HCLpColorspace,
94 YDbDrColorspace,
95 LinearGRAYColorspace /* greyscale (linear) image (faked 1 channel) */
96 } ColorspaceType;
97 )
98
99 STRINGIFY(
100 typedef enum
101 {
102 UndefinedCompositeOp,
103 NoCompositeOp,
104 ModulusAddCompositeOp,
105 AtopCompositeOp,
106 BlendCompositeOp,
107 BumpmapCompositeOp,
108 ChangeMaskCompositeOp,
109 ClearCompositeOp,
110 ColorBurnCompositeOp,
111 ColorDodgeCompositeOp,
112 ColorizeCompositeOp,
113 CopyBlackCompositeOp,
114 CopyBlueCompositeOp,
115 CopyCompositeOp,
116 CopyCyanCompositeOp,
117 CopyGreenCompositeOp,
118 CopyMagentaCompositeOp,
119 CopyOpacityCompositeOp,
120 CopyRedCompositeOp,
121 CopyYellowCompositeOp,
122 DarkenCompositeOp,
123 DstAtopCompositeOp,
124 DstCompositeOp,
125 DstInCompositeOp,
126 DstOutCompositeOp,
127 DstOverCompositeOp,
128 DifferenceCompositeOp,
129 DisplaceCompositeOp,
130 DissolveCompositeOp,
131 ExclusionCompositeOp,
132 HardLightCompositeOp,
133 HueCompositeOp,
134 InCompositeOp,
135 LightenCompositeOp,
136 LinearLightCompositeOp,
137 LuminizeCompositeOp,
138 MinusDstCompositeOp,
139 ModulateCompositeOp,
140 MultiplyCompositeOp,
141 OutCompositeOp,
142 OverCompositeOp,
143 OverlayCompositeOp,
144 PlusCompositeOp,
145 ReplaceCompositeOp,
146 SaturateCompositeOp,
147 ScreenCompositeOp,
148 SoftLightCompositeOp,
149 SrcAtopCompositeOp,
150 SrcCompositeOp,
151 SrcInCompositeOp,
152 SrcOutCompositeOp,
153 SrcOverCompositeOp,
154 ModulusSubtractCompositeOp,
155 ThresholdCompositeOp,
156 XorCompositeOp,
157 /* These are new operators, added after the above was last sorted.
158 * The list should be re-sorted only when a new library version is
159 * created.
160 */
161 DivideDstCompositeOp,
162 DistortCompositeOp,
163 BlurCompositeOp,
164 PegtopLightCompositeOp,
165 VividLightCompositeOp,
166 PinLightCompositeOp,
167 LinearDodgeCompositeOp,
168 LinearBurnCompositeOp,
169 MathematicsCompositeOp,
170 DivideSrcCompositeOp,
171 MinusSrcCompositeOp,
172 DarkenIntensityCompositeOp,
173 LightenIntensityCompositeOp
174 } CompositeOperator;
175 )
176
177 STRINGIFY(
178 typedef enum
179 {
180 UndefinedFunction,
181 PolynomialFunction,
182 SinusoidFunction,
183 ArcsinFunction,
184 ArctanFunction
185 } MagickFunction;
186 )
187
188 STRINGIFY(
189 typedef enum
190 {
191 UndefinedNoise,
192 UniformNoise,
193 GaussianNoise,
194 MultiplicativeGaussianNoise,
195 ImpulseNoise,
196 LaplacianNoise,
197 PoissonNoise,
198 RandomNoise
199 } NoiseType;
200 )
201
202 STRINGIFY(
203 typedef enum
204 {
205 UndefinedPixelIntensityMethod = 0,
206 AveragePixelIntensityMethod,
207 BrightnessPixelIntensityMethod,
208 LightnessPixelIntensityMethod,
209 Rec601LumaPixelIntensityMethod,
210 Rec601LuminancePixelIntensityMethod,
211 Rec709LumaPixelIntensityMethod,
212 Rec709LuminancePixelIntensityMethod,
213 RMSPixelIntensityMethod,
214 MSPixelIntensityMethod
215 } PixelIntensityMethod;
216 )
217
218 STRINGIFY(
219 typedef enum {
220 BoxWeightingFunction = 0,
221 TriangleWeightingFunction,
222 CubicBCWeightingFunction,
223 HanningWeightingFunction,
224 HammingWeightingFunction,
225 BlackmanWeightingFunction,
226 GaussianWeightingFunction,
227 QuadraticWeightingFunction,
228 JincWeightingFunction,
229 SincWeightingFunction,
230 SincFastWeightingFunction,
231 KaiserWeightingFunction,
232 WelshWeightingFunction,
233 BohmanWeightingFunction,
234 LagrangeWeightingFunction,
235 CosineWeightingFunction,
236 } ResizeWeightingFunctionType;
237 )
238
239 STRINGIFY(
240 typedef enum
241 {
242 UndefinedChannel,
243 RedChannel = 0x0001,
244 GrayChannel = 0x0001,
245 CyanChannel = 0x0001,
246 GreenChannel = 0x0002,
247 MagentaChannel = 0x0002,
248 BlueChannel = 0x0004,
249 YellowChannel = 0x0004,
250 AlphaChannel = 0x0008,
251 OpacityChannel = 0x0008,
252 MatteChannel = 0x0008, /* deprecated */
253 BlackChannel = 0x0020,
254 IndexChannel = 0x0020,
255 CompositeChannels = 0x002F,
256 AllChannels = 0x7ffffff,
257 /*
258 Special purpose channel types.
259 */
260 TrueAlphaChannel = 0x0040, /* extract actual alpha channel from opacity */
261 RGBChannels = 0x0080, /* set alpha from grayscale mask in RGB */
262 GrayChannels = 0x0080,
263 SyncChannels = 0x0100, /* channels should be modified equally */
264 DefaultChannels = ((AllChannels | SyncChannels) &~ OpacityChannel)
265 } ChannelType;
266 )
267
268/*
269 Helper functions.
270*/
271
272OPENCL_IF((MAGICKCORE_QUANTUM_DEPTH == 8))
273
274 STRINGIFY(
275 static inline CLQuantum ScaleCharToQuantum(const unsigned char value)
276 {
277 return((CLQuantum) value);
278 }
279 )
280
281OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 16))
282
283 STRINGIFY(
284 static inline CLQuantum ScaleCharToQuantum(const unsigned char value)
285 {
286 return((CLQuantum) (257.0f*value));
287 }
288 )
289
290OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 32))
291
292 STRINGIFY(
293 static inline CLQuantum ScaleCharToQuantum(const unsigned char value)
294 {
295 return((CLQuantum) (16843009.0*value));
296 }
297 )
298
299OPENCL_ENDIF()
300
301OPENCL_IF((MAGICKCORE_HDRI_SUPPORT == 1))
302
303 STRINGIFY(
304 static inline CLQuantum ClampToQuantum(const float value)
305 {
306 return (CLQuantum) value;
307 }
308 )
309
310OPENCL_ELSE()
311
312 STRINGIFY(
313 static inline CLQuantum ClampToQuantum(const float value)
314 {
315 return (CLQuantum) (clamp(value, 0.0f, QuantumRange) + 0.5f);
316 }
317 )
318
319OPENCL_ENDIF()
320
321 STRINGIFY(
322 static inline int ClampToCanvas(const int offset, const int range)
323 {
324 return clamp(offset, (int)0, range - 1);
325 }
326 )
327
328 STRINGIFY(
329 static inline int ClampToCanvasWithHalo(const int offset, const int range, const int edge, const int section)
330 {
331 return clamp(offset, section ? (int)(0 - edge) : (int)0, section ? (range - 1) : (range - 1 + edge));
332 }
333 )
334
335 STRINGIFY(
336 static inline uint ScaleQuantumToMap(CLQuantum value)
337 {
338 if (value >= (CLQuantum)MaxMap)
339 return ((uint)MaxMap);
340 else
341 return ((uint)value);
342 }
343 )
344
345 STRINGIFY(
346 static inline float PerceptibleReciprocal(const float x)
347 {
348 float sign = x < (float) 0.0 ? (float)-1.0 : (float) 1.0;
349 return((sign*x) >= MagickEpsilon ? (float) 1.0 / x : sign*((float) 1.0 / MagickEpsilon));
350 }
351 )
352
353 STRINGIFY(
354 static inline float RoundToUnity(const float value)
355 {
356 return clamp(value, 0.0f, 1.0f);
357 }
358 )
359
360 STRINGIFY(
361
362 static inline CLQuantum getBlue(CLPixelType p) { return p.x; }
363 static inline void setBlue(CLPixelType* p, CLQuantum value) { (*p).x = value; }
364 static inline float getBlueF4(float4 p) { return p.x; }
365 static inline void setBlueF4(float4* p, float value) { (*p).x = value; }
366
367 static inline CLQuantum getGreen(CLPixelType p) { return p.y; }
368 static inline void setGreen(CLPixelType* p, CLQuantum value) { (*p).y = value; }
369 static inline float getGreenF4(float4 p) { return p.y; }
370 static inline void setGreenF4(float4* p, float value) { (*p).y = value; }
371
372 static inline CLQuantum getRed(CLPixelType p) { return p.z; }
373 static inline void setRed(CLPixelType* p, CLQuantum value) { (*p).z = value; }
374 static inline float getRedF4(float4 p) { return p.z; }
375 static inline void setRedF4(float4* p, float value) { (*p).z = value; }
376
377 static inline CLQuantum getOpacity(CLPixelType p) { return p.w; }
378 static inline void setOpacity(CLPixelType* p, CLQuantum value) { (*p).w = value; }
379 static inline float getOpacityF4(float4 p) { return p.w; }
380 static inline void setOpacityF4(float4* p, float value) { (*p).w = value; }
381
382 static inline void setGray(CLPixelType* p, CLQuantum value) { (*p).z = value; (*p).y = value; (*p).x = value; }
383
384 static inline float GetPixelIntensity(const int method, const int colorspace, CLPixelType p)
385 {
386 float red = getRed(p);
387 float green = getGreen(p);
388 float blue = getBlue(p);
389
390 float intensity;
391
392 if (colorspace == GRAYColorspace)
393 return red;
394
395 switch (method)
396 {
397 case AveragePixelIntensityMethod:
398 {
399 intensity = (red + green + blue) / 3.0;
400 break;
401 }
402 case BrightnessPixelIntensityMethod:
403 {
404 intensity = MagickMax(MagickMax(red, green), blue);
405 break;
406 }
407 case LightnessPixelIntensityMethod:
408 {
409 intensity = (MagickMin(MagickMin(red, green), blue) +
410 MagickMax(MagickMax(red, green), blue)) / 2.0;
411 break;
412 }
413 case MSPixelIntensityMethod:
414 {
415 intensity = (float)(((float)red*red + green*green + blue*blue) /
416 (3.0*QuantumRange));
417 break;
418 }
419 case Rec601LumaPixelIntensityMethod:
420 {
421 /*
422 if (image->colorspace == RGBColorspace)
423 {
424 red=EncodePixelGamma(red);
425 green=EncodePixelGamma(green);
426 blue=EncodePixelGamma(blue);
427 }
428 */
429 intensity = 0.298839*red + 0.586811*green + 0.114350*blue;
430 break;
431 }
432 case Rec601LuminancePixelIntensityMethod:
433 {
434 /*
435 if (image->colorspace == sRGBColorspace)
436 {
437 red=DecodePixelGamma(red);
438 green=DecodePixelGamma(green);
439 blue=DecodePixelGamma(blue);
440 }
441 */
442 intensity = 0.298839*red + 0.586811*green + 0.114350*blue;
443 break;
444 }
445 case Rec709LumaPixelIntensityMethod:
446 default:
447 {
448 /*
449 if (image->colorspace == RGBColorspace)
450 {
451 red=EncodePixelGamma(red);
452 green=EncodePixelGamma(green);
453 blue=EncodePixelGamma(blue);
454 }
455 */
456 intensity = 0.212656*red + 0.715158*green + 0.072186*blue;
457 break;
458 }
459 case Rec709LuminancePixelIntensityMethod:
460 {
461 /*
462 if (image->colorspace == sRGBColorspace)
463 {
464 red=DecodePixelGamma(red);
465 green=DecodePixelGamma(green);
466 blue=DecodePixelGamma(blue);
467 }
468 */
469 intensity = 0.212656*red + 0.715158*green + 0.072186*blue;
470 break;
471 }
472 case RMSPixelIntensityMethod:
473 {
474 intensity = (float)(sqrt((float)red*red + green*green + blue*blue) /
475 sqrt(3.0));
476 break;
477 }
478 }
479
480 return intensity;
481
482 }
483 )
484
485/*
486%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
487% %
488% %
489% %
490% A d d N o i s e %
491% %
492% %
493% %
494%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
495*/
496
497STRINGIFY(
498
499/*
500Part of MWC64X by David Thomas, dt10@imperial.ac.uk
501This is provided under BSD, full license is with the main package.
502See http://www.doc.ic.ac.uk/~dt10/research
503*/
504
505// Pre: a<M, b<M
506// Post: r=(a+b) mod M
507ulong MWC_AddMod64(ulong a, ulong b, ulong M)
508{
509 ulong v=a+b;
510 //if( (v>=M) || (v<a) )
511 if( (v>=M) || (convert_float(v) < convert_float(a)) ) // workaround for what appears to be an optimizer bug.
512 v=v-M;
513 return v;
514}
515
516// Pre: a<M,b<M
517// Post: r=(a*b) mod M
518// This could be done more efficently, but it is portable, and should
519// be easy to understand. It can be replaced with any of the better
520// modular multiplication algorithms (for example if you know you have
521// double precision available or something).
522ulong MWC_MulMod64(ulong a, ulong b, ulong M)
523{
524 ulong r=0;
525 while(a!=0){
526 if(a&1)
527 r=MWC_AddMod64(r,b,M);
528 b=MWC_AddMod64(b,b,M);
529 a=a>>1;
530 }
531 return r;
532}
533
534
535// Pre: a<M, e>=0
536// Post: r=(a^b) mod M
537// This takes at most ~64^2 modular additions, so probably about 2^15 or so instructions on
538// most architectures
539ulong MWC_PowMod64(ulong a, ulong e, ulong M)
540{
541 ulong sqr=a, acc=1;
542 while(e!=0){
543 if(e&1)
544 acc=MWC_MulMod64(acc,sqr,M);
545 sqr=MWC_MulMod64(sqr,sqr,M);
546 e=e>>1;
547 }
548 return acc;
549}
550
551uint2 MWC_SkipImpl_Mod64(uint2 curr, ulong A, ulong M, ulong distance)
552{
553 ulong m=MWC_PowMod64(A, distance, M);
554 ulong x=curr.x*(ulong)A+curr.y;
555 x=MWC_MulMod64(x, m, M);
556 return (uint2)((uint)(x/A), (uint)(x%A));
557}
558
559uint2 MWC_SeedImpl_Mod64(ulong A, ulong M, uint vecSize, uint vecOffset, ulong streamBase, ulong streamGap)
560{
561 // This is an arbitrary constant for starting LCG jumping from. I didn't
562 // want to start from 1, as then you end up with the two or three first values
563 // being a bit poor in ones - once you've decided that, one constant is as
564 // good as any another. There is no deep mathematical reason for it, I just
565 // generated a random number.
566 enum{ MWC_BASEID = 4077358422479273989UL };
567
568 ulong dist=streamBase + (get_global_id(0)*vecSize+vecOffset)*streamGap;
569 ulong m=MWC_PowMod64(A, dist, M);
570
571 ulong x=MWC_MulMod64(MWC_BASEID, m, M);
572 return (uint2)((uint)(x/A), (uint)(x%A));
573}
574
576typedef struct{ uint x; uint c; uint seed0; ulong seed1; } mwc64x_state_t;
577
578void MWC64X_Step(mwc64x_state_t *s)
579{
580 uint X=s->x, C=s->c;
581
582 uint Xn=s->seed0*X+C;
583 uint carry=(uint)(Xn<C); // The (Xn<C) will be zero or one for scalar
584 uint Cn=mad_hi(s->seed0,X,carry);
585
586 s->x=Xn;
587 s->c=Cn;
588}
589
590void MWC64X_Skip(mwc64x_state_t *s, ulong distance)
591{
592 uint2 tmp=MWC_SkipImpl_Mod64((uint2)(s->x,s->c), s->seed0, s->seed1, distance);
593 s->x=tmp.x;
594 s->c=tmp.y;
595}
596
597void MWC64X_SeedStreams(mwc64x_state_t *s, ulong baseOffset, ulong perStreamOffset)
598{
599 uint2 tmp=MWC_SeedImpl_Mod64(s->seed0, s->seed1, 1, 0, baseOffset, perStreamOffset);
600 s->x=tmp.x;
601 s->c=tmp.y;
602}
603
605uint MWC64X_NextUint(mwc64x_state_t *s)
606{
607 uint res=s->x ^ s->c;
608 MWC64X_Step(s);
609 return res;
610}
611
612//
613// End of MWC64X excerpt
614//
615
616 float mwcReadPseudoRandomValue(mwc64x_state_t* rng) {
617 return (1.0f * MWC64X_NextUint(rng)) / (float)(0xffffffff); // normalized to 1.0
618 }
619
620
621 float mwcGenerateDifferentialNoise(mwc64x_state_t* r, CLQuantum pixel, NoiseType noise_type, float attenuate) {
622
623 float
624 alpha,
625 beta,
626 noise,
627 sigma;
628
629 noise = 0.0f;
630 alpha=mwcReadPseudoRandomValue(r);
631 switch(noise_type) {
632 case UniformNoise:
633 default:
634 {
635 noise=(pixel+QuantumRange*SigmaUniform*(alpha-0.5f));
636 break;
637 }
638 case GaussianNoise:
639 {
640 float
641 gamma,
642 tau;
643
644 if (alpha == 0.0f)
645 alpha=1.0f;
646 beta=mwcReadPseudoRandomValue(r);
647 gamma=sqrt(-2.0f*log(alpha));
648 sigma=gamma*cospi((2.0f*beta));
649 tau=gamma*sinpi((2.0f*beta));
650 noise=(float)(pixel+sqrt((float) pixel)*SigmaGaussian*sigma+
651 QuantumRange*TauGaussian*tau);
652 break;
653 }
654
655
656 case ImpulseNoise:
657 {
658 if (alpha < (SigmaImpulse/2.0f))
659 noise=0.0f;
660 else
661 if (alpha >= (1.0f-(SigmaImpulse/2.0f)))
662 noise=(float)QuantumRange;
663 else
664 noise=(float)pixel;
665 break;
666 }
667 case LaplacianNoise:
668 {
669 if (alpha <= 0.5f)
670 {
671 if (alpha <= MagickEpsilon)
672 noise=(float) (pixel-QuantumRange);
673 else
674 noise=(float) (pixel+QuantumRange*SigmaLaplacian*log(2.0f*alpha)+
675 0.5f);
676 break;
677 }
678 beta=1.0f-alpha;
679 if (beta <= (0.5f*MagickEpsilon))
680 noise=(float) (pixel+QuantumRange);
681 else
682 noise=(float) (pixel-QuantumRange*SigmaLaplacian*log(2.0f*beta)+0.5f);
683 break;
684 }
685 case MultiplicativeGaussianNoise:
686 {
687 sigma=1.0f;
688 if (alpha > MagickEpsilon)
689 sigma=sqrt(-2.0f*log(alpha));
690 beta=mwcReadPseudoRandomValue(r);
691 noise=(float) (pixel+pixel*SigmaMultiplicativeGaussian*sigma*
692 cospi((float) (2.0f*beta))/2.0f);
693 break;
694 }
695 case PoissonNoise:
696 {
697 float
698 poisson;
699 unsigned int i;
700 poisson=exp(-SigmaPoisson*QuantumScale*pixel);
701 for (i=0; alpha > poisson; i++)
702 {
703 beta=mwcReadPseudoRandomValue(r);
704 alpha*=beta;
705 }
706 noise=(float) (QuantumRange*i*PerceptibleReciprocal(SigmaPoisson));
707 break;
708 }
709 case RandomNoise:
710 {
711 noise=(float) (QuantumRange*SigmaRandom*alpha);
712 break;
713 }
714
715 };
716 return noise;
717 }
718
719 __kernel
720 void AddNoise(const __global CLPixelType* inputImage, __global CLPixelType* filteredImage
721 ,const unsigned int inputPixelCount, const unsigned int pixelsPerWorkItem
722 ,const ChannelType channel
723 ,const NoiseType noise_type, const float attenuate
724 ,const unsigned int seed0, const unsigned int seed1
725 ,const unsigned int numRandomNumbersPerPixel) {
726
727 mwc64x_state_t rng;
728 rng.seed0 = seed0;
729 rng.seed1 = seed1;
730
731 uint span = pixelsPerWorkItem * numRandomNumbersPerPixel; // length of RNG substream each workitem will use
732 uint offset = span * get_local_size(0) * get_group_id(0); // offset of this workgroup's RNG substream (in master stream);
733
734 MWC64X_SeedStreams(&rng, offset, span); // Seed the RNG streams
735
736 uint pos = get_local_size(0) * get_group_id(0) * pixelsPerWorkItem + get_local_id(0); // pixel to process
737
738 uint count = pixelsPerWorkItem;
739
740 while (count > 0) {
741 if (pos < inputPixelCount) {
742 CLPixelType p = inputImage[pos];
743
744 if ((channel&RedChannel)!=0) {
745 setRed(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getRed(p),noise_type,attenuate)));
746 }
747
748 if ((channel&GreenChannel)!=0) {
749 setGreen(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getGreen(p),noise_type,attenuate)));
750 }
751
752 if ((channel&BlueChannel)!=0) {
753 setBlue(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getBlue(p),noise_type,attenuate)));
754 }
755
756 if ((channel & OpacityChannel) != 0) {
757 setOpacity(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getOpacity(p),noise_type,attenuate)));
758 }
759
760 filteredImage[pos] = p;
761 //filteredImage[pos] = (CLPixelType)(MWC64X_NextUint(&rng) % 256, MWC64X_NextUint(&rng) % 256, MWC64X_NextUint(&rng) % 256, 255);
762 }
763 pos += get_local_size(0);
764 --count;
765 }
766 }
767 )
768
769/*
770%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
771% %
772% %
773% %
774% B l u r %
775% %
776% %
777% %
778%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
779*/
780
781 STRINGIFY(
782 /*
783 Reduce image noise and reduce detail levels by row
784 im: input pixels filtered_in filtered_im: output pixels
785 filter : convolve kernel width: convolve kernel size
786 channel : define which channel is blured
787 is_RGBA_BGRA : define the input is RGBA or BGRA
788 */
789 __kernel void BlurRow(__global CLPixelType *im, __global float4 *filtered_im,
790 const ChannelType channel, __constant float *filter,
791 const unsigned int width,
792 const unsigned int imageColumns, const unsigned int imageRows,
793 __local CLPixelType *temp)
794 {
795 const int x = get_global_id(0);
796 const int y = get_global_id(1);
797
798 const int columns = imageColumns;
799
800 const unsigned int radius = (width-1)/2;
801 const int wsize = get_local_size(0);
802 const unsigned int loadSize = wsize+width;
803
804 //load chunk only for now
805 //event_t e = async_work_group_copy(temp+radius, im+x+y*columns, wsize, 0);
806 //wait_group_events(1,&e);
807
808 //parallel load and clamp
809 /*
810 int count = 0;
811 for (int i=0; i < loadSize; i=i+wsize)
812 {
813 int currentX = x + wsize*(count++);
814
815 int localId = get_local_id(0);
816
817 if ((localId+i) > loadSize)
818 break;
819
820 temp[localId+i] = im[y*columns+ClampToCanvas(currentX-radius, columns)];
821
822 if (y==0 && get_group_id(0) == 0)
823 {
824 printf("(%d %d) temp %d load %d currentX %d\n", x, y, localId+i, ClampToCanvas(currentX-radius, columns), currentX);
825 }
826 }
827 */
828
829 //group coordinate
830 const int groupX=get_local_size(0)*get_group_id(0);
831 const int groupY=get_local_size(1)*get_group_id(1);
832
833 //parallel load and clamp
834 for (int i=get_local_id(0); i < loadSize; i=i+get_local_size(0))
835 {
836 //int cx = ClampToCanvas(groupX+i, columns);
837 temp[i] = im[y * columns + ClampToCanvas(i+groupX-radius, columns)];
838
839 /*if (0 && y==0 && get_group_id(1) == 0)
840 {
841 printf("(%d %d) temp %d load %d groupX %d\n", x, y, i, ClampToCanvas(groupX+i, columns), groupX);
842 }*/
843 }
844
845 // barrier
846 barrier(CLK_LOCAL_MEM_FENCE);
847
848 // only do the work if this is not a patched item
849 if (get_global_id(0) < columns)
850 {
851 // compute
852 float4 result = (float4) 0;
853
854 int i = 0;
855
856 \n #ifndef UFACTOR \n
857 \n #define UFACTOR 8 \n
858 \n #endif \n
859
860 for ( ; i+UFACTOR < width; )
861 {
862 \n #pragma unroll UFACTOR\n
863 for (int j=0; j < UFACTOR; j++, i++)
864 {
865 result+=filter[i]*convert_float4(temp[i+get_local_id(0)]);
866 }
867 }
868
869 for ( ; i < width; i++)
870 {
871 result+=filter[i]*convert_float4(temp[i+get_local_id(0)]);
872 }
873
874 result.x = ClampToQuantum(result.x);
875 result.y = ClampToQuantum(result.y);
876 result.z = ClampToQuantum(result.z);
877 result.w = ClampToQuantum(result.w);
878
879 // write back to global
880 filtered_im[y*columns+x] = result;
881 }
882 }
883 )
884
885 STRINGIFY(
886 /*
887 Reduce image noise and reduce detail levels by line
888 im: input pixels filtered_in filtered_im: output pixels
889 filter : convolve kernel width: convolve kernel size
890 channel : define which channel is blured\
891 is_RGBA_BGRA : define the input is RGBA or BGRA
892 */
893 __kernel void BlurColumn(const __global float4 *blurRowData, __global CLPixelType *filtered_im,
894 const ChannelType channel, __constant float *filter,
895 const unsigned int width,
896 const unsigned int imageColumns, const unsigned int imageRows,
897 __local float4 *temp)
898 {
899 const int x = get_global_id(0);
900 const int y = get_global_id(1);
901
902 //const int columns = get_global_size(0);
903 //const int rows = get_global_size(1);
904 const int columns = imageColumns;
905 const int rows = imageRows;
906
907 unsigned int radius = (width-1)/2;
908 const int wsize = get_local_size(1);
909 const unsigned int loadSize = wsize+width;
910
911 //group coordinate
912 const int groupX=get_local_size(0)*get_group_id(0);
913 const int groupY=get_local_size(1)*get_group_id(1);
914 //notice that get_local_size(0) is 1, so
915 //groupX=get_group_id(0);
916
917 //parallel load and clamp
918 for (int i = get_local_id(1); i < loadSize; i=i+get_local_size(1))
919 {
920 temp[i] = blurRowData[ClampToCanvas(i+groupY-radius, rows) * columns + groupX];
921 }
922
923 // barrier
924 barrier(CLK_LOCAL_MEM_FENCE);
925
926 // only do the work if this is not a patched item
927 if (get_global_id(1) < rows)
928 {
929 // compute
930 float4 result = (float4) 0;
931
932 int i = 0;
933
934 \n #ifndef UFACTOR \n
935 \n #define UFACTOR 8 \n
936 \n #endif \n
937
938 for ( ; i+UFACTOR < width; )
939 {
940 \n #pragma unroll UFACTOR \n
941 for (int j=0; j < UFACTOR; j++, i++)
942 {
943 result+=filter[i]*temp[i+get_local_id(1)];
944 }
945 }
946
947 for ( ; i < width; i++)
948 {
949 result+=filter[i]*temp[i+get_local_id(1)];
950 }
951
952 result.x = ClampToQuantum(result.x);
953 result.y = ClampToQuantum(result.y);
954 result.z = ClampToQuantum(result.z);
955 result.w = ClampToQuantum(result.w);
956
957 // write back to global
958 filtered_im[y*columns+x] = (CLPixelType) (result.x,result.y,result.z,result.w);
959 }
960
961 }
962 )
963
964/*
965%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
966% %
967% %
968% %
969% C o m p o s i t e %
970% %
971% %
972% %
973%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
974*/
975
976 STRINGIFY(
977 static inline float ColorDodge(const float Sca,
978 const float Sa,const float Dca,const float Da)
979 {
980 /*
981 Oct 2004 SVG specification.
982 */
983 if ((Sca*Da+Dca*Sa) >= Sa*Da)
984 return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa));
985 return(Dca*Sa*Sa/(Sa-Sca)+Sca*(1.0-Da)+Dca*(1.0-Sa));
986
987
988 /*
989 New specification, March 2009 SVG specification. This specification was
990 also wrong of non-overlap cases.
991 */
992 /*
993 if ((fabs(Sca-Sa) < MagickEpsilon) && (fabs(Dca) < MagickEpsilon))
994 return(Sca*(1.0-Da));
995 if (fabs(Sca-Sa) < MagickEpsilon)
996 return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa));
997 return(Sa*MagickMin(Da,Dca*Sa/(Sa-Sca)));
998 */
999
1000 /*
1001 Working from first principles using the original formula:
1002
1003 f(Sc,Dc) = Dc/(1-Sc)
1004
1005 This works correctly! Looks like the 2004 model was right but just
1006 required a extra condition for correct handling.
1007 */
1008
1009 /*
1010 if ((fabs(Sca-Sa) < MagickEpsilon) && (fabs(Dca) < MagickEpsilon))
1011 return(Sca*(1.0-Da)+Dca*(1.0-Sa));
1012 if (fabs(Sca-Sa) < MagickEpsilon)
1013 return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa));
1014 return(Dca*Sa*Sa/(Sa-Sca)+Sca*(1.0-Da)+Dca*(1.0-Sa));
1015 */
1016 }
1017
1018 static inline void CompositeColorDodge(const float4 *p,
1019 const float4 *q,float4 *composite) {
1020
1021 float
1022 Da,
1023 gamma,
1024 Sa;
1025
1026 Sa=1.0f-QuantumScale*getOpacityF4(*p); /* simplify and speed up equations */
1027 Da=1.0f-QuantumScale*getOpacityF4(*q);
1028 gamma=RoundToUnity(Sa+Da-Sa*Da); /* over blend, as per SVG doc */
1029 setOpacityF4(composite, QuantumRange*(1.0-gamma));
1030 gamma=QuantumRange/(fabs(gamma) < MagickEpsilon ? MagickEpsilon : gamma);
1031 setRedF4(composite,gamma*ColorDodge(QuantumScale*getRedF4(*p)*Sa,Sa,QuantumScale*
1032 getRedF4(*q)*Da,Da));
1033 setGreenF4(composite,gamma*ColorDodge(QuantumScale*getGreenF4(*p)*Sa,Sa,QuantumScale*
1034 getGreenF4(*q)*Da,Da));
1035 setBlueF4(composite,gamma*ColorDodge(QuantumScale*getBlueF4(*p)*Sa,Sa,QuantumScale*
1036 getBlueF4(*q)*Da,Da));
1037 }
1038 )
1039
1040 STRINGIFY(
1041 static inline void MagickPixelCompositePlus(const float4 *p,
1042 const float alpha,const float4 *q,
1043 const float beta,float4 *composite)
1044 {
1045 float
1046 gamma;
1047
1048 float
1049 Da,
1050 Sa;
1051 /*
1052 Add two pixels with the given opacities.
1053 */
1054 Sa=1.0-QuantumScale*alpha;
1055 Da=1.0-QuantumScale*beta;
1056 gamma=RoundToUnity(Sa+Da); /* 'Plus' blending -- not 'Over' blending */
1057 setOpacityF4(composite,(float) QuantumRange*(1.0-gamma));
1058 gamma=PerceptibleReciprocal(gamma);
1059 setRedF4(composite,gamma*(Sa*getRedF4(*p)+Da*getRedF4(*q)));
1060 setGreenF4(composite,gamma*(Sa*getGreenF4(*p)+Da*getGreenF4(*q)));
1061 setBlueF4(composite,gamma*(Sa*getBlueF4(*p)+Da*getBlueF4(*q)));
1062 }
1063 )
1064
1065 STRINGIFY(
1066 static inline void MagickPixelCompositeBlend(const float4 *p,
1067 const float alpha,const float4 *q,
1068 const float beta,float4 *composite)
1069 {
1070 MagickPixelCompositePlus(p,(float) (QuantumRange-alpha*
1071 (QuantumRange-getOpacityF4(*p))),q,(float) (QuantumRange-beta*
1072 (QuantumRange-getOpacityF4(*q))),composite);
1073 }
1074 )
1075
1076 STRINGIFY(
1077 __kernel
1078 void Composite(__global CLPixelType *image,
1079 const unsigned int imageWidth,
1080 const unsigned int imageHeight,
1081 const unsigned int imageMatte,
1082 const __global CLPixelType *compositeImage,
1083 const unsigned int compositeWidth,
1084 const unsigned int compositeHeight,
1085 const unsigned int compositeMatte,
1086 const unsigned int compose,
1087 const ChannelType channel,
1088 const float destination_dissolve,
1089 const float source_dissolve) {
1090
1091 uint2 index;
1092 index.x = get_global_id(0);
1093 index.y = get_global_id(1);
1094
1095
1096 if (index.x >= imageWidth
1097 || index.y >= imageHeight) {
1098 return;
1099 }
1100 const CLPixelType inputPixel = image[index.y*imageWidth+index.x];
1101 float4 destination;
1102 setRedF4(&destination,getRed(inputPixel));
1103 setGreenF4(&destination,getGreen(inputPixel));
1104 setBlueF4(&destination,getBlue(inputPixel));
1105
1106
1107 const CLPixelType compositePixel
1108 = compositeImage[index.y*imageWidth+index.x];
1109 float4 source;
1110 setRedF4(&source,getRed(compositePixel));
1111 setGreenF4(&source,getGreen(compositePixel));
1112 setBlueF4(&source,getBlue(compositePixel));
1113
1114 if (imageMatte != 0) {
1115 setOpacityF4(&destination,getOpacity(inputPixel));
1116 }
1117 else {
1118 setOpacityF4(&destination,0.0f);
1119 }
1120
1121 if (compositeMatte != 0) {
1122 setOpacityF4(&source,getOpacity(compositePixel));
1123 }
1124 else {
1125 setOpacityF4(&source,0.0f);
1126 }
1127
1128 float4 composite=destination;
1129
1130 CompositeOperator op = (CompositeOperator)compose;
1131 switch (op) {
1132 case ColorDodgeCompositeOp:
1133 CompositeColorDodge(&source,&destination,&composite);
1134 break;
1135 case BlendCompositeOp:
1136 MagickPixelCompositeBlend(&source,source_dissolve,&destination,
1137 destination_dissolve,&composite);
1138 break;
1139 default:
1140 // unsupported operators
1141 break;
1142 };
1143
1144 CLPixelType outputPixel;
1145 setRed(&outputPixel, ClampToQuantum(getRedF4(composite)));
1146 setGreen(&outputPixel, ClampToQuantum(getGreenF4(composite)));
1147 setBlue(&outputPixel, ClampToQuantum(getBlueF4(composite)));
1148 setOpacity(&outputPixel, ClampToQuantum(getOpacityF4(composite)));
1149 image[index.y*imageWidth+index.x] = outputPixel;
1150 }
1151 )
1152
1153/*
1154%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1155% %
1156% %
1157% %
1158% C o n t r a s t %
1159% %
1160% %
1161% %
1162%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1163*/
1164
1165 STRINGIFY(
1166
1167 static inline float3 ConvertRGBToHSB(CLPixelType pixel) {
1168 float3 HueSaturationBrightness;
1169 HueSaturationBrightness.x = 0.0f; // Hue
1170 HueSaturationBrightness.y = 0.0f; // Saturation
1171 HueSaturationBrightness.z = 0.0f; // Brightness
1172
1173 float r=(float) getRed(pixel);
1174 float g=(float) getGreen(pixel);
1175 float b=(float) getBlue(pixel);
1176
1177 float tmin=MagickMin(MagickMin(r,g),b);
1178 float tmax= MagickMax(MagickMax(r,g),b);
1179
1180 if (tmax!=0.0f) {
1181 float delta=tmax-tmin;
1182 HueSaturationBrightness.y=delta/tmax;
1183 HueSaturationBrightness.z=QuantumScale*tmax;
1184
1185 if (delta != 0.0f) {
1186 HueSaturationBrightness.x = ((r == tmax)?0.0f:((g == tmax)?2.0f:4.0f));
1187 HueSaturationBrightness.x += ((r == tmax)?(g-b):((g == tmax)?(b-r):(r-g)))/delta;
1188 HueSaturationBrightness.x/=6.0f;
1189 HueSaturationBrightness.x += (HueSaturationBrightness.x < 0.0f)?0.0f:1.0f;
1190 }
1191 }
1192 return HueSaturationBrightness;
1193 }
1194
1195 static inline CLPixelType ConvertHSBToRGB(float3 HueSaturationBrightness) {
1196
1197 float hue = HueSaturationBrightness.x;
1198 float brightness = HueSaturationBrightness.z;
1199 float saturation = HueSaturationBrightness.y;
1200
1201 CLPixelType rgb;
1202
1203 if (saturation == 0.0f) {
1204 setRed(&rgb,ClampToQuantum(QuantumRange*brightness));
1205 setGreen(&rgb,getRed(rgb));
1206 setBlue(&rgb,getRed(rgb));
1207 }
1208 else {
1209
1210 float h=6.0f*(hue-floor(hue));
1211 float f=h-floor(h);
1212 float p=brightness*(1.0f-saturation);
1213 float q=brightness*(1.0f-saturation*f);
1214 float t=brightness*(1.0f-(saturation*(1.0f-f)));
1215
1216 float clampedBrightness = ClampToQuantum(QuantumRange*brightness);
1217 float clamped_t = ClampToQuantum(QuantumRange*t);
1218 float clamped_p = ClampToQuantum(QuantumRange*p);
1219 float clamped_q = ClampToQuantum(QuantumRange*q);
1220 int ih = (int)h;
1221 setRed(&rgb, (ih == 1)?clamped_q:
1222 (ih == 2 || ih == 3)?clamped_p:
1223 (ih == 4)?clamped_t:
1224 clampedBrightness);
1225
1226 setGreen(&rgb, (ih == 1 || ih == 2)?clampedBrightness:
1227 (ih == 3)?clamped_q:
1228 (ih == 4 || ih == 5)?clamped_p:
1229 clamped_t);
1230
1231 setBlue(&rgb, (ih == 2)?clamped_t:
1232 (ih == 3 || ih == 4)?clampedBrightness:
1233 (ih == 5)?clamped_q:
1234 clamped_p);
1235 }
1236 return rgb;
1237 }
1238
1239 __kernel void Contrast(__global CLPixelType *im, const unsigned int sharpen)
1240 {
1241
1242 const int sign = sharpen!=0?1:-1;
1243 const int x = get_global_id(0);
1244 const int y = get_global_id(1);
1245 const int columns = get_global_size(0);
1246 const int c = x + y * columns;
1247
1248 CLPixelType pixel = im[c];
1249 float3 HueSaturationBrightness = ConvertRGBToHSB(pixel);
1250 float brightness = HueSaturationBrightness.z;
1251 brightness+=0.5f*sign*(0.5f*(sinpi(brightness-0.5f)+1.0f)-brightness);
1252 brightness = clamp(brightness,0.0f,1.0f);
1253 HueSaturationBrightness.z = brightness;
1254
1255 CLPixelType filteredPixel = ConvertHSBToRGB(HueSaturationBrightness);
1256 filteredPixel.w = pixel.w;
1257 im[c] = filteredPixel;
1258 }
1259 )
1260
1261/*
1262%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1263% %
1264% %
1265% %
1266% C o n t r a s t S t r e t c h %
1267% %
1268% %
1269% %
1270%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1271*/
1272
1273 STRINGIFY(
1274 /*
1275 */
1276 __kernel void Histogram(__global CLPixelType * restrict im,
1277 const ChannelType channel,
1278 const int method,
1279 const int colorspace,
1280 __global uint4 * restrict histogram)
1281 {
1282 const int x = get_global_id(0);
1283 const int y = get_global_id(1);
1284 const int columns = get_global_size(0);
1285 const int c = x + y * columns;
1286 if ((channel & SyncChannels) != 0)
1287 {
1288 float intensity = GetPixelIntensity(method, colorspace,im[c]);
1289 uint pos = ScaleQuantumToMap(ClampToQuantum(intensity));
1290 atomic_inc((__global uint *)(&(histogram[pos]))+2); //red position
1291 }
1292 else
1293 {
1294 // for equalizing, we always need all channels?
1295 // otherwise something more
1296 }
1297 }
1298 )
1299
1300 STRINGIFY(
1301 /*
1302 */
1303 __kernel void ContrastStretch(__global CLPixelType * restrict im,
1304 const ChannelType channel,
1305 __global CLPixelType * restrict stretch_map,
1306 const float4 white, const float4 black)
1307 {
1308 const int x = get_global_id(0);
1309 const int y = get_global_id(1);
1310 const int columns = get_global_size(0);
1311 const int c = x + y * columns;
1312
1313 uint ePos;
1314 CLPixelType oValue, eValue;
1315 CLQuantum red, green, blue, opacity;
1316
1317 //read from global
1318 oValue=im[c];
1319
1320 if ((channel & RedChannel) != 0)
1321 {
1322 if (getRedF4(white) != getRedF4(black))
1323 {
1324 ePos = ScaleQuantumToMap(getRed(oValue));
1325 eValue = stretch_map[ePos];
1326 red = getRed(eValue);
1327 }
1328 }
1329
1330 if ((channel & GreenChannel) != 0)
1331 {
1332 if (getGreenF4(white) != getGreenF4(black))
1333 {
1334 ePos = ScaleQuantumToMap(getGreen(oValue));
1335 eValue = stretch_map[ePos];
1336 green = getGreen(eValue);
1337 }
1338 }
1339
1340 if ((channel & BlueChannel) != 0)
1341 {
1342 if (getBlueF4(white) != getBlueF4(black))
1343 {
1344 ePos = ScaleQuantumToMap(getBlue(oValue));
1345 eValue = stretch_map[ePos];
1346 blue = getBlue(eValue);
1347 }
1348 }
1349
1350 if ((channel & OpacityChannel) != 0)
1351 {
1352 if (getOpacityF4(white) != getOpacityF4(black))
1353 {
1354 ePos = ScaleQuantumToMap(getOpacity(oValue));
1355 eValue = stretch_map[ePos];
1356 opacity = getOpacity(eValue);
1357 }
1358 }
1359
1360 //write back
1361 im[c]=(CLPixelType)(blue, green, red, opacity);
1362
1363 }
1364 )
1365
1366/*
1367%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1368% %
1369% %
1370% %
1371% C o n v o l v e %
1372% %
1373% %
1374% %
1375%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1376*/
1377
1378 STRINGIFY(
1379 __kernel
1380 void ConvolveOptimized(const __global CLPixelType *input, __global CLPixelType *output,
1381 const unsigned int imageWidth, const unsigned int imageHeight,
1382 __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
1383 const uint matte, const ChannelType channel, __local CLPixelType *pixelLocalCache, __local float* filterCache) {
1384
1385 int2 blockID;
1386 blockID.x = get_group_id(0);
1387 blockID.y = get_group_id(1);
1388
1389 // image area processed by this workgroup
1390 int2 imageAreaOrg;
1391 imageAreaOrg.x = blockID.x * get_local_size(0);
1392 imageAreaOrg.y = blockID.y * get_local_size(1);
1393
1394 int2 midFilterDimen;
1395 midFilterDimen.x = (filterWidth-1)/2;
1396 midFilterDimen.y = (filterHeight-1)/2;
1397
1398 int2 cachedAreaOrg = imageAreaOrg - midFilterDimen;
1399
1400 // dimension of the local cache
1401 int2 cachedAreaDimen;
1402 cachedAreaDimen.x = get_local_size(0) + filterWidth - 1;
1403 cachedAreaDimen.y = get_local_size(1) + filterHeight - 1;
1404
1405 // cache the pixels accessed by this workgroup in local memory
1406 int localID = get_local_id(1)*get_local_size(0)+get_local_id(0);
1407 int cachedAreaNumPixels = cachedAreaDimen.x * cachedAreaDimen.y;
1408 int groupSize = get_local_size(0) * get_local_size(1);
1409 for (int i = localID; i < cachedAreaNumPixels; i+=groupSize) {
1410
1411 int2 cachedAreaIndex;
1412 cachedAreaIndex.x = i % cachedAreaDimen.x;
1413 cachedAreaIndex.y = i / cachedAreaDimen.x;
1414
1415 int2 imagePixelIndex;
1416 imagePixelIndex = cachedAreaOrg + cachedAreaIndex;
1417
1418 // only support EdgeVirtualPixelMethod through ClampToCanvas
1419 // TODO: implement other virtual pixel method
1420 imagePixelIndex.x = ClampToCanvas(imagePixelIndex.x, imageWidth);
1421 imagePixelIndex.y = ClampToCanvas(imagePixelIndex.y, imageHeight);
1422
1423 pixelLocalCache[i] = input[imagePixelIndex.y * imageWidth + imagePixelIndex.x];
1424 }
1425
1426 // cache the filter
1427 for (int i = localID; i < filterHeight*filterWidth; i+=groupSize) {
1428 filterCache[i] = filter[i];
1429 }
1430 barrier(CLK_LOCAL_MEM_FENCE);
1431
1432
1433 int2 imageIndex;
1434 imageIndex.x = imageAreaOrg.x + get_local_id(0);
1435 imageIndex.y = imageAreaOrg.y + get_local_id(1);
1436
1437 // if out-of-range, stops here and quit
1438 if (imageIndex.x >= imageWidth
1439 || imageIndex.y >= imageHeight) {
1440 return;
1441 }
1442
1443 int filterIndex = 0;
1444 float4 sum = (float4)0.0f;
1445 float gamma = 0.0f;
1446 if (((channel & OpacityChannel) == 0) || (matte == 0)) {
1447 int cacheIndexY = get_local_id(1);
1448 for (int j = 0; j < filterHeight; j++) {
1449 int cacheIndexX = get_local_id(0);
1450 for (int i = 0; i < filterWidth; i++) {
1451 CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
1452 float f = filterCache[filterIndex];
1453
1454 sum.x += f * p.x;
1455 sum.y += f * p.y;
1456 sum.z += f * p.z;
1457 sum.w += f * p.w;
1458
1459 gamma += f;
1460 filterIndex++;
1461 cacheIndexX++;
1462 }
1463 cacheIndexY++;
1464 }
1465 }
1466 else {
1467 int cacheIndexY = get_local_id(1);
1468 for (int j = 0; j < filterHeight; j++) {
1469 int cacheIndexX = get_local_id(0);
1470 for (int i = 0; i < filterWidth; i++) {
1471
1472 CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
1473 float alpha = QuantumScale*(QuantumRange-p.w);
1474 float f = filterCache[filterIndex];
1475 float g = alpha * f;
1476
1477 sum.x += g*p.x;
1478 sum.y += g*p.y;
1479 sum.z += g*p.z;
1480 sum.w += f*p.w;
1481
1482 gamma += g;
1483 filterIndex++;
1484 cacheIndexX++;
1485 }
1486 cacheIndexY++;
1487 }
1488 gamma = PerceptibleReciprocal(gamma);
1489 sum.xyz = gamma*sum.xyz;
1490 }
1491 CLPixelType outputPixel;
1492 outputPixel.x = ClampToQuantum(sum.x);
1493 outputPixel.y = ClampToQuantum(sum.y);
1494 outputPixel.z = ClampToQuantum(sum.z);
1495 outputPixel.w = ((channel & OpacityChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
1496
1497 output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
1498 }
1499 )
1500
1501 STRINGIFY(
1502 __kernel
1503 void Convolve(const __global CLPixelType *input, __global CLPixelType *output,
1504 const uint imageWidth, const uint imageHeight,
1505 __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
1506 const uint matte, const ChannelType channel) {
1507
1508 int2 imageIndex;
1509 imageIndex.x = get_global_id(0);
1510 imageIndex.y = get_global_id(1);
1511
1512 /*
1513 unsigned int imageWidth = get_global_size(0);
1514 unsigned int imageHeight = get_global_size(1);
1515 */
1516 if (imageIndex.x >= imageWidth
1517 || imageIndex.y >= imageHeight)
1518 return;
1519
1520 int2 midFilterDimen;
1521 midFilterDimen.x = (filterWidth-1)/2;
1522 midFilterDimen.y = (filterHeight-1)/2;
1523
1524 int filterIndex = 0;
1525 float4 sum = (float4)0.0f;
1526 float gamma = 0.0f;
1527 if (((channel & OpacityChannel) == 0) || (matte == 0)) {
1528 for (int j = 0; j < filterHeight; j++) {
1529 int2 inputPixelIndex;
1530 inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
1531 inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
1532 for (int i = 0; i < filterWidth; i++) {
1533 inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
1534 inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
1535
1536 CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
1537 float f = filter[filterIndex];
1538
1539 sum.x += f * p.x;
1540 sum.y += f * p.y;
1541 sum.z += f * p.z;
1542 sum.w += f * p.w;
1543
1544 gamma += f;
1545
1546 filterIndex++;
1547 }
1548 }
1549 }
1550 else {
1551
1552 for (int j = 0; j < filterHeight; j++) {
1553 int2 inputPixelIndex;
1554 inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
1555 inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
1556 for (int i = 0; i < filterWidth; i++) {
1557 inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
1558 inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
1559
1560 CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
1561 float alpha = QuantumScale*(QuantumRange-p.w);
1562 float f = filter[filterIndex];
1563 float g = alpha * f;
1564
1565 sum.x += g*p.x;
1566 sum.y += g*p.y;
1567 sum.z += g*p.z;
1568 sum.w += f*p.w;
1569
1570 gamma += g;
1571
1572
1573 filterIndex++;
1574 }
1575 }
1576 gamma = PerceptibleReciprocal(gamma);
1577 sum.xyz = gamma*sum.xyz;
1578 }
1579
1580 CLPixelType outputPixel;
1581 outputPixel.x = ClampToQuantum(sum.x);
1582 outputPixel.y = ClampToQuantum(sum.y);
1583 outputPixel.z = ClampToQuantum(sum.z);
1584 outputPixel.w = ((channel & OpacityChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
1585
1586 output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
1587 }
1588 )
1589
1590/*
1591%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1592% %
1593% %
1594% %
1595% D e s p e c k l e %
1596% %
1597% %
1598% %
1599%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1600*/
1601
1602 STRINGIFY(
1603
1604 __kernel void HullPass1(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
1605 , const unsigned int imageWidth, const unsigned int imageHeight
1606 , const int2 offset, const int polarity, const int matte) {
1607
1608 int x = get_global_id(0);
1609 int y = get_global_id(1);
1610
1611 CLPixelType v = inputImage[y*imageWidth+x];
1612
1613 int2 neighbor;
1614 neighbor.y = y + offset.y;
1615 neighbor.x = x + offset.x;
1616
1617 int2 clampedNeighbor;
1618 clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1619 clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1620
1621 CLPixelType r = (clampedNeighbor.x == neighbor.x
1622 && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1623 :(CLPixelType)0;
1624
1625 int sv[4];
1626 sv[0] = (int)v.x;
1627 sv[1] = (int)v.y;
1628 sv[2] = (int)v.z;
1629 sv[3] = (int)v.w;
1630
1631 int sr[4];
1632 sr[0] = (int)r.x;
1633 sr[1] = (int)r.y;
1634 sr[2] = (int)r.z;
1635 sr[3] = (int)r.w;
1636
1637 if (polarity > 0) {
1638 \n #pragma unroll 4\n
1639 for (unsigned int i = 0; i < 4; i++) {
1640 sv[i] = (sr[i] >= (sv[i]+ScaleCharToQuantum(2)))?(sv[i]+ScaleCharToQuantum(1)):sv[i];
1641 }
1642 }
1643 else {
1644 \n #pragma unroll 4\n
1645 for (unsigned int i = 0; i < 4; i++) {
1646 sv[i] = (sr[i] <= (sv[i]-ScaleCharToQuantum(2)))?(sv[i]-ScaleCharToQuantum(1)):sv[i];
1647 }
1648
1649 }
1650
1651 v.x = (CLQuantum)sv[0];
1652 v.y = (CLQuantum)sv[1];
1653 v.z = (CLQuantum)sv[2];
1654
1655 if (matte!=0)
1656 v.w = (CLQuantum)sv[3];
1657
1658 outputImage[y*imageWidth+x] = v;
1659
1660 }
1661
1662
1663 )
1664
1665
1666
1667 STRINGIFY(
1668
1669 __kernel void HullPass2(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
1670 , const unsigned int imageWidth, const unsigned int imageHeight
1671 , const int2 offset, const int polarity, const int matte) {
1672
1673 int x = get_global_id(0);
1674 int y = get_global_id(1);
1675
1676 CLPixelType v = inputImage[y*imageWidth+x];
1677
1678 int2 neighbor, clampedNeighbor;
1679
1680 neighbor.y = y + offset.y;
1681 neighbor.x = x + offset.x;
1682 clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1683 clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1684
1685 CLPixelType r = (clampedNeighbor.x == neighbor.x
1686 && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1687 :(CLPixelType)0;
1688
1689
1690 neighbor.y = y - offset.y;
1691 neighbor.x = x - offset.x;
1692 clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1693 clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1694
1695 CLPixelType s = (clampedNeighbor.x == neighbor.x
1696 && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1697 :(CLPixelType)0;
1698
1699
1700 int sv[4];
1701 sv[0] = (int)v.x;
1702 sv[1] = (int)v.y;
1703 sv[2] = (int)v.z;
1704 sv[3] = (int)v.w;
1705
1706 int sr[4];
1707 sr[0] = (int)r.x;
1708 sr[1] = (int)r.y;
1709 sr[2] = (int)r.z;
1710 sr[3] = (int)r.w;
1711
1712 int ss[4];
1713 ss[0] = (int)s.x;
1714 ss[1] = (int)s.y;
1715 ss[2] = (int)s.z;
1716 ss[3] = (int)s.w;
1717
1718 if (polarity > 0) {
1719 \n #pragma unroll 4\n
1720 for (unsigned int i = 0; i < 4; i++) {
1721 //sv[i] = (ss[i] >= (sv[i]+ScaleCharToQuantum(2)) && sr[i] > sv[i] ) ? (sv[i]+ScaleCharToQuantum(1)):sv[i];
1722 //
1723 //sv[i] =(!( (int)(ss[i] >= (sv[i]+ScaleCharToQuantum(2))) && (int) (sr[i] > sv[i] ) )) ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1724 //sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) || (int) ( sr[i] <= sv[i] ) )) ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1725 sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) + (int) ( sr[i] <= sv[i] ) ) !=0) ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1726 }
1727 }
1728 else {
1729 \n #pragma unroll 4\n
1730 for (unsigned int i = 0; i < 4; i++) {
1731 //sv[i] = (ss[i] <= (sv[i]-ScaleCharToQuantum(2)) && sr[i] < sv[i] ) ? (sv[i]-ScaleCharToQuantum(1)):sv[i];
1732 //
1733 //sv[i] = ( (int)(ss[i] <= (sv[i]-ScaleCharToQuantum(2)) ) + (int)( sr[i] < sv[i] ) ==0) ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
1734 sv[i] = (( (int)(ss[i] > (sv[i]-ScaleCharToQuantum(2))) + (int)( sr[i] >= sv[i] )) !=0) ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
1735 }
1736 }
1737
1738 v.x = (CLQuantum)sv[0];
1739 v.y = (CLQuantum)sv[1];
1740 v.z = (CLQuantum)sv[2];
1741
1742 if (matte!=0)
1743 v.w = (CLQuantum)sv[3];
1744
1745 outputImage[y*imageWidth+x] = v;
1746
1747 }
1748
1749
1750 )
1751
1752/*
1753%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1754% %
1755% %
1756% %
1757% E q u a l i z e %
1758% %
1759% %
1760% %
1761%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1762*/
1763
1764 STRINGIFY(
1765 /*
1766 */
1767 __kernel void Equalize(__global CLPixelType * restrict im,
1768 const ChannelType channel,
1769 __global CLPixelType * restrict equalize_map,
1770 const float4 white, const float4 black)
1771 {
1772 const int x = get_global_id(0);
1773 const int y = get_global_id(1);
1774 const int columns = get_global_size(0);
1775 const int c = x + y * columns;
1776
1777 uint ePos;
1778 CLPixelType oValue, eValue;
1779 CLQuantum red, green, blue, opacity;
1780
1781 //read from global
1782 oValue=im[c];
1783
1784 if ((channel & SyncChannels) != 0)
1785 {
1786 if (getRedF4(white) != getRedF4(black))
1787 {
1788 ePos = ScaleQuantumToMap(getRed(oValue));
1789 eValue = equalize_map[ePos];
1790 red = getRed(eValue);
1791 ePos = ScaleQuantumToMap(getGreen(oValue));
1792 eValue = equalize_map[ePos];
1793 green = getRed(eValue);
1794 ePos = ScaleQuantumToMap(getBlue(oValue));
1795 eValue = equalize_map[ePos];
1796 blue = getRed(eValue);
1797 ePos = ScaleQuantumToMap(getOpacity(oValue));
1798 eValue = equalize_map[ePos];
1799 opacity = getRed(eValue);
1800
1801 //write back
1802 im[c]=(CLPixelType)(blue, green, red, opacity);
1803 }
1804
1805 }
1806
1807 // for equalizing, we always need all channels?
1808 // otherwise something more
1809
1810 }
1811 )
1812
1813/*
1814%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1815% %
1816% %
1817% %
1818% F u n c t i o n %
1819% %
1820% %
1821% %
1822%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1823*/
1824
1825 STRINGIFY(
1826
1827 /*
1828 apply FunctionImageChannel(braightness-contrast)
1829 */
1830 CLPixelType ApplyFunction(CLPixelType pixel,const MagickFunction function,
1831 const unsigned int number_parameters,
1832 __constant float *parameters)
1833 {
1834 float4 result = (float4) 0.0f;
1835 switch (function)
1836 {
1837 case PolynomialFunction:
1838 {
1839 for (unsigned int i=0; i < number_parameters; i++)
1840 result = result*(float4)QuantumScale*convert_float4(pixel) + parameters[i];
1841 result *= (float4)QuantumRange;
1842 break;
1843 }
1844 case SinusoidFunction:
1845 {
1846 float freq,phase,ampl,bias;
1847 freq = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1848 phase = ( number_parameters >= 2 ) ? parameters[1] : 0.0f;
1849 ampl = ( number_parameters >= 3 ) ? parameters[2] : 0.5f;
1850 bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1851 result.x = QuantumRange*(ampl*sin(2.0f*MagickPI*
1852 (freq*QuantumScale*(float)pixel.x + phase/360.0f)) + bias);
1853 result.y = QuantumRange*(ampl*sin(2.0f*MagickPI*
1854 (freq*QuantumScale*(float)pixel.y + phase/360.0f)) + bias);
1855 result.z = QuantumRange*(ampl*sin(2.0f*MagickPI*
1856 (freq*QuantumScale*(float)pixel.z + phase/360.0f)) + bias);
1857 result.w = QuantumRange*(ampl*sin(2.0f*MagickPI*
1858 (freq*QuantumScale*(float)pixel.w + phase/360.0f)) + bias);
1859 break;
1860 }
1861 case ArcsinFunction:
1862 {
1863 float width,range,center,bias;
1864 width = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1865 center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
1866 range = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
1867 bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1868
1869 result.x = 2.0f/width*(QuantumScale*(float)pixel.x - center);
1870 result.x = range/MagickPI*asin(result.x)+bias;
1871 result.x = ( result.x <= -1.0f ) ? bias - range/2.0f : result.x;
1872 result.x = ( result.x >= 1.0f ) ? bias + range/2.0f : result.x;
1873
1874 result.y = 2.0f/width*(QuantumScale*(float)pixel.y - center);
1875 result.y = range/MagickPI*asin(result.y)+bias;
1876 result.y = ( result.y <= -1.0f ) ? bias - range/2.0f : result.y;
1877 result.y = ( result.y >= 1.0f ) ? bias + range/2.0f : result.y;
1878
1879 result.z = 2.0f/width*(QuantumScale*(float)pixel.z - center);
1880 result.z = range/MagickPI*asin(result.z)+bias;
1881 result.z = ( result.z <= -1.0f ) ? bias - range/2.0f : result.x;
1882 result.z = ( result.z >= 1.0f ) ? bias + range/2.0f : result.x;
1883
1884
1885 result.w = 2.0f/width*(QuantumScale*(float)pixel.w - center);
1886 result.w = range/MagickPI*asin(result.w)+bias;
1887 result.w = ( result.w <= -1.0f ) ? bias - range/2.0f : result.w;
1888 result.w = ( result.w >= 1.0f ) ? bias + range/2.0f : result.w;
1889
1890 result *= (float4)QuantumRange;
1891 break;
1892 }
1893 case ArctanFunction:
1894 {
1895 float slope,range,center,bias;
1896 slope = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1897 center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
1898 range = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
1899 bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1900 result = (float4)MagickPI*(float4)slope*((float4)QuantumScale*convert_float4(pixel)-(float4)center);
1901 result = (float4)QuantumRange*((float4)range/(float4)MagickPI*atan(result) + (float4)bias);
1902 break;
1903 }
1904 case UndefinedFunction:
1905 break;
1906 }
1907 return (CLPixelType) (ClampToQuantum(result.x), ClampToQuantum(result.y),
1908 ClampToQuantum(result.z), ClampToQuantum(result.w));
1909 }
1910 )
1911
1912 STRINGIFY(
1913 /*
1914 Improve brightness / contrast of the image
1915 channel : define which channel is improved
1916 function : the function called to enchance the brightness contrast
1917 number_parameters : numbers of parameters
1918 parameters : the parameter
1919 */
1920 __kernel void ComputeFunction(__global CLPixelType *im,
1921 const ChannelType channel, const MagickFunction function,
1922 const unsigned int number_parameters, __constant float *parameters)
1923 {
1924 const int x = get_global_id(0);
1925 const int y = get_global_id(1);
1926 const int columns = get_global_size(0);
1927 const int c = x + y * columns;
1928 im[c] = ApplyFunction(im[c], function, number_parameters, parameters);
1929 }
1930 )
1931
1932/*
1933%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1934% %
1935% %
1936% %
1937% G r a y s c a l e %
1938% %
1939% %
1940% %
1941%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1942*/
1943
1944 STRINGIFY(
1945 __kernel void Grayscale(__global CLPixelType *im,
1946 const int method, const int colorspace)
1947 {
1948
1949 const int x = get_global_id(0);
1950 const int y = get_global_id(1);
1951 const int columns = get_global_size(0);
1952 const int c = x + y * columns;
1953
1954 CLPixelType pixel = im[c];
1955
1956 float
1957 blue,
1958 green,
1959 intensity,
1960 red;
1961
1962 red=(float)getRed(pixel);
1963 green=(float)getGreen(pixel);
1964 blue=(float)getBlue(pixel);
1965
1966 intensity=0.0;
1967
1968 CLPixelType filteredPixel;
1969
1970 switch (method)
1971 {
1972 case AveragePixelIntensityMethod:
1973 {
1974 intensity=(red+green+blue)/3.0;
1975 break;
1976 }
1977 case BrightnessPixelIntensityMethod:
1978 {
1979 intensity=MagickMax(MagickMax(red,green),blue);
1980 break;
1981 }
1982 case LightnessPixelIntensityMethod:
1983 {
1984 intensity=(MagickMin(MagickMin(red,green),blue)+
1985 MagickMax(MagickMax(red,green),blue))/2.0;
1986 break;
1987 }
1988 case MSPixelIntensityMethod:
1989 {
1990 intensity=(float) (((float) red*red+green*green+
1991 blue*blue)/(3.0*QuantumRange));
1992 break;
1993 }
1994 case Rec601LumaPixelIntensityMethod:
1995 {
1996 /*
1997 if (colorspace == RGBColorspace)
1998 {
1999 red=EncodePixelGamma(red);
2000 green=EncodePixelGamma(green);
2001 blue=EncodePixelGamma(blue);
2002 }
2003 */
2004 intensity=0.298839*red+0.586811*green+0.114350*blue;
2005 break;
2006 }
2007 case Rec601LuminancePixelIntensityMethod:
2008 {
2009 /*
2010 if (image->colorspace == sRGBColorspace)
2011 {
2012 red=DecodePixelGamma(red);
2013 green=DecodePixelGamma(green);
2014 blue=DecodePixelGamma(blue);
2015 }
2016 */
2017 intensity=0.298839*red+0.586811*green+0.114350*blue;
2018 break;
2019 }
2020 case Rec709LumaPixelIntensityMethod:
2021 default:
2022 {
2023 /*
2024 if (image->colorspace == RGBColorspace)
2025 {
2026 red=EncodePixelGamma(red);
2027 green=EncodePixelGamma(green);
2028 blue=EncodePixelGamma(blue);
2029 }
2030 */
2031 intensity=0.212656*red+0.715158*green+0.072186*blue;
2032 break;
2033 }
2034 case Rec709LuminancePixelIntensityMethod:
2035 {
2036 /*
2037 if (image->colorspace == sRGBColorspace)
2038 {
2039 red=DecodePixelGamma(red);
2040 green=DecodePixelGamma(green);
2041 blue=DecodePixelGamma(blue);
2042 }
2043 */
2044 intensity=0.212656*red+0.715158*green+0.072186*blue;
2045 break;
2046 }
2047 case RMSPixelIntensityMethod:
2048 {
2049 intensity=(float) (sqrt((float) red*red+green*green+
2050 blue*blue)/sqrt(3.0));
2051 break;
2052 }
2053
2054 }
2055
2056 setGray(&filteredPixel, ClampToQuantum(intensity));
2057
2058 filteredPixel.w = pixel.w;
2059
2060 im[c] = filteredPixel;
2061 }
2062 )
2063
2064/*
2065%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2066% %
2067% %
2068% %
2069% L o c a l C o n t r a s t %
2070% %
2071% %
2072% %
2073%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2074*/
2075
2076 STRINGIFY(
2077 static inline int mirrorBottom(int value)
2078 {
2079 return (value < 0) ? - (value) : value;
2080 }
2081 static inline int mirrorTop(int value, int width)
2082 {
2083 return (value >= width) ? (2 * width - value - 1) : value;
2084 }
2085
2086 __kernel void LocalContrastBlurRow(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *tmpImage,
2087 const int radius,
2088 const int imageWidth,
2089 const int imageHeight)
2090 {
2091 const float4 RGB = ((float4)(0.2126f, 0.7152f, 0.0722f, 0.0f));
2092
2093 int x = get_local_id(0);
2094 int y = get_global_id(1);
2095
2096 if ((x >= imageWidth) || (y >= imageHeight))
2097 return;
2098
2099 global CLPixelType *src = srcImage + y * imageWidth;
2100
2101 for (int i = x; i < imageWidth; i += get_local_size(0)) {
2102 float sum = 0.0f;
2103 float weight = 1.0f;
2104
2105 int j = i - radius;
2106 while ((j + 7) < i) {
2107 for (int k = 0; k < 8; ++k) // Unroll 8x
2108 sum += (weight + k) * dot(RGB, convert_float4(src[mirrorBottom(j+k)]));
2109 weight += 8.0f;
2110 j+=8;
2111 }
2112 while (j < i) {
2113 sum += weight * dot(RGB, convert_float4(src[mirrorBottom(j)]));
2114 weight += 1.0f;
2115 ++j;
2116 }
2117
2118 while ((j + 7) < radius + i) {
2119 for (int k = 0; k < 8; ++k) // Unroll 8x
2120 sum += (weight - k) * dot(RGB, convert_float4(src[mirrorTop(j + k, imageWidth)]));
2121 weight -= 8.0f;
2122 j+=8;
2123 }
2124 while (j < radius + i) {
2125 sum += weight * dot(RGB, convert_float4(src[mirrorTop(j, imageWidth)]));
2126 weight -= 1.0f;
2127 ++j;
2128 }
2129
2130 tmpImage[i + y * imageWidth] = sum / ((radius + 1) * (radius + 1));
2131 }
2132 }
2133 )
2134
2135 STRINGIFY(
2136 __kernel void LocalContrastBlurApplyColumn(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *blurImage,
2137 const int radius,
2138 const float strength,
2139 const int imageWidth,
2140 const int imageHeight)
2141 {
2142 const float4 RGB = (float4)(0.2126f, 0.7152f, 0.0722f, 0.0f);
2143
2144 int x = get_global_id(0);
2145 int y = get_global_id(1);
2146
2147 if ((x >= imageWidth) || (y >= imageHeight))
2148 return;
2149
2150 global float *src = blurImage + x;
2151
2152 float sum = 0.0f;
2153 float weight = 1.0f;
2154
2155 int j = y - radius;
2156 while ((j + 7) < y) {
2157 for (int k = 0; k < 8; ++k) // Unroll 8x
2158 sum += (weight + k) * src[mirrorBottom(j+k) * imageWidth];
2159 weight += 8.0f;
2160 j+=8;
2161 }
2162 while (j < y) {
2163 sum += weight * src[mirrorBottom(j) * imageWidth];
2164 weight += 1.0f;
2165 ++j;
2166 }
2167
2168 while ((j + 7) < radius + y) {
2169 for (int k = 0; k < 8; ++k) // Unroll 8x
2170 sum += (weight - k) * src[mirrorTop(j + k, imageHeight) * imageWidth];
2171 weight -= 8.0f;
2172 j+=8;
2173 }
2174 while (j < radius + y) {
2175 sum += weight * src[mirrorTop(j, imageHeight) * imageWidth];
2176 weight -= 1.0f;
2177 ++j;
2178 }
2179
2180 CLPixelType pixel = srcImage[x + y * imageWidth];
2181 float srcVal = dot(RGB, convert_float4(pixel));
2182 float mult = (srcVal - (sum / ((radius + 1) * (radius + 1)))) * (strength / 100.0f);
2183 mult = (srcVal + mult) / srcVal;
2184
2185 pixel.x = ClampToQuantum(pixel.x * mult);
2186 pixel.y = ClampToQuantum(pixel.y * mult);
2187 pixel.z = ClampToQuantum(pixel.z * mult);
2188
2189 dstImage[x + y * imageWidth] = pixel;
2190 }
2191 )
2192
2193/*
2194%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2195% %
2196% %
2197% %
2198% M o d u l a t e %
2199% %
2200% %
2201% %
2202%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2203*/
2204
2205 STRINGIFY(
2206
2207 static inline void ConvertRGBToHSL(const CLQuantum red,const CLQuantum green, const CLQuantum blue,
2208 float *hue, float *saturation, float *lightness)
2209 {
2210 float
2211 c,
2212 tmax,
2213 tmin;
2214
2215 /*
2216 Convert RGB to HSL colorspace.
2217 */
2218 tmax=MagickMax(QuantumScale*red,MagickMax(QuantumScale*green, QuantumScale*blue));
2219 tmin=MagickMin(QuantumScale*red,MagickMin(QuantumScale*green, QuantumScale*blue));
2220
2221 c=tmax-tmin;
2222
2223 *lightness=(tmax+tmin)/2.0;
2224 if (c <= 0.0)
2225 {
2226 *hue=0.0;
2227 *saturation=0.0;
2228 return;
2229 }
2230
2231 if (tmax == (QuantumScale*red))
2232 {
2233 *hue=(QuantumScale*green-QuantumScale*blue)/c;
2234 if ((QuantumScale*green) < (QuantumScale*blue))
2235 *hue+=6.0;
2236 }
2237 else
2238 if (tmax == (QuantumScale*green))
2239 *hue=2.0+(QuantumScale*blue-QuantumScale*red)/c;
2240 else
2241 *hue=4.0+(QuantumScale*red-QuantumScale*green)/c;
2242
2243 *hue*=60.0/360.0;
2244 if (*lightness <= 0.5)
2245 *saturation=c/(2.0*(*lightness));
2246 else
2247 *saturation=c/(2.0-2.0*(*lightness));
2248 }
2249
2250 static inline void ConvertHSLToRGB(const float hue,const float saturation, const float lightness,
2251 CLQuantum *red,CLQuantum *green,CLQuantum *blue)
2252 {
2253 float
2254 b,
2255 c,
2256 g,
2257 h,
2258 tmin,
2259 r,
2260 x;
2261
2262 /*
2263 Convert HSL to RGB colorspace.
2264 */
2265 h=hue*360.0;
2266 if (lightness <= 0.5)
2267 c=2.0*lightness*saturation;
2268 else
2269 c=(2.0-2.0*lightness)*saturation;
2270 tmin=lightness-0.5*c;
2271 h-=360.0*floor(h/360.0);
2272 h/=60.0;
2273 x=c*(1.0-fabs(h-2.0*floor(h/2.0)-1.0));
2274 switch ((int) floor(h) % 6)
2275 {
2276 case 0:
2277 default:
2278 {
2279 r=tmin+c;
2280 g=tmin+x;
2281 b=tmin;
2282 break;
2283 }
2284 case 1:
2285 {
2286 r=tmin+x;
2287 g=tmin+c;
2288 b=tmin;
2289 break;
2290 }
2291 case 2:
2292 {
2293 r=tmin;
2294 g=tmin+c;
2295 b=tmin+x;
2296 break;
2297 }
2298 case 3:
2299 {
2300 r=tmin;
2301 g=tmin+x;
2302 b=tmin+c;
2303 break;
2304 }
2305 case 4:
2306 {
2307 r=tmin+x;
2308 g=tmin;
2309 b=tmin+c;
2310 break;
2311 }
2312 case 5:
2313 {
2314 r=tmin+c;
2315 g=tmin;
2316 b=tmin+x;
2317 break;
2318 }
2319 }
2320 *red=ClampToQuantum(QuantumRange*r);
2321 *green=ClampToQuantum(QuantumRange*g);
2322 *blue=ClampToQuantum(QuantumRange*b);
2323 }
2324
2325 static inline void ModulateHSL(const float percent_hue, const float percent_saturation,const float percent_lightness,
2326 CLQuantum *red,CLQuantum *green,CLQuantum *blue)
2327 {
2328 float
2329 hue,
2330 lightness,
2331 saturation;
2332
2333 /*
2334 Increase or decrease color lightness, saturation, or hue.
2335 */
2336 ConvertRGBToHSL(*red,*green,*blue,&hue,&saturation,&lightness);
2337 hue+=0.5*(0.01*percent_hue-1.0);
2338 while (hue < 0.0)
2339 hue+=1.0;
2340 while (hue >= 1.0)
2341 hue-=1.0;
2342 saturation*=0.01*percent_saturation;
2343 lightness*=0.01*percent_lightness;
2344 ConvertHSLToRGB(hue,saturation,lightness,red,green,blue);
2345 }
2346
2347 __kernel void Modulate(__global CLPixelType *im,
2348 const float percent_brightness,
2349 const float percent_hue,
2350 const float percent_saturation,
2351 const int colorspace)
2352 {
2353
2354 const int x = get_global_id(0);
2355 const int y = get_global_id(1);
2356 const int columns = get_global_size(0);
2357 const int c = x + y * columns;
2358
2359 CLPixelType pixel = im[c];
2360
2361 CLQuantum
2362 blue,
2363 green,
2364 red;
2365
2366 red=getRed(pixel);
2367 green=getGreen(pixel);
2368 blue=getBlue(pixel);
2369
2370 switch (colorspace)
2371 {
2372 case HSLColorspace:
2373 default:
2374 {
2375 ModulateHSL(percent_hue, percent_saturation, percent_brightness,
2376 &red, &green, &blue);
2377 }
2378
2379 }
2380
2381 CLPixelType filteredPixel;
2382
2383 setRed(&filteredPixel, red);
2384 setGreen(&filteredPixel, green);
2385 setBlue(&filteredPixel, blue);
2386 filteredPixel.w = pixel.w;
2387
2388 im[c] = filteredPixel;
2389 }
2390 )
2391
2392/*
2393%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2394% %
2395% %
2396% %
2397% M o t i o n B l u r %
2398% %
2399% %
2400% %
2401%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2402*/
2403
2404 STRINGIFY(
2405 __kernel
2406 void MotionBlur(const __global CLPixelType *input, __global CLPixelType *output,
2407 const unsigned int imageWidth, const unsigned int imageHeight,
2408 const __global float *filter, const unsigned int width, const __global int2* offset,
2409 const float4 bias,
2410 const ChannelType channel, const unsigned int matte) {
2411
2412 int2 currentPixel;
2413 currentPixel.x = get_global_id(0);
2414 currentPixel.y = get_global_id(1);
2415
2416 if (currentPixel.x >= imageWidth
2417 || currentPixel.y >= imageHeight)
2418 return;
2419
2420 float4 pixel;
2421 pixel.x = (float)bias.x;
2422 pixel.y = (float)bias.y;
2423 pixel.z = (float)bias.z;
2424 pixel.w = (float)bias.w;
2425
2426 if (((channel & OpacityChannel) == 0) || (matte == 0)) {
2427
2428 for (int i = 0; i < width; i++) {
2429 // only support EdgeVirtualPixelMethod through ClampToCanvas
2430 // TODO: implement other virtual pixel method
2431 int2 samplePixel = currentPixel + offset[i];
2432 samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
2433 samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
2434 CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
2435
2436 pixel.x += (filter[i] * (float)samplePixelValue.x);
2437 pixel.y += (filter[i] * (float)samplePixelValue.y);
2438 pixel.z += (filter[i] * (float)samplePixelValue.z);
2439 pixel.w += (filter[i] * (float)samplePixelValue.w);
2440 }
2441
2442 CLPixelType outputPixel;
2443 outputPixel.x = ClampToQuantum(pixel.x);
2444 outputPixel.y = ClampToQuantum(pixel.y);
2445 outputPixel.z = ClampToQuantum(pixel.z);
2446 outputPixel.w = ClampToQuantum(pixel.w);
2447 output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
2448 }
2449 else {
2450
2451 float gamma = 0.0f;
2452 for (int i = 0; i < width; i++) {
2453 // only support EdgeVirtualPixelMethod through ClampToCanvas
2454 // TODO: implement other virtual pixel method
2455 int2 samplePixel = currentPixel + offset[i];
2456 samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
2457 samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
2458
2459 CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
2460
2461 float alpha = QuantumScale*(QuantumRange-samplePixelValue.w);
2462 float k = filter[i];
2463 pixel.x = pixel.x + k * alpha * samplePixelValue.x;
2464 pixel.y = pixel.y + k * alpha * samplePixelValue.y;
2465 pixel.z = pixel.z + k * alpha * samplePixelValue.z;
2466
2467 pixel.w += k * alpha * samplePixelValue.w;
2468
2469 gamma+=k*alpha;
2470 }
2471 gamma = PerceptibleReciprocal(gamma);
2472 pixel.xyz = gamma*pixel.xyz;
2473
2474 CLPixelType outputPixel;
2475 outputPixel.x = ClampToQuantum(pixel.x);
2476 outputPixel.y = ClampToQuantum(pixel.y);
2477 outputPixel.z = ClampToQuantum(pixel.z);
2478 outputPixel.w = ClampToQuantum(pixel.w);
2479 output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
2480 }
2481 }
2482 )
2483
2484/*
2485%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2486% %
2487% %
2488% %
2489% R a d i a l B l u r %
2490% %
2491% %
2492% %
2493%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2494*/
2495
2496 STRINGIFY(
2497 __kernel void RadialBlur(const __global CLPixelType *im, __global CLPixelType *filtered_im,
2498 const float4 bias,
2499 const unsigned int channel, const unsigned int matte,
2500 const float2 blurCenter,
2501 __constant float *cos_theta, __constant float *sin_theta,
2502 const unsigned int cossin_theta_size)
2503 {
2504 const int x = get_global_id(0);
2505 const int y = get_global_id(1);
2506 const int columns = get_global_size(0);
2507 const int rows = get_global_size(1);
2508 unsigned int step = 1;
2509 float center_x = (float) x - blurCenter.x;
2510 float center_y = (float) y - blurCenter.y;
2511 float radius = hypot(center_x, center_y);
2512
2513 //float blur_radius = hypot((float) columns/2.0f, (float) rows/2.0f);
2514 float blur_radius = hypot(blurCenter.x, blurCenter.y);
2515
2516 if (radius > MagickEpsilon)
2517 {
2518 step = (unsigned int) (blur_radius / radius);
2519 if (step == 0)
2520 step = 1;
2521 if (step >= cossin_theta_size)
2522 step = cossin_theta_size-1;
2523 }
2524
2525 float4 result;
2526 result.x = (float)bias.x;
2527 result.y = (float)bias.y;
2528 result.z = (float)bias.z;
2529 result.w = (float)bias.w;
2530 float normalize = 0.0f;
2531
2532 if (((channel & OpacityChannel) == 0) || (matte == 0)) {
2533 for (unsigned int i=0; i<cossin_theta_size; i+=step)
2534 {
2535 result += convert_float4(im[
2536 ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns)+
2537 ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f, rows)*columns]);
2538 normalize += 1.0f;
2539 }
2540 normalize = PerceptibleReciprocal(normalize);
2541 result = result * normalize;
2542 }
2543 else {
2544 float gamma = 0.0f;
2545 for (unsigned int i=0; i<cossin_theta_size; i+=step)
2546 {
2547 float4 p = convert_float4(im[
2548 ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns)+
2549 ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f, rows)*columns]);
2550
2551 float alpha = (float)(QuantumScale*(QuantumRange-p.w));
2552 result.x += alpha * p.x;
2553 result.y += alpha * p.y;
2554 result.z += alpha * p.z;
2555 result.w += p.w;
2556 gamma+=alpha;
2557 normalize += 1.0f;
2558 }
2559 gamma = PerceptibleReciprocal(gamma);
2560 normalize = PerceptibleReciprocal(normalize);
2561 result.x = gamma*result.x;
2562 result.y = gamma*result.y;
2563 result.z = gamma*result.z;
2564 result.w = normalize*result.w;
2565 }
2566 filtered_im[y * columns + x] = (CLPixelType) (ClampToQuantum(result.x), ClampToQuantum(result.y),
2567 ClampToQuantum(result.z), ClampToQuantum(result.w));
2568 }
2569 )
2570
2571/*
2572%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2573% %
2574% %
2575% %
2576% R e s i z e %
2577% %
2578% %
2579% %
2580%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2581*/
2582
2583 STRINGIFY(
2584 // Based on Box from resize.c
2585 float BoxResizeFilter(const float x)
2586 {
2587 return 1.0f;
2588 }
2589 )
2590
2591 STRINGIFY(
2592 // Based on CubicBC from resize.c
2593 float CubicBC(const float x,const __global float* resizeFilterCoefficients)
2594 {
2595 /*
2596 Cubic Filters using B,C determined values:
2597 Mitchell-Netravali B = 1/3 C = 1/3 "Balanced" cubic spline filter
2598 Catmull-Rom B = 0 C = 1/2 Interpolatory and exact on linears
2599 Spline B = 1 C = 0 B-Spline Gaussian approximation
2600 Hermite B = 0 C = 0 B-Spline interpolator
2601
2602 See paper by Mitchell and Netravali, Reconstruction Filters in Computer
2603 Graphics Computer Graphics, Volume 22, Number 4, August 1988
2604 http://www.cs.utexas.edu/users/fussell/courses/cs384g/lectures/mitchell/
2605 Mitchell.pdf.
2606
2607 Coefficients are determined from B,C values:
2608 P0 = ( 6 - 2*B )/6 = coeff[0]
2609 P1 = 0
2610 P2 = (-18 +12*B + 6*C )/6 = coeff[1]
2611 P3 = ( 12 - 9*B - 6*C )/6 = coeff[2]
2612 Q0 = ( 8*B +24*C )/6 = coeff[3]
2613 Q1 = ( -12*B -48*C )/6 = coeff[4]
2614 Q2 = ( 6*B +30*C )/6 = coeff[5]
2615 Q3 = ( - 1*B - 6*C )/6 = coeff[6]
2616
2617 which are used to define the filter:
2618
2619 P0 + P1*x + P2*x^2 + P3*x^3 0 <= x < 1
2620 Q0 + Q1*x + Q2*x^2 + Q3*x^3 1 <= x < 2
2621
2622 which ensures function is continuous in value and derivative (slope).
2623 */
2624 if (x < 1.0)
2625 return(resizeFilterCoefficients[0]+x*(x*
2626 (resizeFilterCoefficients[1]+x*resizeFilterCoefficients[2])));
2627 if (x < 2.0)
2628 return(resizeFilterCoefficients[3]+x*(resizeFilterCoefficients[4]+x*
2629 (resizeFilterCoefficients[5]+x*resizeFilterCoefficients[6])));
2630 return(0.0);
2631 }
2632 )
2633
2634 STRINGIFY(
2635 float Sinc(const float x)
2636 {
2637 if (x != 0.0f)
2638 {
2639 const float alpha=(float) (MagickPI*x);
2640 return sinpi(x)/alpha;
2641 }
2642 return(1.0f);
2643 }
2644 )
2645
2646 STRINGIFY(
2647 float Triangle(const float x)
2648 {
2649 /*
2650 1st order (linear) B-Spline, bilinear interpolation, Tent 1D filter, or
2651 a Bartlett 2D Cone filter. Also used as a Bartlett Windowing function
2652 for Sinc().
2653 */
2654 return ((x<1.0f)?(1.0f-x):0.0f);
2655 }
2656 )
2657
2658
2659 STRINGIFY(
2660 float Hanning(const float x)
2661 {
2662 /*
2663 Cosine window function:
2664 0.5+0.5*cos(pi*x).
2665 */
2666 const float cosine=cos((MagickPI*x));
2667 return(0.5f+0.5f*cosine);
2668 }
2669 )
2670
2671 STRINGIFY(
2672 float Hamming(const float x)
2673 {
2674 /*
2675 Offset cosine window function:
2676 .54 + .46 cos(pi x).
2677 */
2678 const float cosine=cos((MagickPI*x));
2679 return(0.54f+0.46f*cosine);
2680 }
2681 )
2682
2683 STRINGIFY(
2684 float Blackman(const float x)
2685 {
2686 /*
2687 Blackman: 2nd order cosine windowing function:
2688 0.42 + 0.5 cos(pi x) + 0.08 cos(2pi x)
2689
2690 Refactored by Chantal Racette and Nicolas Robidoux to one trig call and
2691 five flops.
2692 */
2693 const float cosine=cos((MagickPI*x));
2694 return(0.34f+cosine*(0.5f+cosine*0.16f));
2695 }
2696 )
2697
2698
2699
2700
2701 STRINGIFY(
2702 static inline float applyResizeFilter(const float x, const ResizeWeightingFunctionType filterType, const __global float* filterCoefficients)
2703 {
2704 switch (filterType)
2705 {
2706 /* Call Sinc even for SincFast to get better precision on GPU
2707 and to avoid thread divergence. Sinc is pretty fast on GPU anyway...*/
2708 case SincWeightingFunction:
2709 case SincFastWeightingFunction:
2710 return Sinc(x);
2711 case CubicBCWeightingFunction:
2712 return CubicBC(x,filterCoefficients);
2713 case BoxWeightingFunction:
2714 return BoxResizeFilter(x);
2715 case TriangleWeightingFunction:
2716 return Triangle(x);
2717 case HanningWeightingFunction:
2718 return Hanning(x);
2719 case HammingWeightingFunction:
2720 return Hamming(x);
2721 case BlackmanWeightingFunction:
2722 return Blackman(x);
2723
2724 default:
2725 return 0.0f;
2726 }
2727 }
2728 )
2729
2730
2731 STRINGIFY(
2732 static inline float getResizeFilterWeight(const __global float* resizeFilterCubicCoefficients, const ResizeWeightingFunctionType resizeFilterType
2733 , const ResizeWeightingFunctionType resizeWindowType
2734 , const float resizeFilterScale, const float resizeWindowSupport, const float resizeFilterBlur, const float x)
2735 {
2736 float scale;
2737 float xBlur = fabs(x/resizeFilterBlur);
2738 if (resizeWindowSupport < MagickEpsilon
2739 || resizeWindowType == BoxWeightingFunction)
2740 {
2741 scale = 1.0f;
2742 }
2743 else
2744 {
2745 scale = resizeFilterScale;
2746 scale = applyResizeFilter(xBlur*scale, resizeWindowType, resizeFilterCubicCoefficients);
2747 }
2748 float weight = scale * applyResizeFilter(xBlur, resizeFilterType, resizeFilterCubicCoefficients);
2749 return weight;
2750 }
2751
2752 )
2753
2754 ;
2755 const char* accelerateKernels2 =
2756
2757 STRINGIFY(
2758
2759 static inline unsigned int getNumWorkItemsPerPixel(const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
2760 return (numWorkItems/pixelPerWorkgroup);
2761 }
2762
2763 // returns the index of the pixel for the current workitem to compute.
2764 // returns -1 if this workitem doesn't need to participate in any computation
2765 static inline int pixelToCompute(const unsigned itemID, const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
2766 const unsigned int numWorkItemsPerPixel = getNumWorkItemsPerPixel(pixelPerWorkgroup, numWorkItems);
2767 int pixelIndex = itemID/numWorkItemsPerPixel;
2768 pixelIndex = (pixelIndex<pixelPerWorkgroup)?pixelIndex:-1;
2769 return pixelIndex;
2770 }
2771
2772 )
2773
2774 STRINGIFY(
2775 __kernel __attribute__((reqd_work_group_size(256, 1, 1)))
2776 void ResizeHorizontalFilter(const __global CLPixelType* inputImage, const unsigned int inputColumns, const unsigned int inputRows, const unsigned int matte
2777 , const float xFactor, __global CLPixelType* filteredImage, const unsigned int filteredColumns, const unsigned int filteredRows
2778 , const int resizeFilterType, const int resizeWindowType
2779 , const __global float* resizeFilterCubicCoefficients
2780 , const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport, const float resizeFilterBlur
2781 , __local CLPixelType* inputImageCache, const int numCachedPixels, const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize
2782 , __local float4* outputPixelCache, __local float* densityCache, __local float* gammaCache) {
2783
2784
2785 // calculate the range of resized image pixels computed by this workgroup
2786 const unsigned int startX = get_group_id(0)*pixelPerWorkgroup;
2787 const unsigned int stopX = MagickMin(startX + pixelPerWorkgroup,filteredColumns);
2788 const unsigned int actualNumPixelToCompute = stopX - startX;
2789
2790 // calculate the range of input image pixels to cache
2791 float scale = MagickMax(1.0f/xFactor+MagickEpsilon ,1.0f);
2792 const float support = MagickMax(scale*resizeFilterSupport,0.5f);
2793 scale = PerceptibleReciprocal(scale);
2794
2795 const int cacheRangeStartX = MagickMax((int)((startX+0.5f)/xFactor+MagickEpsilon-support+0.5f),(int)(0));
2796 const int cacheRangeEndX = MagickMin((int)(cacheRangeStartX + numCachedPixels), (int)inputColumns);
2797
2798 // cache the input pixels into local memory
2799 const unsigned int y = get_global_id(1);
2800 event_t e = async_work_group_copy(inputImageCache,inputImage+y*inputColumns+cacheRangeStartX,cacheRangeEndX-cacheRangeStartX,0);
2801 wait_group_events(1,&e);
2802
2803 unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
2804 for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
2805 {
2806
2807 const unsigned int chunkStartX = startX + chunk*pixelChunkSize;
2808 const unsigned int chunkStopX = MagickMin(chunkStartX + pixelChunkSize, stopX);
2809 const unsigned int actualNumPixelInThisChunk = chunkStopX - chunkStartX;
2810
2811 // determine which resized pixel computed by this workitem
2812 const unsigned int itemID = get_local_id(0);
2813 const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(0));
2814
2815 const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(0));
2816
2817 float4 filteredPixel = (float4)0.0f;
2818 float density = 0.0f;
2819 float gamma = 0.0f;
2820 // -1 means this workitem doesn't participate in the computation
2821 if (pixelIndex != -1) {
2822
2823 // x coordinated of the resized pixel computed by this workitem
2824 const int x = chunkStartX + pixelIndex;
2825
2826 // calculate how many steps required for this pixel
2827 const float bisect = (x+0.5)/xFactor+MagickEpsilon;
2828 const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
2829 const unsigned int stop = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputColumns);
2830 const unsigned int n = stop - start;
2831
2832 // calculate how many steps this workitem will contribute
2833 unsigned int numStepsPerWorkItem = n / numItems;
2834 numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
2835
2836 const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
2837 if (startStep < n) {
2838 const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
2839
2840 unsigned int cacheIndex = start+startStep-cacheRangeStartX;
2841 if (matte == 0) {
2842
2843 for (unsigned int i = startStep; i < stopStep; i++,cacheIndex++) {
2844 float4 cp = convert_float4(inputImageCache[cacheIndex]);
2845
2846 float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,(ResizeWeightingFunctionType)resizeFilterType
2847 , (ResizeWeightingFunctionType)resizeWindowType
2848 , resizeFilterScale, resizeFilterWindowSupport, resizeFilterBlur,scale*(start+i-bisect+0.5));
2849
2850 filteredPixel += ((float4)weight)*cp;
2851 density+=weight;
2852 }
2853
2854
2855 }
2856 else {
2857 for (unsigned int i = startStep; i < stopStep; i++,cacheIndex++) {
2858 CLPixelType p = inputImageCache[cacheIndex];
2859
2860 float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,(ResizeWeightingFunctionType)resizeFilterType
2861 , (ResizeWeightingFunctionType)resizeWindowType
2862 , resizeFilterScale, resizeFilterWindowSupport, resizeFilterBlur,scale*(start+i-bisect+0.5));
2863
2864 float alpha = weight * QuantumScale * GetPixelAlpha(p);
2865 float4 cp = convert_float4(p);
2866
2867 filteredPixel.x += alpha * cp.x;
2868 filteredPixel.y += alpha * cp.y;
2869 filteredPixel.z += alpha * cp.z;
2870 filteredPixel.w += weight * cp.w;
2871
2872 density+=weight;
2873 gamma+=alpha;
2874 }
2875 }
2876 }
2877 }
2878
2879 // initialize the accumulators to zero
2880 if (itemID < actualNumPixelInThisChunk) {
2881 outputPixelCache[itemID] = (float4)0.0f;
2882 densityCache[itemID] = 0.0f;
2883 if (matte != 0)
2884 gammaCache[itemID] = 0.0f;
2885 }
2886 barrier(CLK_LOCAL_MEM_FENCE);
2887
2888 // accumulatte the filtered pixel value and the density
2889 for (unsigned int i = 0; i < numItems; i++) {
2890 if (pixelIndex != -1) {
2891 if (itemID%numItems == i) {
2892 outputPixelCache[pixelIndex]+=filteredPixel;
2893 densityCache[pixelIndex]+=density;
2894 if (matte!=0) {
2895 gammaCache[pixelIndex]+=gamma;
2896 }
2897 }
2898 }
2899 barrier(CLK_LOCAL_MEM_FENCE);
2900 }
2901
2902 if (itemID < actualNumPixelInThisChunk) {
2903 if (matte==0) {
2904 float density = densityCache[itemID];
2905 float4 filteredPixel = outputPixelCache[itemID];
2906 if (density!= 0.0f && density != 1.0)
2907 {
2908 density = PerceptibleReciprocal(density);
2909 filteredPixel *= (float4)density;
2910 }
2911 filteredImage[y*filteredColumns+chunkStartX+itemID] = (CLPixelType) (ClampToQuantum(filteredPixel.x)
2912 , ClampToQuantum(filteredPixel.y)
2913 , ClampToQuantum(filteredPixel.z)
2914 , ClampToQuantum(filteredPixel.w));
2915 }
2916 else {
2917 float density = densityCache[itemID];
2918 float gamma = gammaCache[itemID];
2919 float4 filteredPixel = outputPixelCache[itemID];
2920
2921 if (density!= 0.0f && density != 1.0) {
2922 density = PerceptibleReciprocal(density);
2923 filteredPixel *= (float4)density;
2924 gamma *= density;
2925 }
2926 gamma = PerceptibleReciprocal(gamma);
2927
2928 CLPixelType fp;
2929 fp = (CLPixelType) ( ClampToQuantum(gamma*filteredPixel.x)
2930 , ClampToQuantum(gamma*filteredPixel.y)
2931 , ClampToQuantum(gamma*filteredPixel.z)
2932 , ClampToQuantum(filteredPixel.w));
2933
2934 filteredImage[y*filteredColumns+chunkStartX+itemID] = fp;
2935
2936 }
2937 }
2938
2939 } // end of chunking loop
2940 }
2941 )
2942
2943
2944 STRINGIFY(
2945 __kernel __attribute__((reqd_work_group_size(1, 256, 1)))
2946 void ResizeVerticalFilter(const __global CLPixelType* inputImage, const unsigned int inputColumns, const unsigned int inputRows, const unsigned int matte
2947 , const float yFactor, __global CLPixelType* filteredImage, const unsigned int filteredColumns, const unsigned int filteredRows
2948 , const int resizeFilterType, const int resizeWindowType
2949 , const __global float* resizeFilterCubicCoefficients
2950 , const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport, const float resizeFilterBlur
2951 , __local CLPixelType* inputImageCache, const int numCachedPixels, const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize
2952 , __local float4* outputPixelCache, __local float* densityCache, __local float* gammaCache) {
2953
2954
2955 // calculate the range of resized image pixels computed by this workgroup
2956 const unsigned int startY = get_group_id(1)*pixelPerWorkgroup;
2957 const unsigned int stopY = MagickMin(startY + pixelPerWorkgroup,filteredRows);
2958 const unsigned int actualNumPixelToCompute = stopY - startY;
2959
2960 // calculate the range of input image pixels to cache
2961 float scale = MagickMax(1.0f/yFactor+MagickEpsilon ,1.0f);
2962 const float support = MagickMax(scale*resizeFilterSupport,0.5f);
2963 scale = PerceptibleReciprocal(scale);
2964
2965 const int cacheRangeStartY = MagickMax((int)((startY+0.5f)/yFactor+MagickEpsilon-support+0.5f),(int)(0));
2966 const int cacheRangeEndY = MagickMin((int)(cacheRangeStartY + numCachedPixels), (int)inputRows);
2967
2968 // cache the input pixels into local memory
2969 const unsigned int x = get_global_id(0);
2970 event_t e = async_work_group_strided_copy(inputImageCache, inputImage+cacheRangeStartY*inputColumns+x, cacheRangeEndY-cacheRangeStartY, inputColumns, 0);
2971 wait_group_events(1,&e);
2972
2973 unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
2974 for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
2975 {
2976
2977 const unsigned int chunkStartY = startY + chunk*pixelChunkSize;
2978 const unsigned int chunkStopY = MagickMin(chunkStartY + pixelChunkSize, stopY);
2979 const unsigned int actualNumPixelInThisChunk = chunkStopY - chunkStartY;
2980
2981 // determine which resized pixel computed by this workitem
2982 const unsigned int itemID = get_local_id(1);
2983 const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(1));
2984
2985 const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(1));
2986
2987 float4 filteredPixel = (float4)0.0f;
2988 float density = 0.0f;
2989 float gamma = 0.0f;
2990 // -1 means this workitem doesn't participate in the computation
2991 if (pixelIndex != -1) {
2992
2993 // x coordinated of the resized pixel computed by this workitem
2994 const int y = chunkStartY + pixelIndex;
2995
2996 // calculate how many steps required for this pixel
2997 const float bisect = (y+0.5)/yFactor+MagickEpsilon;
2998 const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
2999 const unsigned int stop = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputRows);
3000 const unsigned int n = stop - start;
3001
3002 // calculate how many steps this workitem will contribute
3003 unsigned int numStepsPerWorkItem = n / numItems;
3004 numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
3005
3006 const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
3007 if (startStep < n) {
3008 const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
3009
3010 unsigned int cacheIndex = start+startStep-cacheRangeStartY;
3011 if (matte == 0) {
3012
3013 for (unsigned int i = startStep; i < stopStep; i++,cacheIndex++) {
3014 float4 cp = convert_float4(inputImageCache[cacheIndex]);
3015
3016 float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,(ResizeWeightingFunctionType)resizeFilterType
3017 , (ResizeWeightingFunctionType)resizeWindowType
3018 , resizeFilterScale, resizeFilterWindowSupport, resizeFilterBlur,scale*(start+i-bisect+0.5));
3019
3020 filteredPixel += ((float4)weight)*cp;
3021 density+=weight;
3022 }
3023
3024
3025 }
3026 else {
3027 for (unsigned int i = startStep; i < stopStep; i++,cacheIndex++) {
3028 CLPixelType p = inputImageCache[cacheIndex];
3029
3030 float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,(ResizeWeightingFunctionType)resizeFilterType
3031 , (ResizeWeightingFunctionType)resizeWindowType
3032 , resizeFilterScale, resizeFilterWindowSupport, resizeFilterBlur,scale*(start+i-bisect+0.5));
3033
3034 float alpha = weight * QuantumScale * GetPixelAlpha(p);
3035 float4 cp = convert_float4(p);
3036
3037 filteredPixel.x += alpha * cp.x;
3038 filteredPixel.y += alpha * cp.y;
3039 filteredPixel.z += alpha * cp.z;
3040 filteredPixel.w += weight * cp.w;
3041
3042 density+=weight;
3043 gamma+=alpha;
3044 }
3045 }
3046 }
3047 }
3048
3049 // initialize the accumulators to zero
3050 if (itemID < actualNumPixelInThisChunk) {
3051 outputPixelCache[itemID] = (float4)0.0f;
3052 densityCache[itemID] = 0.0f;
3053 if (matte != 0)
3054 gammaCache[itemID] = 0.0f;
3055 }
3056 barrier(CLK_LOCAL_MEM_FENCE);
3057
3058 // accumulatte the filtered pixel value and the density
3059 for (unsigned int i = 0; i < numItems; i++) {
3060 if (pixelIndex != -1) {
3061 if (itemID%numItems == i) {
3062 outputPixelCache[pixelIndex]+=filteredPixel;
3063 densityCache[pixelIndex]+=density;
3064 if (matte!=0) {
3065 gammaCache[pixelIndex]+=gamma;
3066 }
3067 }
3068 }
3069 barrier(CLK_LOCAL_MEM_FENCE);
3070 }
3071
3072 if (itemID < actualNumPixelInThisChunk) {
3073 if (matte==0) {
3074 float density = densityCache[itemID];
3075 float4 filteredPixel = outputPixelCache[itemID];
3076 if (density!= 0.0f && density != 1.0)
3077 {
3078 density = PerceptibleReciprocal(density);
3079 filteredPixel *= (float4)density;
3080 }
3081 filteredImage[(chunkStartY+itemID)*filteredColumns+x] = (CLPixelType) (ClampToQuantum(filteredPixel.x)
3082 , ClampToQuantum(filteredPixel.y)
3083 , ClampToQuantum(filteredPixel.z)
3084 , ClampToQuantum(filteredPixel.w));
3085 }
3086 else {
3087 float density = densityCache[itemID];
3088 float gamma = gammaCache[itemID];
3089 float4 filteredPixel = outputPixelCache[itemID];
3090
3091 if (density!= 0.0f && density != 1.0) {
3092 density = PerceptibleReciprocal(density);
3093 filteredPixel *= (float4)density;
3094 gamma *= density;
3095 }
3096 gamma = PerceptibleReciprocal(gamma);
3097
3098 CLPixelType fp;
3099 fp = (CLPixelType) ( ClampToQuantum(gamma*filteredPixel.x)
3100 , ClampToQuantum(gamma*filteredPixel.y)
3101 , ClampToQuantum(gamma*filteredPixel.z)
3102 , ClampToQuantum(filteredPixel.w));
3103
3104 filteredImage[(chunkStartY+itemID)*filteredColumns+x] = fp;
3105
3106 }
3107 }
3108
3109 } // end of chunking loop
3110 }
3111 )
3112
3113/*
3114%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
3115% %
3116% %
3117% %
3118% U n s h a r p M a s k %
3119% %
3120% %
3121% %
3122%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
3123*/
3124
3125 STRINGIFY(
3126 __kernel void UnsharpMaskBlurColumn(const __global CLPixelType* inputImage,
3127 const __global float4 *blurRowData, __global CLPixelType *filtered_im,
3128 const unsigned int imageColumns, const unsigned int imageRows,
3129 __local float4* cachedData, __local float* cachedFilter,
3130 const ChannelType channel, const __global float *filter, const unsigned int width,
3131 const float gain, const float threshold)
3132 {
3133 const unsigned int radius = (width-1)/2;
3134
3135 // cache the pixel shared by the workgroup
3136 const int groupX = get_group_id(0);
3137 const int groupStartY = get_group_id(1)*get_local_size(1) - radius;
3138 const int groupStopY = (get_group_id(1)+1)*get_local_size(1) + radius;
3139
3140 if (groupStartY >= 0
3141 && groupStopY < imageRows) {
3142 event_t e = async_work_group_strided_copy(cachedData
3143 ,blurRowData+groupStartY*imageColumns+groupX
3144 ,groupStopY-groupStartY,imageColumns,0);
3145 wait_group_events(1,&e);
3146 }
3147 else {
3148 for (int i = get_local_id(1); i < (groupStopY - groupStartY); i+=get_local_size(1)) {
3149 cachedData[i] = blurRowData[ClampToCanvas(groupStartY+i,imageRows)*imageColumns+ groupX];
3150 }
3151 barrier(CLK_LOCAL_MEM_FENCE);
3152 }
3153 // cache the filter as well
3154 event_t e = async_work_group_copy(cachedFilter,filter,width,0);
3155 wait_group_events(1,&e);
3156
3157 // only do the work if this is not a patched item
3158 //const int cy = get_group_id(1)*get_local_size(1)+get_local_id(1);
3159 const int cy = get_global_id(1);
3160
3161 if (cy < imageRows) {
3162 float4 blurredPixel = (float4) 0.0f;
3163
3164 int i = 0;
3165
3166 \n #ifndef UFACTOR \n
3167 \n #define UFACTOR 8 \n
3168 \n #endif \n
3169
3170 for ( ; i+UFACTOR < width; )
3171 {
3172 \n #pragma unroll UFACTOR \n
3173 for (int j=0; j < UFACTOR; j++, i++)
3174 {
3175 blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
3176 }
3177 }
3178
3179 for ( ; i < width; i++)
3180 {
3181 blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
3182 }
3183
3184 blurredPixel = floor((float4)(ClampToQuantum(blurredPixel.x), ClampToQuantum(blurredPixel.y)
3185 ,ClampToQuantum(blurredPixel.z), ClampToQuantum(blurredPixel.w)));
3186
3187 float4 inputImagePixel = convert_float4(inputImage[cy*imageColumns+groupX]);
3188 float4 outputPixel = inputImagePixel - blurredPixel;
3189
3190 float quantumThreshold = QuantumRange*threshold;
3191
3192 int4 mask = isless(fabs(2.0f*outputPixel), (float4)quantumThreshold);
3193 outputPixel = select(inputImagePixel + outputPixel * gain, inputImagePixel, mask);
3194
3195 //write back
3196 filtered_im[cy*imageColumns+groupX] = (CLPixelType) (ClampToQuantum(outputPixel.x), ClampToQuantum(outputPixel.y)
3197 ,ClampToQuantum(outputPixel.z), ClampToQuantum(outputPixel.w));
3198
3199 }
3200 }
3201 )
3202
3203
3204
3205 STRINGIFY(
3206 __kernel void UnsharpMask(__global CLPixelType *im, __global CLPixelType *filtered_im,
3207 __constant float *filter,
3208 const unsigned int width,
3209 const unsigned int imageColumns, const unsigned int imageRows,
3210 __local float4 *pixels,
3211 const float gain, const float threshold, const unsigned int justBlur)
3212 {
3213 const int x = get_global_id(0);
3214 const int y = get_global_id(1);
3215
3216 const unsigned int radius = (width - 1) / 2;
3217
3218 int row = y - radius;
3219 int baseRow = get_group_id(1) * get_local_size(1) - radius;
3220 int endRow = (get_group_id(1) + 1) * get_local_size(1) + radius;
3221
3222 while (row < endRow) {
3223 int srcy = (row < 0) ? -row : row; // mirror pad
3224 srcy = (srcy >= imageRows) ? (2 * imageRows - srcy - 1) : srcy;
3225
3226 float4 value = 0.0f;
3227
3228 int ix = x - radius;
3229 int i = 0;
3230
3231 while (i + 7 < width) {
3232 for (int j = 0; j < 8; ++j) { // unrolled
3233 int srcx = ix + j;
3234 srcx = (srcx < 0) ? -srcx : srcx;
3235 srcx = (srcx >= imageColumns) ? (2 * imageColumns - srcx - 1) : srcx;
3236 value += filter[i + j] * convert_float4(im[srcx + srcy * imageColumns]);
3237 }
3238 ix += 8;
3239 i += 8;
3240 }
3241
3242 while (i < width) {
3243 int srcx = (ix < 0) ? -ix : ix; // mirror pad
3244 srcx = (srcx >= imageColumns) ? (2 * imageColumns - srcx - 1) : srcx;
3245 value += filter[i] * convert_float4(im[srcx + srcy * imageColumns]);
3246 ++i;
3247 ++ix;
3248 }
3249 pixels[(row - baseRow) * get_local_size(0) + get_local_id(0)] = value;
3250 row += get_local_size(1);
3251 }
3252
3253
3254 barrier(CLK_LOCAL_MEM_FENCE);
3255
3256
3257 const int px = get_local_id(0);
3258 const int py = get_local_id(1);
3259 const int prp = get_local_size(0);
3260 float4 value = (float4)(0.0f);
3261
3262 int i = 0;
3263 while (i + 7 < width) { // unrolled
3264 value += (float4)(filter[i]) * pixels[px + (py + i) * prp];
3265 value += (float4)(filter[i]) * pixels[px + (py + i + 1) * prp];
3266 value += (float4)(filter[i]) * pixels[px + (py + i + 2) * prp];
3267 value += (float4)(filter[i]) * pixels[px + (py + i + 3) * prp];
3268 value += (float4)(filter[i]) * pixels[px + (py + i + 4) * prp];
3269 value += (float4)(filter[i]) * pixels[px + (py + i + 5) * prp];
3270 value += (float4)(filter[i]) * pixels[px + (py + i + 6) * prp];
3271 value += (float4)(filter[i]) * pixels[px + (py + i + 7) * prp];
3272 i += 8;
3273 }
3274 while (i < width) {
3275 value += (float4)(filter[i]) * pixels[px + (py + i) * prp];
3276 ++i;
3277 }
3278 if ((x < imageColumns) && (y < imageRows)) {
3279 if (justBlur == 0) { // apply sharpening
3280 float4 srcPixel = convert_float4(im[x + y * imageColumns]);
3281 float4 diff = srcPixel - value;
3282
3283 float quantumThreshold = QuantumRange*threshold;
3284
3285 int4 mask = isless(fabs(2.0f * diff), (float4)quantumThreshold);
3286 value = select(srcPixel + diff * gain, srcPixel, mask);
3287 }
3288 filtered_im[x + y * imageColumns] = (CLPixelType)(ClampToQuantum(value.s0), ClampToQuantum(value.s1), ClampToQuantum(value.s2), ClampToQuantum(value.s3));
3289 }
3290 }
3291 )
3292
3293 STRINGIFY(
3294 __kernel __attribute__((reqd_work_group_size(64, 4, 1))) void WaveletDenoise(__global CLPixelType *srcImage, __global CLPixelType *dstImage,
3295 const float threshold,
3296 const int passes,
3297 const int imageWidth,
3298 const int imageHeight)
3299 {
3300 const int pad = (1 << (passes - 1));
3301 const int tileSize = 64;
3302 const int tileRowPixels = 64;
3303 const float noise[] = { 0.8002, 0.2735, 0.1202, 0.0585, 0.0291, 0.0152, 0.0080, 0.0044 };
3304
3305 CLPixelType stage[16];
3306
3307 local float buffer[64 * 64];
3308
3309 int srcx = (get_group_id(0) + get_global_offset(0) / tileSize) * (tileSize - 2 * pad) - pad + get_local_id(0);
3310 int srcy = (get_group_id(1) + get_global_offset(1) / 4) * (tileSize - 2 * pad) - pad;
3311
3312 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3313 stage[i / 4] = srcImage[mirrorTop(mirrorBottom(srcx), imageWidth) + (mirrorTop(mirrorBottom(srcy + i) , imageHeight)) * imageWidth];
3314 }
3315
3316
3317 for (int channel = 0; channel < 3; ++channel) {
3318 // Load LDS
3319 switch (channel) {
3320 case 0:
3321 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3322 buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[i / 4].s0);
3323 break;
3324 case 1:
3325 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3326 buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[i / 4].s1);
3327 break;
3328 case 2:
3329 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3330 buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[i / 4].s2);
3331 break;
3332 }
3333
3334
3335 // Process
3336
3337 float tmp[16];
3338 float accum[16];
3339 float pixel;
3340
3341 for (int pass = 0; pass < passes; ++pass) {
3342 const int radius = 1 << pass;
3343 const int x = get_local_id(0);
3344 const float thresh = threshold * noise[pass];
3345
3346 if (pass == 0)
3347 accum[0] = accum[1] = accum[2] = accum[3] = accum[4] = accum[5] = accum[6] = accum[6] = accum[7] = accum[8] = accum[9] = accum[10] = accum[11] = accum[12] = accum[13] = accum[14] = accum[15] = 0.0f;
3348
3349 // Snapshot input
3350
3351 // Apply horizontal hat
3352 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3353 const int offset = i * tileRowPixels;
3354 if (pass == 0)
3355 tmp[i / 4] = buffer[x + offset]; // snapshot input on first pass
3356 pixel = 0.5f * tmp[i / 4] + 0.25 * (buffer[mirrorBottom(x - radius) + offset] + buffer[mirrorTop(x + radius, tileSize) + offset]);
3357 barrier(CLK_LOCAL_MEM_FENCE);
3358 buffer[x + offset] = pixel;
3359 }
3360 barrier(CLK_LOCAL_MEM_FENCE);
3361 // Apply vertical hat
3362 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3363 pixel = 0.5f * buffer[x + i * tileRowPixels] + 0.25 * (buffer[x + mirrorBottom(i - radius) * tileRowPixels] + buffer[x + mirrorTop(i + radius, tileRowPixels) * tileRowPixels]);
3364 float delta = tmp[i / 4] - pixel;
3365 tmp[i / 4] = pixel; // hold output in tmp until all workitems are done
3366 if (delta < -thresh)
3367 delta += thresh;
3368 else if (delta > thresh)
3369 delta -= thresh;
3370 else
3371 delta = 0;
3372 accum[i / 4] += delta;
3373
3374 }
3375 barrier(CLK_LOCAL_MEM_FENCE);
3376 if (pass < passes - 1)
3377 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3378 buffer[x + i * tileRowPixels] = tmp[i / 4]; // store lowpass for next pass
3379 else // last pass
3380 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3381 accum[i / 4] += tmp[i / 4]; // add the lowpass signal back to output
3382 barrier(CLK_LOCAL_MEM_FENCE);
3383 }
3384
3385 switch (channel) {
3386 case 0:
3387 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3388 stage[i / 4].s0 = ClampToQuantum(accum[i / 4]);
3389 break;
3390 case 1:
3391 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3392 stage[i / 4].s1 = ClampToQuantum(accum[i / 4]);
3393 break;
3394 case 2:
3395 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3396 stage[i / 4].s2 = ClampToQuantum(accum[i / 4]);
3397 break;
3398 }
3399
3400 barrier(CLK_LOCAL_MEM_FENCE);
3401 }
3402
3403 // Write from stage to output
3404
3405 if ((get_local_id(0) >= pad) && (get_local_id(0) < tileSize - pad) && (srcx >= 0) && (srcx < imageWidth)) {
3406 //for (int i = pad + get_local_id(1); i < tileSize - pad; i += get_local_size(1)) {
3407 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3408 if ((i >= pad) && (i < tileSize - pad) && (srcy + i >= 0) && (srcy + i < imageHeight)) {
3409 dstImage[srcx + (srcy + i) * imageWidth] = stage[i / 4];
3410 }
3411 }
3412 }
3413 }
3414 )
3415 ;
3416
3417#endif // MAGICKCORE_OPENCL_SUPPORT
3418
3419#if defined(__cplusplus) || defined(c_plusplus)
3420}
3421#endif
3422
3423#endif // MAGICKCORE_ACCELERATE_PRIVATE_H