MagickCore  6.9.12-43
Convert, Edit, Or Compose Bitmap Images
accelerate-kernels-private.h
Go to the documentation of this file.
1 /*
2  Copyright 1999-2021 ImageMagick Studio LLC, a non-profit organization
3  dedicated to making software imaging solutions freely available.
4 
5  You may not use this file except in compliance with the License. You may
6  obtain a copy of the License at
7 
8  https://imagemagick.org/script/license.php
9 
10  Unless required by applicable law or agreed to in writing, software
11  distributed under the License is distributed on an "AS IS" BASIS,
12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  See the License for the specific language governing permissions and
14  limitations under the License.
15 
16  MagickCore private methods for accelerated functions.
17 */
18 
19 #ifndef MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
20 #define MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
21 
22 #if defined(__cplusplus) || defined(c_plusplus)
23 extern "C" {
24 #endif
25 
26 #if defined(MAGICKCORE_OPENCL_SUPPORT)
27 
28 /*
29  Define declarations.
30 */
31 #define OPENCL_DEFINE(VAR,...) "\n #""define " #VAR " " #__VA_ARGS__ " \n"
32 #define OPENCL_ELIF(...) "\n #""elif " #__VA_ARGS__ " \n"
33 #define OPENCL_ELSE() "\n #""else " " \n"
34 #define OPENCL_ENDIF() "\n #""endif " " \n"
35 #define OPENCL_IF(...) "\n #""if " #__VA_ARGS__ " \n"
36 #define STRINGIFY(...) #__VA_ARGS__ "\n"
37 
38 const char* accelerateKernels =
39 
40 /*
41  Define declarations.
42 */
43  OPENCL_DEFINE(GetPixelAlpha(pixel),(QuantumRange-(pixel).w))
44  OPENCL_DEFINE(SigmaUniform, (attenuate*0.015625f))
45  OPENCL_DEFINE(SigmaGaussian, (attenuate*0.015625f))
46  OPENCL_DEFINE(SigmaImpulse, (attenuate*0.1f))
47  OPENCL_DEFINE(SigmaLaplacian, (attenuate*0.0390625f))
48  OPENCL_DEFINE(SigmaMultiplicativeGaussian, (attenuate*0.5f))
49  OPENCL_DEFINE(SigmaPoisson, (attenuate*12.5f))
50  OPENCL_DEFINE(SigmaRandom, (attenuate))
51  OPENCL_DEFINE(TauGaussian, (attenuate*0.078125f))
52  OPENCL_DEFINE(MagickMax(x, y), (((x) > (y)) ? (x) : (y)))
53  OPENCL_DEFINE(MagickMin(x, y), (((x) < (y)) ? (x) : (y)))
54 
55 /*
56  Typedef declarations.
57 */
58  STRINGIFY(
59  typedef enum
60  {
62  RGBColorspace, /* Linear RGB colorspace */
63  GRAYColorspace, /* greyscale (non-linear) image (faked 1 channel) */
73  CMYKColorspace, /* negared linear RGB with black separated */
74  sRGBColorspace, /* Default: non-lienar sRGB colorspace */
83  CMYColorspace, /* negated linear RGB colorspace */
86  LCHColorspace, /* alias for LCHuv */
88  LCHabColorspace, /* Cylindrical (Polar) Lab */
89  LCHuvColorspace, /* Cylindrical (Polar) Luv */
92  HSVColorspace, /* alias for HSB */
95  LinearGRAYColorspace /* greyscale (linear) image (faked 1 channel) */
97  )
98 
99  STRINGIFY(
100  typedef enum
101  {
157  /* These are new operators, added after the above was last sorted.
158  * The list should be re-sorted only when a new library version is
159  * created.
160  */
175  )
176 
177  STRINGIFY(
178  typedef enum
179  {
185  } MagickFunction;
186  )
187 
188  STRINGIFY(
189  typedef enum
190  {
192  UniformNoise,
195  ImpulseNoise,
197  PoissonNoise,
199  } NoiseType;
200  )
201 
202  STRINGIFY(
203  typedef enum
204  {
216  )
217 
218  STRINGIFY(
219  typedef enum {
237  )
238 
239  STRINGIFY(
240  typedef enum
241  {
243  RedChannel = 0x0001,
244  GrayChannel = 0x0001,
245  CyanChannel = 0x0001,
246  GreenChannel = 0x0002,
247  MagentaChannel = 0x0002,
248  BlueChannel = 0x0004,
249  YellowChannel = 0x0004,
250  AlphaChannel = 0x0008,
251  OpacityChannel = 0x0008,
252  MatteChannel = 0x0008, /* deprecated */
253  BlackChannel = 0x0020,
254  IndexChannel = 0x0020,
255  CompositeChannels = 0x002F,
256  AllChannels = 0x7ffffff,
257  /*
258  Special purpose channel types.
259  */
260  TrueAlphaChannel = 0x0040, /* extract actual alpha channel from opacity */
261  RGBChannels = 0x0080, /* set alpha from grayscale mask in RGB */
262  GrayChannels = 0x0080,
263  SyncChannels = 0x0100, /* channels should be modified equally */
265  } ChannelType;
266  )
267 
268 /*
269  Helper functions.
270 */
271 
272 OPENCL_IF((MAGICKCORE_QUANTUM_DEPTH == 8))
273 
274  STRINGIFY(
275  static inline CLQuantum ScaleCharToQuantum(const unsigned char value)
276  {
277  return((CLQuantum) value);
278  }
279  )
280 
281 OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 16))
282 
283  STRINGIFY(
284  static inline CLQuantum ScaleCharToQuantum(const unsigned char value)
285  {
286  return((CLQuantum) (257.0f*value));
287  }
288  )
289 
290 OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 32))
291 
292  STRINGIFY(
293  static inline CLQuantum ScaleCharToQuantum(const unsigned char value)
294  {
295  return((CLQuantum) (16843009.0*value));
296  }
297  )
298 
299 OPENCL_ENDIF()
300 
301 OPENCL_IF((MAGICKCORE_HDRI_SUPPORT == 1))
302 
303  STRINGIFY(
304  static inline CLQuantum ClampToQuantum(const float value)
305  {
306  return (CLQuantum) value;
307  }
308  )
309 
310 OPENCL_ELSE()
311 
312  STRINGIFY(
313  static inline CLQuantum ClampToQuantum(const float value)
314  {
315  return (CLQuantum) (clamp(value, 0.0f, QuantumRange) + 0.5f);
316  }
317  )
318 
319 OPENCL_ENDIF()
320 
321  STRINGIFY(
322  static inline int ClampToCanvas(const int offset, const int range)
323  {
324  return clamp(offset, (int)0, range - 1);
325  }
326  )
327 
328  STRINGIFY(
329  static inline int ClampToCanvasWithHalo(const int offset, const int range, const int edge, const int section)
330  {
331  return clamp(offset, section ? (int)(0 - edge) : (int)0, section ? (range - 1) : (range - 1 + edge));
332  }
333  )
334 
335  STRINGIFY(
336  static inline uint ScaleQuantumToMap(CLQuantum value)
337  {
338  if (value >= (CLQuantum)MaxMap)
339  return ((uint)MaxMap);
340  else
341  return ((uint)value);
342  }
343  )
344 
345  STRINGIFY(
346  static inline float PerceptibleReciprocal(const float x)
347  {
348  float sign = x < (float) 0.0 ? (float)-1.0 : (float) 1.0;
349  return((sign*x) >= MagickEpsilon ? (float) 1.0 / x : sign*((float) 1.0 / MagickEpsilon));
350  }
351  )
352 
353  STRINGIFY(
354  static inline float RoundToUnity(const float value)
355  {
356  return clamp(value, 0.0f, 1.0f);
357  }
358  )
359 
360  STRINGIFY(
361 
362  static inline CLQuantum getBlue(CLPixelType p) { return p.x; }
363  static inline void setBlue(CLPixelType* p, CLQuantum value) { (*p).x = value; }
364  static inline float getBlueF4(float4 p) { return p.x; }
365  static inline void setBlueF4(float4* p, float value) { (*p).x = value; }
366 
367  static inline CLQuantum getGreen(CLPixelType p) { return p.y; }
368  static inline void setGreen(CLPixelType* p, CLQuantum value) { (*p).y = value; }
369  static inline float getGreenF4(float4 p) { return p.y; }
370  static inline void setGreenF4(float4* p, float value) { (*p).y = value; }
371 
372  static inline CLQuantum getRed(CLPixelType p) { return p.z; }
373  static inline void setRed(CLPixelType* p, CLQuantum value) { (*p).z = value; }
374  static inline float getRedF4(float4 p) { return p.z; }
375  static inline void setRedF4(float4* p, float value) { (*p).z = value; }
376 
377  static inline CLQuantum getOpacity(CLPixelType p) { return p.w; }
378  static inline void setOpacity(CLPixelType* p, CLQuantum value) { (*p).w = value; }
379  static inline float getOpacityF4(float4 p) { return p.w; }
380  static inline void setOpacityF4(float4* p, float value) { (*p).w = value; }
381 
382  static inline void setGray(CLPixelType* p, CLQuantum value) { (*p).z = value; (*p).y = value; (*p).x = value; }
383 
384  static inline float GetPixelIntensity(const int method, const int colorspace, CLPixelType p)
385  {
386  float red = getRed(p);
387  float green = getGreen(p);
388  float blue = getBlue(p);
389 
390  float intensity;
391 
392  if (colorspace == GRAYColorspace)
393  return red;
394 
395  switch (method)
396  {
398  {
399  intensity = (red + green + blue) / 3.0;
400  break;
401  }
403  {
404  intensity = MagickMax(MagickMax(red, green), blue);
405  break;
406  }
408  {
409  intensity = (MagickMin(MagickMin(red, green), blue) +
410  MagickMax(MagickMax(red, green), blue)) / 2.0;
411  break;
412  }
414  {
415  intensity = (float)(((float)red*red + green*green + blue*blue) /
416  (3.0*QuantumRange));
417  break;
418  }
420  {
421  /*
422  if (image->colorspace == RGBColorspace)
423  {
424  red=EncodePixelGamma(red);
425  green=EncodePixelGamma(green);
426  blue=EncodePixelGamma(blue);
427  }
428  */
429  intensity = 0.298839*red + 0.586811*green + 0.114350*blue;
430  break;
431  }
433  {
434  /*
435  if (image->colorspace == sRGBColorspace)
436  {
437  red=DecodePixelGamma(red);
438  green=DecodePixelGamma(green);
439  blue=DecodePixelGamma(blue);
440  }
441  */
442  intensity = 0.298839*red + 0.586811*green + 0.114350*blue;
443  break;
444  }
446  default:
447  {
448  /*
449  if (image->colorspace == RGBColorspace)
450  {
451  red=EncodePixelGamma(red);
452  green=EncodePixelGamma(green);
453  blue=EncodePixelGamma(blue);
454  }
455  */
456  intensity = 0.212656*red + 0.715158*green + 0.072186*blue;
457  break;
458  }
460  {
461  /*
462  if (image->colorspace == sRGBColorspace)
463  {
464  red=DecodePixelGamma(red);
465  green=DecodePixelGamma(green);
466  blue=DecodePixelGamma(blue);
467  }
468  */
469  intensity = 0.212656*red + 0.715158*green + 0.072186*blue;
470  break;
471  }
473  {
474  intensity = (float)(sqrt((float)red*red + green*green + blue*blue) /
475  sqrt(3.0));
476  break;
477  }
478  }
479 
480  return intensity;
481 
482  }
483  )
484 
485 /*
486 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
487 % %
488 % %
489 % %
490 % A d d N o i s e %
491 % %
492 % %
493 % %
494 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
495 */
496 
497 STRINGIFY(
498 
499 /*
500 Part of MWC64X by David Thomas, dt10@imperial.ac.uk
501 This is provided under BSD, full license is with the main package.
502 See http://www.doc.ic.ac.uk/~dt10/research
503 */
504 
505 // Pre: a<M, b<M
506 // Post: r=(a+b) mod M
507 ulong MWC_AddMod64(ulong a, ulong b, ulong M)
508 {
509  ulong v=a+b;
510  //if( (v>=M) || (v<a) )
511  if( (v>=M) || (convert_float(v) < convert_float(a)) ) // workaround for what appears to be an optimizer bug.
512  v=v-M;
513  return v;
514 }
515 
516 // Pre: a<M,b<M
517 // Post: r=(a*b) mod M
518 // This could be done more efficently, but it is portable, and should
519 // be easy to understand. It can be replaced with any of the better
520 // modular multiplication algorithms (for example if you know you have
521 // double precision available or something).
522 ulong MWC_MulMod64(ulong a, ulong b, ulong M)
523 {
524  ulong r=0;
525  while(a!=0){
526  if(a&1)
527  r=MWC_AddMod64(r,b,M);
528  b=MWC_AddMod64(b,b,M);
529  a=a>>1;
530  }
531  return r;
532 }
533 
534 
535 // Pre: a<M, e>=0
536 // Post: r=(a^b) mod M
537 // This takes at most ~64^2 modular additions, so probably about 2^15 or so instructions on
538 // most architectures
539 ulong MWC_PowMod64(ulong a, ulong e, ulong M)
540 {
541  ulong sqr=a, acc=1;
542  while(e!=0){
543  if(e&1)
544  acc=MWC_MulMod64(acc,sqr,M);
545  sqr=MWC_MulMod64(sqr,sqr,M);
546  e=e>>1;
547  }
548  return acc;
549 }
550 
551 uint2 MWC_SkipImpl_Mod64(uint2 curr, ulong A, ulong M, ulong distance)
552 {
553  ulong m=MWC_PowMod64(A, distance, M);
554  ulong x=curr.x*(ulong)A+curr.y;
555  x=MWC_MulMod64(x, m, M);
556  return (uint2)((uint)(x/A), (uint)(x%A));
557 }
558 
559 uint2 MWC_SeedImpl_Mod64(ulong A, ulong M, uint vecSize, uint vecOffset, ulong streamBase, ulong streamGap)
560 {
561  // This is an arbitrary constant for starting LCG jumping from. I didn't
562  // want to start from 1, as then you end up with the two or three first values
563  // being a bit poor in ones - once you've decided that, one constant is as
564  // good as any another. There is no deep mathematical reason for it, I just
565  // generated a random number.
566  enum{ MWC_BASEID = 4077358422479273989UL };
567 
568  ulong dist=streamBase + (get_global_id(0)*vecSize+vecOffset)*streamGap;
569  ulong m=MWC_PowMod64(A, dist, M);
570 
571  ulong x=MWC_MulMod64(MWC_BASEID, m, M);
572  return (uint2)((uint)(x/A), (uint)(x%A));
573 }
574 
576 typedef struct{ uint x; uint c; uint seed0; ulong seed1; } mwc64x_state_t;
577 
578 void MWC64X_Step(mwc64x_state_t *s)
579 {
580  uint X=s->x, C=s->c;
581 
582  uint Xn=s->seed0*X+C;
583  uint carry=(uint)(Xn<C); // The (Xn<C) will be zero or one for scalar
584  uint Cn=mad_hi(s->seed0,X,carry);
585 
586  s->x=Xn;
587  s->c=Cn;
588 }
589 
590 void MWC64X_Skip(mwc64x_state_t *s, ulong distance)
591 {
592  uint2 tmp=MWC_SkipImpl_Mod64((uint2)(s->x,s->c), s->seed0, s->seed1, distance);
593  s->x=tmp.x;
594  s->c=tmp.y;
595 }
596 
597 void MWC64X_SeedStreams(mwc64x_state_t *s, ulong baseOffset, ulong perStreamOffset)
598 {
599  uint2 tmp=MWC_SeedImpl_Mod64(s->seed0, s->seed1, 1, 0, baseOffset, perStreamOffset);
600  s->x=tmp.x;
601  s->c=tmp.y;
602 }
603 
605 uint MWC64X_NextUint(mwc64x_state_t *s)
606 {
607  uint res=s->x ^ s->c;
608  MWC64X_Step(s);
609  return res;
610 }
611 
612 //
613 // End of MWC64X excerpt
614 //
615 
616  float mwcReadPseudoRandomValue(mwc64x_state_t* rng) {
617  return (1.0f * MWC64X_NextUint(rng)) / (float)(0xffffffff); // normalized to 1.0
618  }
619 
620 
621  float mwcGenerateDifferentialNoise(mwc64x_state_t* r, CLQuantum pixel, NoiseType noise_type, float attenuate) {
622 
623  float
624  alpha,
625  beta,
626  noise,
627  sigma;
628 
629  noise = 0.0f;
630  alpha=mwcReadPseudoRandomValue(r);
631  switch(noise_type) {
632  case UniformNoise:
633  default:
634  {
635  noise=(pixel+QuantumRange*SigmaUniform*(alpha-0.5f));
636  break;
637  }
638  case GaussianNoise:
639  {
640  float
641  gamma,
642  tau;
643 
644  if (alpha == 0.0f)
645  alpha=1.0f;
646  beta=mwcReadPseudoRandomValue(r);
647  gamma=sqrt(-2.0f*log(alpha));
648  sigma=gamma*cospi((2.0f*beta));
649  tau=gamma*sinpi((2.0f*beta));
650  noise=(float)(pixel+sqrt((float) pixel)*SigmaGaussian*sigma+
652  break;
653  }
654 
655 
656  case ImpulseNoise:
657  {
658  if (alpha < (SigmaImpulse/2.0f))
659  noise=0.0f;
660  else
661  if (alpha >= (1.0f-(SigmaImpulse/2.0f)))
662  noise=(float)QuantumRange;
663  else
664  noise=(float)pixel;
665  break;
666  }
667  case LaplacianNoise:
668  {
669  if (alpha <= 0.5f)
670  {
671  if (alpha <= MagickEpsilon)
672  noise=(float) (pixel-QuantumRange);
673  else
674  noise=(float) (pixel+QuantumRange*SigmaLaplacian*log(2.0f*alpha)+
675  0.5f);
676  break;
677  }
678  beta=1.0f-alpha;
679  if (beta <= (0.5f*MagickEpsilon))
680  noise=(float) (pixel+QuantumRange);
681  else
682  noise=(float) (pixel-QuantumRange*SigmaLaplacian*log(2.0f*beta)+0.5f);
683  break;
684  }
686  {
687  sigma=1.0f;
688  if (alpha > MagickEpsilon)
689  sigma=sqrt(-2.0f*log(alpha));
690  beta=mwcReadPseudoRandomValue(r);
691  noise=(float) (pixel+pixel*SigmaMultiplicativeGaussian*sigma*
692  cospi((float) (2.0f*beta))/2.0f);
693  break;
694  }
695  case PoissonNoise:
696  {
697  float
698  poisson;
699  unsigned int i;
700  poisson=exp(-SigmaPoisson*QuantumScale*pixel);
701  for (i=0; alpha > poisson; i++)
702  {
703  beta=mwcReadPseudoRandomValue(r);
704  alpha*=beta;
705  }
707  break;
708  }
709  case RandomNoise:
710  {
711  noise=(float) (QuantumRange*SigmaRandom*alpha);
712  break;
713  }
714 
715  };
716  return noise;
717  }
718 
719  __kernel
720  void AddNoise(const __global CLPixelType* inputImage, __global CLPixelType* filteredImage
721  ,const unsigned int inputPixelCount, const unsigned int pixelsPerWorkItem
722  ,const ChannelType channel
723  ,const NoiseType noise_type, const float attenuate
724  ,const unsigned int seed0, const unsigned int seed1
725  ,const unsigned int numRandomNumbersPerPixel) {
726 
727  mwc64x_state_t rng;
728  rng.seed0 = seed0;
729  rng.seed1 = seed1;
730 
731  uint span = pixelsPerWorkItem * numRandomNumbersPerPixel; // length of RNG substream each workitem will use
732  uint offset = span * get_local_size(0) * get_group_id(0); // offset of this workgroup's RNG substream (in master stream);
733 
734  MWC64X_SeedStreams(&rng, offset, span); // Seed the RNG streams
735 
736  uint pos = get_local_size(0) * get_group_id(0) * pixelsPerWorkItem + get_local_id(0); // pixel to process
737 
738  uint count = pixelsPerWorkItem;
739 
740  while (count > 0) {
741  if (pos < inputPixelCount) {
742  CLPixelType p = inputImage[pos];
743 
744  if ((channel&RedChannel)!=0) {
745  setRed(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getRed(p),noise_type,attenuate)));
746  }
747 
748  if ((channel&GreenChannel)!=0) {
749  setGreen(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getGreen(p),noise_type,attenuate)));
750  }
751 
752  if ((channel&BlueChannel)!=0) {
753  setBlue(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getBlue(p),noise_type,attenuate)));
754  }
755 
756  if ((channel & OpacityChannel) != 0) {
757  setOpacity(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getOpacity(p),noise_type,attenuate)));
758  }
759 
760  filteredImage[pos] = p;
761  //filteredImage[pos] = (CLPixelType)(MWC64X_NextUint(&rng) % 256, MWC64X_NextUint(&rng) % 256, MWC64X_NextUint(&rng) % 256, 255);
762  }
763  pos += get_local_size(0);
764  --count;
765  }
766  }
767  )
768 
769 /*
770 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
771 % %
772 % %
773 % %
774 % B l u r %
775 % %
776 % %
777 % %
778 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
779 */
780 
781  STRINGIFY(
782  /*
783  Reduce image noise and reduce detail levels by row
784  im: input pixels filtered_in filtered_im: output pixels
785  filter : convolve kernel width: convolve kernel size
786  channel : define which channel is blured
787  is_RGBA_BGRA : define the input is RGBA or BGRA
788  */
789  __kernel void BlurRow(__global CLPixelType *im, __global float4 *filtered_im,
790  const ChannelType channel, __constant float *filter,
791  const unsigned int width,
792  const unsigned int imageColumns, const unsigned int imageRows,
793  __local CLPixelType *temp)
794  {
795  const int x = get_global_id(0);
796  const int y = get_global_id(1);
797 
798  const int columns = imageColumns;
799 
800  const unsigned int radius = (width-1)/2;
801  const int wsize = get_local_size(0);
802  const unsigned int loadSize = wsize+width;
803 
804  //load chunk only for now
805  //event_t e = async_work_group_copy(temp+radius, im+x+y*columns, wsize, 0);
806  //wait_group_events(1,&e);
807 
808  //parallel load and clamp
809  /*
810  int count = 0;
811  for (int i=0; i < loadSize; i=i+wsize)
812  {
813  int currentX = x + wsize*(count++);
814 
815  int localId = get_local_id(0);
816 
817  if ((localId+i) > loadSize)
818  break;
819 
820  temp[localId+i] = im[y*columns+ClampToCanvas(currentX-radius, columns)];
821 
822  if (y==0 && get_group_id(0) == 0)
823  {
824  printf("(%d %d) temp %d load %d currentX %d\n", x, y, localId+i, ClampToCanvas(currentX-radius, columns), currentX);
825  }
826  }
827  */
828 
829  //group coordinate
830  const int groupX=get_local_size(0)*get_group_id(0);
831  const int groupY=get_local_size(1)*get_group_id(1);
832 
833  //parallel load and clamp
834  for (int i=get_local_id(0); i < loadSize; i=i+get_local_size(0))
835  {
836  //int cx = ClampToCanvas(groupX+i, columns);
837  temp[i] = im[y * columns + ClampToCanvas(i+groupX-radius, columns)];
838 
839  /*if (0 && y==0 && get_group_id(1) == 0)
840  {
841  printf("(%d %d) temp %d load %d groupX %d\n", x, y, i, ClampToCanvas(groupX+i, columns), groupX);
842  }*/
843  }
844 
845  // barrier
846  barrier(CLK_LOCAL_MEM_FENCE);
847 
848  // only do the work if this is not a patched item
849  if (get_global_id(0) < columns)
850  {
851  // compute
852  float4 result = (float4) 0;
853 
854  int i = 0;
855 
856  \n #ifndef UFACTOR \n
857  \n #define UFACTOR 8 \n
858  \n #endif \n
859 
860  for ( ; i+UFACTOR < width; )
861  {
862  \n #pragma unroll UFACTOR\n
863  for (int j=0; j < UFACTOR; j++, i++)
864  {
865  result+=filter[i]*convert_float4(temp[i+get_local_id(0)]);
866  }
867  }
868 
869  for ( ; i < width; i++)
870  {
871  result+=filter[i]*convert_float4(temp[i+get_local_id(0)]);
872  }
873 
874  result.x = ClampToQuantum(result.x);
875  result.y = ClampToQuantum(result.y);
876  result.z = ClampToQuantum(result.z);
877  result.w = ClampToQuantum(result.w);
878 
879  // write back to global
880  filtered_im[y*columns+x] = result;
881  }
882  }
883  )
884 
885  STRINGIFY(
886  /*
887  Reduce image noise and reduce detail levels by line
888  im: input pixels filtered_in filtered_im: output pixels
889  filter : convolve kernel width: convolve kernel size
890  channel : define which channel is blured\
891  is_RGBA_BGRA : define the input is RGBA or BGRA
892  */
893  __kernel void BlurColumn(const __global float4 *blurRowData, __global CLPixelType *filtered_im,
894  const ChannelType channel, __constant float *filter,
895  const unsigned int width,
896  const unsigned int imageColumns, const unsigned int imageRows,
897  __local float4 *temp)
898  {
899  const int x = get_global_id(0);
900  const int y = get_global_id(1);
901 
902  //const int columns = get_global_size(0);
903  //const int rows = get_global_size(1);
904  const int columns = imageColumns;
905  const int rows = imageRows;
906 
907  unsigned int radius = (width-1)/2;
908  const int wsize = get_local_size(1);
909  const unsigned int loadSize = wsize+width;
910 
911  //group coordinate
912  const int groupX=get_local_size(0)*get_group_id(0);
913  const int groupY=get_local_size(1)*get_group_id(1);
914  //notice that get_local_size(0) is 1, so
915  //groupX=get_group_id(0);
916 
917  //parallel load and clamp
918  for (int i = get_local_id(1); i < loadSize; i=i+get_local_size(1))
919  {
920  temp[i] = blurRowData[ClampToCanvas(i+groupY-radius, rows) * columns + groupX];
921  }
922 
923  // barrier
924  barrier(CLK_LOCAL_MEM_FENCE);
925 
926  // only do the work if this is not a patched item
927  if (get_global_id(1) < rows)
928  {
929  // compute
930  float4 result = (float4) 0;
931 
932  int i = 0;
933 
934  \n #ifndef UFACTOR \n
935  \n #define UFACTOR 8 \n
936  \n #endif \n
937 
938  for ( ; i+UFACTOR < width; )
939  {
940  \n #pragma unroll UFACTOR \n
941  for (int j=0; j < UFACTOR; j++, i++)
942  {
943  result+=filter[i]*temp[i+get_local_id(1)];
944  }
945  }
946 
947  for ( ; i < width; i++)
948  {
949  result+=filter[i]*temp[i+get_local_id(1)];
950  }
951 
952  result.x = ClampToQuantum(result.x);
953  result.y = ClampToQuantum(result.y);
954  result.z = ClampToQuantum(result.z);
955  result.w = ClampToQuantum(result.w);
956 
957  // write back to global
958  filtered_im[y*columns+x] = (CLPixelType) (result.x,result.y,result.z,result.w);
959  }
960 
961  }
962  )
963 
964 /*
965 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
966 % %
967 % %
968 % %
969 % C o m p o s i t e %
970 % %
971 % %
972 % %
973 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
974 */
975 
976  STRINGIFY(
977  static inline float ColorDodge(const float Sca,
978  const float Sa,const float Dca,const float Da)
979  {
980  /*
981  Oct 2004 SVG specification.
982  */
983  if ((Sca*Da+Dca*Sa) >= Sa*Da)
984  return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa));
985  return(Dca*Sa*Sa/(Sa-Sca)+Sca*(1.0-Da)+Dca*(1.0-Sa));
986 
987 
988  /*
989  New specification, March 2009 SVG specification. This specification was
990  also wrong of non-overlap cases.
991  */
992  /*
993  if ((fabs(Sca-Sa) < MagickEpsilon) && (fabs(Dca) < MagickEpsilon))
994  return(Sca*(1.0-Da));
995  if (fabs(Sca-Sa) < MagickEpsilon)
996  return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa));
997  return(Sa*MagickMin(Da,Dca*Sa/(Sa-Sca)));
998  */
999 
1000  /*
1001  Working from first principles using the original formula:
1002 
1003  f(Sc,Dc) = Dc/(1-Sc)
1004 
1005  This works correctly! Looks like the 2004 model was right but just
1006  required a extra condition for correct handling.
1007  */
1008 
1009  /*
1010  if ((fabs(Sca-Sa) < MagickEpsilon) && (fabs(Dca) < MagickEpsilon))
1011  return(Sca*(1.0-Da)+Dca*(1.0-Sa));
1012  if (fabs(Sca-Sa) < MagickEpsilon)
1013  return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa));
1014  return(Dca*Sa*Sa/(Sa-Sca)+Sca*(1.0-Da)+Dca*(1.0-Sa));
1015  */
1016  }
1017 
1018  static inline void CompositeColorDodge(const float4 *p,
1019  const float4 *q,float4 *composite) {
1020 
1021  float
1022  Da,
1023  gamma,
1024  Sa;
1025 
1026  Sa=1.0f-QuantumScale*getOpacityF4(*p); /* simplify and speed up equations */
1027  Da=1.0f-QuantumScale*getOpacityF4(*q);
1028  gamma=RoundToUnity(Sa+Da-Sa*Da); /* over blend, as per SVG doc */
1029  setOpacityF4(composite, QuantumRange*(1.0-gamma));
1030  gamma=QuantumRange/(fabs(gamma) < MagickEpsilon ? MagickEpsilon : gamma);
1031  setRedF4(composite,gamma*ColorDodge(QuantumScale*getRedF4(*p)*Sa,Sa,QuantumScale*
1032  getRedF4(*q)*Da,Da));
1033  setGreenF4(composite,gamma*ColorDodge(QuantumScale*getGreenF4(*p)*Sa,Sa,QuantumScale*
1034  getGreenF4(*q)*Da,Da));
1035  setBlueF4(composite,gamma*ColorDodge(QuantumScale*getBlueF4(*p)*Sa,Sa,QuantumScale*
1036  getBlueF4(*q)*Da,Da));
1037  }
1038  )
1039 
1040  STRINGIFY(
1041  static inline void MagickPixelCompositePlus(const float4 *p,
1042  const float alpha,const float4 *q,
1043  const float beta,float4 *composite)
1044  {
1045  float
1046  gamma;
1047 
1048  float
1049  Da,
1050  Sa;
1051  /*
1052  Add two pixels with the given opacities.
1053  */
1054  Sa=1.0-QuantumScale*alpha;
1055  Da=1.0-QuantumScale*beta;
1056  gamma=RoundToUnity(Sa+Da); /* 'Plus' blending -- not 'Over' blending */
1057  setOpacityF4(composite,(float) QuantumRange*(1.0-gamma));
1058  gamma=PerceptibleReciprocal(gamma);
1059  setRedF4(composite,gamma*(Sa*getRedF4(*p)+Da*getRedF4(*q)));
1060  setGreenF4(composite,gamma*(Sa*getGreenF4(*p)+Da*getGreenF4(*q)));
1061  setBlueF4(composite,gamma*(Sa*getBlueF4(*p)+Da*getBlueF4(*q)));
1062  }
1063  )
1064 
1065  STRINGIFY(
1066  static inline void MagickPixelCompositeBlend(const float4 *p,
1067  const float alpha,const float4 *q,
1068  const float beta,float4 *composite)
1069  {
1070  MagickPixelCompositePlus(p,(float) (QuantumRange-alpha*
1071  (QuantumRange-getOpacityF4(*p))),q,(float) (QuantumRange-beta*
1072  (QuantumRange-getOpacityF4(*q))),composite);
1073  }
1074  )
1075 
1076  STRINGIFY(
1077  __kernel
1078  void Composite(__global CLPixelType *image,
1079  const unsigned int imageWidth,
1080  const unsigned int imageHeight,
1081  const unsigned int imageMatte,
1082  const __global CLPixelType *compositeImage,
1083  const unsigned int compositeWidth,
1084  const unsigned int compositeHeight,
1085  const unsigned int compositeMatte,
1086  const unsigned int compose,
1087  const ChannelType channel,
1088  const float destination_dissolve,
1089  const float source_dissolve) {
1090 
1091  uint2 index;
1092  index.x = get_global_id(0);
1093  index.y = get_global_id(1);
1094 
1095 
1096  if (index.x >= imageWidth
1097  || index.y >= imageHeight) {
1098  return;
1099  }
1100  const CLPixelType inputPixel = image[index.y*imageWidth+index.x];
1101  float4 destination;
1102  setRedF4(&destination,getRed(inputPixel));
1103  setGreenF4(&destination,getGreen(inputPixel));
1104  setBlueF4(&destination,getBlue(inputPixel));
1105 
1106 
1107  const CLPixelType compositePixel
1108  = compositeImage[index.y*imageWidth+index.x];
1109  float4 source;
1110  setRedF4(&source,getRed(compositePixel));
1111  setGreenF4(&source,getGreen(compositePixel));
1112  setBlueF4(&source,getBlue(compositePixel));
1113 
1114  if (imageMatte != 0) {
1115  setOpacityF4(&destination,getOpacity(inputPixel));
1116  }
1117  else {
1118  setOpacityF4(&destination,0.0f);
1119  }
1120 
1121  if (compositeMatte != 0) {
1122  setOpacityF4(&source,getOpacity(compositePixel));
1123  }
1124  else {
1125  setOpacityF4(&source,0.0f);
1126  }
1127 
1128  float4 composite=destination;
1129 
1130  CompositeOperator op = (CompositeOperator)compose;
1131  switch (op) {
1132  case ColorDodgeCompositeOp:
1133  CompositeColorDodge(&source,&destination,&composite);
1134  break;
1135  case BlendCompositeOp:
1136  MagickPixelCompositeBlend(&source,source_dissolve,&destination,
1137  destination_dissolve,&composite);
1138  break;
1139  default:
1140  // unsupported operators
1141  break;
1142  };
1143 
1144  CLPixelType outputPixel;
1145  setRed(&outputPixel, ClampToQuantum(getRedF4(composite)));
1146  setGreen(&outputPixel, ClampToQuantum(getGreenF4(composite)));
1147  setBlue(&outputPixel, ClampToQuantum(getBlueF4(composite)));
1148  setOpacity(&outputPixel, ClampToQuantum(getOpacityF4(composite)));
1149  image[index.y*imageWidth+index.x] = outputPixel;
1150  }
1151  )
1152 
1153 /*
1154 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1155 % %
1156 % %
1157 % %
1158 % C o n t r a s t %
1159 % %
1160 % %
1161 % %
1162 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1163 */
1164 
1165  STRINGIFY(
1166 
1167  static inline float3 ConvertRGBToHSB(CLPixelType pixel) {
1168  float3 HueSaturationBrightness;
1169  HueSaturationBrightness.x = 0.0f; // Hue
1170  HueSaturationBrightness.y = 0.0f; // Saturation
1171  HueSaturationBrightness.z = 0.0f; // Brightness
1172 
1173  float r=(float) getRed(pixel);
1174  float g=(float) getGreen(pixel);
1175  float b=(float) getBlue(pixel);
1176 
1177  float tmin=MagickMin(MagickMin(r,g),b);
1178  float tmax= MagickMax(MagickMax(r,g),b);
1179 
1180  if (tmax!=0.0f) {
1181  float delta=tmax-tmin;
1182  HueSaturationBrightness.y=delta/tmax;
1183  HueSaturationBrightness.z=QuantumScale*tmax;
1184 
1185  if (delta != 0.0f) {
1186  HueSaturationBrightness.x = ((r == tmax)?0.0f:((g == tmax)?2.0f:4.0f));
1187  HueSaturationBrightness.x += ((r == tmax)?(g-b):((g == tmax)?(b-r):(r-g)))/delta;
1188  HueSaturationBrightness.x/=6.0f;
1189  HueSaturationBrightness.x += (HueSaturationBrightness.x < 0.0f)?0.0f:1.0f;
1190  }
1191  }
1192  return HueSaturationBrightness;
1193  }
1194 
1195  static inline CLPixelType ConvertHSBToRGB(float3 HueSaturationBrightness) {
1196 
1197  float hue = HueSaturationBrightness.x;
1198  float brightness = HueSaturationBrightness.z;
1199  float saturation = HueSaturationBrightness.y;
1200 
1201  CLPixelType rgb;
1202 
1203  if (saturation == 0.0f) {
1204  setRed(&rgb,ClampToQuantum(QuantumRange*brightness));
1205  setGreen(&rgb,getRed(rgb));
1206  setBlue(&rgb,getRed(rgb));
1207  }
1208  else {
1209 
1210  float h=6.0f*(hue-floor(hue));
1211  float f=h-floor(h);
1212  float p=brightness*(1.0f-saturation);
1213  float q=brightness*(1.0f-saturation*f);
1214  float t=brightness*(1.0f-(saturation*(1.0f-f)));
1215 
1216  float clampedBrightness = ClampToQuantum(QuantumRange*brightness);
1217  float clamped_t = ClampToQuantum(QuantumRange*t);
1218  float clamped_p = ClampToQuantum(QuantumRange*p);
1219  float clamped_q = ClampToQuantum(QuantumRange*q);
1220  int ih = (int)h;
1221  setRed(&rgb, (ih == 1)?clamped_q:
1222  (ih == 2 || ih == 3)?clamped_p:
1223  (ih == 4)?clamped_t:
1224  clampedBrightness);
1225 
1226  setGreen(&rgb, (ih == 1 || ih == 2)?clampedBrightness:
1227  (ih == 3)?clamped_q:
1228  (ih == 4 || ih == 5)?clamped_p:
1229  clamped_t);
1230 
1231  setBlue(&rgb, (ih == 2)?clamped_t:
1232  (ih == 3 || ih == 4)?clampedBrightness:
1233  (ih == 5)?clamped_q:
1234  clamped_p);
1235  }
1236  return rgb;
1237  }
1238 
1239  __kernel void Contrast(__global CLPixelType *im, const unsigned int sharpen)
1240  {
1241 
1242  const int sign = sharpen!=0?1:-1;
1243  const int x = get_global_id(0);
1244  const int y = get_global_id(1);
1245  const int columns = get_global_size(0);
1246  const int c = x + y * columns;
1247 
1248  CLPixelType pixel = im[c];
1249  float3 HueSaturationBrightness = ConvertRGBToHSB(pixel);
1250  float brightness = HueSaturationBrightness.z;
1251  brightness+=0.5f*sign*(0.5f*(sinpi(brightness-0.5f)+1.0f)-brightness);
1252  brightness = clamp(brightness,0.0f,1.0f);
1253  HueSaturationBrightness.z = brightness;
1254 
1255  CLPixelType filteredPixel = ConvertHSBToRGB(HueSaturationBrightness);
1256  filteredPixel.w = pixel.w;
1257  im[c] = filteredPixel;
1258  }
1259  )
1260 
1261 /*
1262 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1263 % %
1264 % %
1265 % %
1266 % C o n t r a s t S t r e t c h %
1267 % %
1268 % %
1269 % %
1270 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1271 */
1272 
1273  STRINGIFY(
1274  /*
1275  */
1276  __kernel void Histogram(__global CLPixelType * restrict im,
1277  const ChannelType channel,
1278  const int method,
1279  const int colorspace,
1280  __global uint4 * restrict histogram)
1281  {
1282  const int x = get_global_id(0);
1283  const int y = get_global_id(1);
1284  const int columns = get_global_size(0);
1285  const int c = x + y * columns;
1286  if ((channel & SyncChannels) != 0)
1287  {
1288  float intensity = GetPixelIntensity(method, colorspace,im[c]);
1289  uint pos = ScaleQuantumToMap(ClampToQuantum(intensity));
1290  atomic_inc((__global uint *)(&(histogram[pos]))+2); //red position
1291  }
1292  else
1293  {
1294  // for equalizing, we always need all channels?
1295  // otherwise something more
1296  }
1297  }
1298  )
1299 
1300  STRINGIFY(
1301  /*
1302  */
1303  __kernel void ContrastStretch(__global CLPixelType * restrict im,
1304  const ChannelType channel,
1305  __global CLPixelType * restrict stretch_map,
1306  const float4 white, const float4 black)
1307  {
1308  const int x = get_global_id(0);
1309  const int y = get_global_id(1);
1310  const int columns = get_global_size(0);
1311  const int c = x + y * columns;
1312 
1313  uint ePos;
1314  CLPixelType oValue, eValue;
1315  CLQuantum red, green, blue, opacity;
1316 
1317  //read from global
1318  oValue=im[c];
1319 
1320  if ((channel & RedChannel) != 0)
1321  {
1322  if (getRedF4(white) != getRedF4(black))
1323  {
1324  ePos = ScaleQuantumToMap(getRed(oValue));
1325  eValue = stretch_map[ePos];
1326  red = getRed(eValue);
1327  }
1328  }
1329 
1330  if ((channel & GreenChannel) != 0)
1331  {
1332  if (getGreenF4(white) != getGreenF4(black))
1333  {
1334  ePos = ScaleQuantumToMap(getGreen(oValue));
1335  eValue = stretch_map[ePos];
1336  green = getGreen(eValue);
1337  }
1338  }
1339 
1340  if ((channel & BlueChannel) != 0)
1341  {
1342  if (getBlueF4(white) != getBlueF4(black))
1343  {
1344  ePos = ScaleQuantumToMap(getBlue(oValue));
1345  eValue = stretch_map[ePos];
1346  blue = getBlue(eValue);
1347  }
1348  }
1349 
1350  if ((channel & OpacityChannel) != 0)
1351  {
1352  if (getOpacityF4(white) != getOpacityF4(black))
1353  {
1354  ePos = ScaleQuantumToMap(getOpacity(oValue));
1355  eValue = stretch_map[ePos];
1356  opacity = getOpacity(eValue);
1357  }
1358  }
1359 
1360  //write back
1361  im[c]=(CLPixelType)(blue, green, red, opacity);
1362 
1363  }
1364  )
1365 
1366 /*
1367 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1368 % %
1369 % %
1370 % %
1371 % C o n v o l v e %
1372 % %
1373 % %
1374 % %
1375 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1376 */
1377 
1378  STRINGIFY(
1379  __kernel
1380  void ConvolveOptimized(const __global CLPixelType *input, __global CLPixelType *output,
1381  const unsigned int imageWidth, const unsigned int imageHeight,
1382  __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
1383  const uint matte, const ChannelType channel, __local CLPixelType *pixelLocalCache, __local float* filterCache) {
1384 
1385  int2 blockID;
1386  blockID.x = get_group_id(0);
1387  blockID.y = get_group_id(1);
1388 
1389  // image area processed by this workgroup
1390  int2 imageAreaOrg;
1391  imageAreaOrg.x = blockID.x * get_local_size(0);
1392  imageAreaOrg.y = blockID.y * get_local_size(1);
1393 
1394  int2 midFilterDimen;
1395  midFilterDimen.x = (filterWidth-1)/2;
1396  midFilterDimen.y = (filterHeight-1)/2;
1397 
1398  int2 cachedAreaOrg = imageAreaOrg - midFilterDimen;
1399 
1400  // dimension of the local cache
1401  int2 cachedAreaDimen;
1402  cachedAreaDimen.x = get_local_size(0) + filterWidth - 1;
1403  cachedAreaDimen.y = get_local_size(1) + filterHeight - 1;
1404 
1405  // cache the pixels accessed by this workgroup in local memory
1406  int localID = get_local_id(1)*get_local_size(0)+get_local_id(0);
1407  int cachedAreaNumPixels = cachedAreaDimen.x * cachedAreaDimen.y;
1408  int groupSize = get_local_size(0) * get_local_size(1);
1409  for (int i = localID; i < cachedAreaNumPixels; i+=groupSize) {
1410 
1411  int2 cachedAreaIndex;
1412  cachedAreaIndex.x = i % cachedAreaDimen.x;
1413  cachedAreaIndex.y = i / cachedAreaDimen.x;
1414 
1415  int2 imagePixelIndex;
1416  imagePixelIndex = cachedAreaOrg + cachedAreaIndex;
1417 
1418  // only support EdgeVirtualPixelMethod through ClampToCanvas
1419  // TODO: implement other virtual pixel method
1420  imagePixelIndex.x = ClampToCanvas(imagePixelIndex.x, imageWidth);
1421  imagePixelIndex.y = ClampToCanvas(imagePixelIndex.y, imageHeight);
1422 
1423  pixelLocalCache[i] = input[imagePixelIndex.y * imageWidth + imagePixelIndex.x];
1424  }
1425 
1426  // cache the filter
1427  for (int i = localID; i < filterHeight*filterWidth; i+=groupSize) {
1428  filterCache[i] = filter[i];
1429  }
1430  barrier(CLK_LOCAL_MEM_FENCE);
1431 
1432 
1433  int2 imageIndex;
1434  imageIndex.x = imageAreaOrg.x + get_local_id(0);
1435  imageIndex.y = imageAreaOrg.y + get_local_id(1);
1436 
1437  // if out-of-range, stops here and quit
1438  if (imageIndex.x >= imageWidth
1439  || imageIndex.y >= imageHeight) {
1440  return;
1441  }
1442 
1443  int filterIndex = 0;
1444  float4 sum = (float4)0.0f;
1445  float gamma = 0.0f;
1446  if (((channel & OpacityChannel) == 0) || (matte == 0)) {
1447  int cacheIndexY = get_local_id(1);
1448  for (int j = 0; j < filterHeight; j++) {
1449  int cacheIndexX = get_local_id(0);
1450  for (int i = 0; i < filterWidth; i++) {
1451  CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
1452  float f = filterCache[filterIndex];
1453 
1454  sum.x += f * p.x;
1455  sum.y += f * p.y;
1456  sum.z += f * p.z;
1457  sum.w += f * p.w;
1458 
1459  gamma += f;
1460  filterIndex++;
1461  cacheIndexX++;
1462  }
1463  cacheIndexY++;
1464  }
1465  }
1466  else {
1467  int cacheIndexY = get_local_id(1);
1468  for (int j = 0; j < filterHeight; j++) {
1469  int cacheIndexX = get_local_id(0);
1470  for (int i = 0; i < filterWidth; i++) {
1471 
1472  CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
1473  float alpha = QuantumScale*(QuantumRange-p.w);
1474  float f = filterCache[filterIndex];
1475  float g = alpha * f;
1476 
1477  sum.x += g*p.x;
1478  sum.y += g*p.y;
1479  sum.z += g*p.z;
1480  sum.w += f*p.w;
1481 
1482  gamma += g;
1483  filterIndex++;
1484  cacheIndexX++;
1485  }
1486  cacheIndexY++;
1487  }
1488  gamma = PerceptibleReciprocal(gamma);
1489  sum.xyz = gamma*sum.xyz;
1490  }
1491  CLPixelType outputPixel;
1492  outputPixel.x = ClampToQuantum(sum.x);
1493  outputPixel.y = ClampToQuantum(sum.y);
1494  outputPixel.z = ClampToQuantum(sum.z);
1495  outputPixel.w = ((channel & OpacityChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
1496 
1497  output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
1498  }
1499  )
1500 
1501  STRINGIFY(
1502  __kernel
1503  void Convolve(const __global CLPixelType *input, __global CLPixelType *output,
1504  const uint imageWidth, const uint imageHeight,
1505  __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
1506  const uint matte, const ChannelType channel) {
1507 
1508  int2 imageIndex;
1509  imageIndex.x = get_global_id(0);
1510  imageIndex.y = get_global_id(1);
1511 
1512  /*
1513  unsigned int imageWidth = get_global_size(0);
1514  unsigned int imageHeight = get_global_size(1);
1515  */
1516  if (imageIndex.x >= imageWidth
1517  || imageIndex.y >= imageHeight)
1518  return;
1519 
1520  int2 midFilterDimen;
1521  midFilterDimen.x = (filterWidth-1)/2;
1522  midFilterDimen.y = (filterHeight-1)/2;
1523 
1524  int filterIndex = 0;
1525  float4 sum = (float4)0.0f;
1526  float gamma = 0.0f;
1527  if (((channel & OpacityChannel) == 0) || (matte == 0)) {
1528  for (int j = 0; j < filterHeight; j++) {
1529  int2 inputPixelIndex;
1530  inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
1531  inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
1532  for (int i = 0; i < filterWidth; i++) {
1533  inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
1534  inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
1535 
1536  CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
1537  float f = filter[filterIndex];
1538 
1539  sum.x += f * p.x;
1540  sum.y += f * p.y;
1541  sum.z += f * p.z;
1542  sum.w += f * p.w;
1543 
1544  gamma += f;
1545 
1546  filterIndex++;
1547  }
1548  }
1549  }
1550  else {
1551 
1552  for (int j = 0; j < filterHeight; j++) {
1553  int2 inputPixelIndex;
1554  inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
1555  inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
1556  for (int i = 0; i < filterWidth; i++) {
1557  inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
1558  inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
1559 
1560  CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
1561  float alpha = QuantumScale*(QuantumRange-p.w);
1562  float f = filter[filterIndex];
1563  float g = alpha * f;
1564 
1565  sum.x += g*p.x;
1566  sum.y += g*p.y;
1567  sum.z += g*p.z;
1568  sum.w += f*p.w;
1569 
1570  gamma += g;
1571 
1572 
1573  filterIndex++;
1574  }
1575  }
1576  gamma = PerceptibleReciprocal(gamma);
1577  sum.xyz = gamma*sum.xyz;
1578  }
1579 
1580  CLPixelType outputPixel;
1581  outputPixel.x = ClampToQuantum(sum.x);
1582  outputPixel.y = ClampToQuantum(sum.y);
1583  outputPixel.z = ClampToQuantum(sum.z);
1584  outputPixel.w = ((channel & OpacityChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
1585 
1586  output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
1587  }
1588  )
1589 
1590 /*
1591 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1592 % %
1593 % %
1594 % %
1595 % D e s p e c k l e %
1596 % %
1597 % %
1598 % %
1599 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1600 */
1601 
1602  STRINGIFY(
1603 
1604  __kernel void HullPass1(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
1605  , const unsigned int imageWidth, const unsigned int imageHeight
1606  , const int2 offset, const int polarity, const int matte) {
1607 
1608  int x = get_global_id(0);
1609  int y = get_global_id(1);
1610 
1611  CLPixelType v = inputImage[y*imageWidth+x];
1612 
1613  int2 neighbor;
1614  neighbor.y = y + offset.y;
1615  neighbor.x = x + offset.x;
1616 
1617  int2 clampedNeighbor;
1618  clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1619  clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1620 
1621  CLPixelType r = (clampedNeighbor.x == neighbor.x
1622  && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1623  :(CLPixelType)0;
1624 
1625  int sv[4];
1626  sv[0] = (int)v.x;
1627  sv[1] = (int)v.y;
1628  sv[2] = (int)v.z;
1629  sv[3] = (int)v.w;
1630 
1631  int sr[4];
1632  sr[0] = (int)r.x;
1633  sr[1] = (int)r.y;
1634  sr[2] = (int)r.z;
1635  sr[3] = (int)r.w;
1636 
1637  if (polarity > 0) {
1638  \n #pragma unroll 4\n
1639  for (unsigned int i = 0; i < 4; i++) {
1640  sv[i] = (sr[i] >= (sv[i]+ScaleCharToQuantum(2)))?(sv[i]+ScaleCharToQuantum(1)):sv[i];
1641  }
1642  }
1643  else {
1644  \n #pragma unroll 4\n
1645  for (unsigned int i = 0; i < 4; i++) {
1646  sv[i] = (sr[i] <= (sv[i]-ScaleCharToQuantum(2)))?(sv[i]-ScaleCharToQuantum(1)):sv[i];
1647  }
1648 
1649  }
1650 
1651  v.x = (CLQuantum)sv[0];
1652  v.y = (CLQuantum)sv[1];
1653  v.z = (CLQuantum)sv[2];
1654 
1655  if (matte!=0)
1656  v.w = (CLQuantum)sv[3];
1657 
1658  outputImage[y*imageWidth+x] = v;
1659 
1660  }
1661 
1662 
1663  )
1664 
1665 
1666 
1667  STRINGIFY(
1668 
1669  __kernel void HullPass2(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
1670  , const unsigned int imageWidth, const unsigned int imageHeight
1671  , const int2 offset, const int polarity, const int matte) {
1672 
1673  int x = get_global_id(0);
1674  int y = get_global_id(1);
1675 
1676  CLPixelType v = inputImage[y*imageWidth+x];
1677 
1678  int2 neighbor, clampedNeighbor;
1679 
1680  neighbor.y = y + offset.y;
1681  neighbor.x = x + offset.x;
1682  clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1683  clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1684 
1685  CLPixelType r = (clampedNeighbor.x == neighbor.x
1686  && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1687  :(CLPixelType)0;
1688 
1689 
1690  neighbor.y = y - offset.y;
1691  neighbor.x = x - offset.x;
1692  clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1693  clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1694 
1695  CLPixelType s = (clampedNeighbor.x == neighbor.x
1696  && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1697  :(CLPixelType)0;
1698 
1699 
1700  int sv[4];
1701  sv[0] = (int)v.x;
1702  sv[1] = (int)v.y;
1703  sv[2] = (int)v.z;
1704  sv[3] = (int)v.w;
1705 
1706  int sr[4];
1707  sr[0] = (int)r.x;
1708  sr[1] = (int)r.y;
1709  sr[2] = (int)r.z;
1710  sr[3] = (int)r.w;
1711 
1712  int ss[4];
1713  ss[0] = (int)s.x;
1714  ss[1] = (int)s.y;
1715  ss[2] = (int)s.z;
1716  ss[3] = (int)s.w;
1717 
1718  if (polarity > 0) {
1719  \n #pragma unroll 4\n
1720  for (unsigned int i = 0; i < 4; i++) {
1721  //sv[i] = (ss[i] >= (sv[i]+ScaleCharToQuantum(2)) && sr[i] > sv[i] ) ? (sv[i]+ScaleCharToQuantum(1)):sv[i];
1722  //
1723  //sv[i] =(!( (int)(ss[i] >= (sv[i]+ScaleCharToQuantum(2))) && (int) (sr[i] > sv[i] ) )) ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1724  //sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) || (int) ( sr[i] <= sv[i] ) )) ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1725  sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) + (int) ( sr[i] <= sv[i] ) ) !=0) ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1726  }
1727  }
1728  else {
1729  \n #pragma unroll 4\n
1730  for (unsigned int i = 0; i < 4; i++) {
1731  //sv[i] = (ss[i] <= (sv[i]-ScaleCharToQuantum(2)) && sr[i] < sv[i] ) ? (sv[i]-ScaleCharToQuantum(1)):sv[i];
1732  //
1733  //sv[i] = ( (int)(ss[i] <= (sv[i]-ScaleCharToQuantum(2)) ) + (int)( sr[i] < sv[i] ) ==0) ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
1734  sv[i] = (( (int)(ss[i] > (sv[i]-ScaleCharToQuantum(2))) + (int)( sr[i] >= sv[i] )) !=0) ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
1735  }
1736  }
1737 
1738  v.x = (CLQuantum)sv[0];
1739  v.y = (CLQuantum)sv[1];
1740  v.z = (CLQuantum)sv[2];
1741 
1742  if (matte!=0)
1743  v.w = (CLQuantum)sv[3];
1744 
1745  outputImage[y*imageWidth+x] = v;
1746 
1747  }
1748 
1749 
1750  )
1751 
1752 /*
1753 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1754 % %
1755 % %
1756 % %
1757 % E q u a l i z e %
1758 % %
1759 % %
1760 % %
1761 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1762 */
1763 
1764  STRINGIFY(
1765  /*
1766  */
1767  __kernel void Equalize(__global CLPixelType * restrict im,
1768  const ChannelType channel,
1769  __global CLPixelType * restrict equalize_map,
1770  const float4 white, const float4 black)
1771  {
1772  const int x = get_global_id(0);
1773  const int y = get_global_id(1);
1774  const int columns = get_global_size(0);
1775  const int c = x + y * columns;
1776 
1777  uint ePos;
1778  CLPixelType oValue, eValue;
1779  CLQuantum red, green, blue, opacity;
1780 
1781  //read from global
1782  oValue=im[c];
1783 
1784  if ((channel & SyncChannels) != 0)
1785  {
1786  if (getRedF4(white) != getRedF4(black))
1787  {
1788  ePos = ScaleQuantumToMap(getRed(oValue));
1789  eValue = equalize_map[ePos];
1790  red = getRed(eValue);
1791  ePos = ScaleQuantumToMap(getGreen(oValue));
1792  eValue = equalize_map[ePos];
1793  green = getRed(eValue);
1794  ePos = ScaleQuantumToMap(getBlue(oValue));
1795  eValue = equalize_map[ePos];
1796  blue = getRed(eValue);
1797  ePos = ScaleQuantumToMap(getOpacity(oValue));
1798  eValue = equalize_map[ePos];
1799  opacity = getRed(eValue);
1800 
1801  //write back
1802  im[c]=(CLPixelType)(blue, green, red, opacity);
1803  }
1804 
1805  }
1806 
1807  // for equalizing, we always need all channels?
1808  // otherwise something more
1809 
1810  }
1811  )
1812 
1813 /*
1814 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1815 % %
1816 % %
1817 % %
1818 % F u n c t i o n %
1819 % %
1820 % %
1821 % %
1822 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1823 */
1824 
1825  STRINGIFY(
1826 
1827  /*
1828  apply FunctionImageChannel(braightness-contrast)
1829  */
1830  CLPixelType ApplyFunction(CLPixelType pixel,const MagickFunction function,
1831  const unsigned int number_parameters,
1832  __constant float *parameters)
1833  {
1834  float4 result = (float4) 0.0f;
1835  switch (function)
1836  {
1837  case PolynomialFunction:
1838  {
1839  for (unsigned int i=0; i < number_parameters; i++)
1840  result = result*(float4)QuantumScale*convert_float4(pixel) + parameters[i];
1841  result *= (float4)QuantumRange;
1842  break;
1843  }
1844  case SinusoidFunction:
1845  {
1846  float freq,phase,ampl,bias;
1847  freq = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1848  phase = ( number_parameters >= 2 ) ? parameters[1] : 0.0f;
1849  ampl = ( number_parameters >= 3 ) ? parameters[2] : 0.5f;
1850  bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1851  result.x = QuantumRange*(ampl*sin(2.0f*MagickPI*
1852  (freq*QuantumScale*(float)pixel.x + phase/360.0f)) + bias);
1853  result.y = QuantumRange*(ampl*sin(2.0f*MagickPI*
1854  (freq*QuantumScale*(float)pixel.y + phase/360.0f)) + bias);
1855  result.z = QuantumRange*(ampl*sin(2.0f*MagickPI*
1856  (freq*QuantumScale*(float)pixel.z + phase/360.0f)) + bias);
1857  result.w = QuantumRange*(ampl*sin(2.0f*MagickPI*
1858  (freq*QuantumScale*(float)pixel.w + phase/360.0f)) + bias);
1859  break;
1860  }
1861  case ArcsinFunction:
1862  {
1863  float width,range,center,bias;
1864  width = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1865  center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
1866  range = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
1867  bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1868 
1869  result.x = 2.0f/width*(QuantumScale*(float)pixel.x - center);
1870  result.x = range/MagickPI*asin(result.x)+bias;
1871  result.x = ( result.x <= -1.0f ) ? bias - range/2.0f : result.x;
1872  result.x = ( result.x >= 1.0f ) ? bias + range/2.0f : result.x;
1873 
1874  result.y = 2.0f/width*(QuantumScale*(float)pixel.y - center);
1875  result.y = range/MagickPI*asin(result.y)+bias;
1876  result.y = ( result.y <= -1.0f ) ? bias - range/2.0f : result.y;
1877  result.y = ( result.y >= 1.0f ) ? bias + range/2.0f : result.y;
1878 
1879  result.z = 2.0f/width*(QuantumScale*(float)pixel.z - center);
1880  result.z = range/MagickPI*asin(result.z)+bias;
1881  result.z = ( result.z <= -1.0f ) ? bias - range/2.0f : result.x;
1882  result.z = ( result.z >= 1.0f ) ? bias + range/2.0f : result.x;
1883 
1884 
1885  result.w = 2.0f/width*(QuantumScale*(float)pixel.w - center);
1886  result.w = range/MagickPI*asin(result.w)+bias;
1887  result.w = ( result.w <= -1.0f ) ? bias - range/2.0f : result.w;
1888  result.w = ( result.w >= 1.0f ) ? bias + range/2.0f : result.w;
1889 
1890  result *= (float4)QuantumRange;
1891  break;
1892  }
1893  case ArctanFunction:
1894  {
1895  float slope,range,center,bias;
1896  slope = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1897  center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
1898  range = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
1899  bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1900  result = (float4)MagickPI*(float4)slope*((float4)QuantumScale*convert_float4(pixel)-(float4)center);
1901  result = (float4)QuantumRange*((float4)range/(float4)MagickPI*atan(result) + (float4)bias);
1902  break;
1903  }
1904  case UndefinedFunction:
1905  break;
1906  }
1907  return (CLPixelType) (ClampToQuantum(result.x), ClampToQuantum(result.y),
1908  ClampToQuantum(result.z), ClampToQuantum(result.w));
1909  }
1910  )
1911 
1912  STRINGIFY(
1913  /*
1914  Improve brightness / contrast of the image
1915  channel : define which channel is improved
1916  function : the function called to enchance the brightness contrast
1917  number_parameters : numbers of parameters
1918  parameters : the parameter
1919  */
1920  __kernel void ComputeFunction(__global CLPixelType *im,
1921  const ChannelType channel, const MagickFunction function,
1922  const unsigned int number_parameters, __constant float *parameters)
1923  {
1924  const int x = get_global_id(0);
1925  const int y = get_global_id(1);
1926  const int columns = get_global_size(0);
1927  const int c = x + y * columns;
1928  im[c] = ApplyFunction(im[c], function, number_parameters, parameters);
1929  }
1930  )
1931 
1932 /*
1933 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1934 % %
1935 % %
1936 % %
1937 % G r a y s c a l e %
1938 % %
1939 % %
1940 % %
1941 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1942 */
1943 
1944  STRINGIFY(
1945  __kernel void Grayscale(__global CLPixelType *im,
1946  const int method, const int colorspace)
1947  {
1948 
1949  const int x = get_global_id(0);
1950  const int y = get_global_id(1);
1951  const int columns = get_global_size(0);
1952  const int c = x + y * columns;
1953 
1954  CLPixelType pixel = im[c];
1955 
1956  float
1957  blue,
1958  green,
1959  intensity,
1960  red;
1961 
1962  red=(float)getRed(pixel);
1963  green=(float)getGreen(pixel);
1964  blue=(float)getBlue(pixel);
1965 
1966  intensity=0.0;
1967 
1968  CLPixelType filteredPixel;
1969 
1970  switch (method)
1971  {
1973  {
1974  intensity=(red+green+blue)/3.0;
1975  break;
1976  }
1978  {
1979  intensity=MagickMax(MagickMax(red,green),blue);
1980  break;
1981  }
1983  {
1984  intensity=(MagickMin(MagickMin(red,green),blue)+
1985  MagickMax(MagickMax(red,green),blue))/2.0;
1986  break;
1987  }
1989  {
1990  intensity=(float) (((float) red*red+green*green+
1991  blue*blue)/(3.0*QuantumRange));
1992  break;
1993  }
1995  {
1996  /*
1997  if (colorspace == RGBColorspace)
1998  {
1999  red=EncodePixelGamma(red);
2000  green=EncodePixelGamma(green);
2001  blue=EncodePixelGamma(blue);
2002  }
2003  */
2004  intensity=0.298839*red+0.586811*green+0.114350*blue;
2005  break;
2006  }
2008  {
2009  /*
2010  if (image->colorspace == sRGBColorspace)
2011  {
2012  red=DecodePixelGamma(red);
2013  green=DecodePixelGamma(green);
2014  blue=DecodePixelGamma(blue);
2015  }
2016  */
2017  intensity=0.298839*red+0.586811*green+0.114350*blue;
2018  break;
2019  }
2021  default:
2022  {
2023  /*
2024  if (image->colorspace == RGBColorspace)
2025  {
2026  red=EncodePixelGamma(red);
2027  green=EncodePixelGamma(green);
2028  blue=EncodePixelGamma(blue);
2029  }
2030  */
2031  intensity=0.212656*red+0.715158*green+0.072186*blue;
2032  break;
2033  }
2035  {
2036  /*
2037  if (image->colorspace == sRGBColorspace)
2038  {
2039  red=DecodePixelGamma(red);
2040  green=DecodePixelGamma(green);
2041  blue=DecodePixelGamma(blue);
2042  }
2043  */
2044  intensity=0.212656*red+0.715158*green+0.072186*blue;
2045  break;
2046  }
2048  {
2049  intensity=(float) (sqrt((float) red*red+green*green+
2050  blue*blue)/sqrt(3.0));
2051  break;
2052  }
2053 
2054  }
2055 
2056  setGray(&filteredPixel, ClampToQuantum(intensity));
2057 
2058  filteredPixel.w = pixel.w;
2059 
2060  im[c] = filteredPixel;
2061  }
2062  )
2063 
2064 /*
2065 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2066 % %
2067 % %
2068 % %
2069 % L o c a l C o n t r a s t %
2070 % %
2071 % %
2072 % %
2073 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2074 */
2075 
2076  STRINGIFY(
2077  static inline int mirrorBottom(int value)
2078  {
2079  return (value < 0) ? - (value) : value;
2080  }
2081  static inline int mirrorTop(int value, int width)
2082  {
2083  return (value >= width) ? (2 * width - value - 1) : value;
2084  }
2085 
2086  __kernel void LocalContrastBlurRow(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *tmpImage,
2087  const int radius,
2088  const int imageWidth,
2089  const int imageHeight)
2090  {
2091  const float4 RGB = ((float4)(0.2126f, 0.7152f, 0.0722f, 0.0f));
2092 
2093  int x = get_local_id(0);
2094  int y = get_global_id(1);
2095 
2096  if ((x >= imageWidth) || (y >= imageHeight))
2097  return;
2098 
2099  global CLPixelType *src = srcImage + y * imageWidth;
2100 
2101  for (int i = x; i < imageWidth; i += get_local_size(0)) {
2102  float sum = 0.0f;
2103  float weight = 1.0f;
2104 
2105  int j = i - radius;
2106  while ((j + 7) < i) {
2107  for (int k = 0; k < 8; ++k) // Unroll 8x
2108  sum += (weight + k) * dot(RGB, convert_float4(src[mirrorBottom(j+k)]));
2109  weight += 8.0f;
2110  j+=8;
2111  }
2112  while (j < i) {
2113  sum += weight * dot(RGB, convert_float4(src[mirrorBottom(j)]));
2114  weight += 1.0f;
2115  ++j;
2116  }
2117 
2118  while ((j + 7) < radius + i) {
2119  for (int k = 0; k < 8; ++k) // Unroll 8x
2120  sum += (weight - k) * dot(RGB, convert_float4(src[mirrorTop(j + k, imageWidth)]));
2121  weight -= 8.0f;
2122  j+=8;
2123  }
2124  while (j < radius + i) {
2125  sum += weight * dot(RGB, convert_float4(src[mirrorTop(j, imageWidth)]));
2126  weight -= 1.0f;
2127  ++j;
2128  }
2129 
2130  tmpImage[i + y * imageWidth] = sum / ((radius + 1) * (radius + 1));
2131  }
2132  }
2133  )
2134 
2135  STRINGIFY(
2136  __kernel void LocalContrastBlurApplyColumn(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *blurImage,
2137  const int radius,
2138  const float strength,
2139  const int imageWidth,
2140  const int imageHeight)
2141  {
2142  const float4 RGB = (float4)(0.2126f, 0.7152f, 0.0722f, 0.0f);
2143 
2144  int x = get_global_id(0);
2145  int y = get_global_id(1);
2146 
2147  if ((x >= imageWidth) || (y >= imageHeight))
2148  return;
2149 
2150  global float *src = blurImage + x;
2151 
2152  float sum = 0.0f;
2153  float weight = 1.0f;
2154 
2155  int j = y - radius;
2156  while ((j + 7) < y) {
2157  for (int k = 0; k < 8; ++k) // Unroll 8x
2158  sum += (weight + k) * src[mirrorBottom(j+k) * imageWidth];
2159  weight += 8.0f;
2160  j+=8;
2161  }
2162  while (j < y) {
2163  sum += weight * src[mirrorBottom(j) * imageWidth];
2164  weight += 1.0f;
2165  ++j;
2166  }
2167 
2168  while ((j + 7) < radius + y) {
2169  for (int k = 0; k < 8; ++k) // Unroll 8x
2170  sum += (weight - k) * src[mirrorTop(j + k, imageHeight) * imageWidth];
2171  weight -= 8.0f;
2172  j+=8;
2173  }
2174  while (j < radius + y) {
2175  sum += weight * src[mirrorTop(j, imageHeight) * imageWidth];
2176  weight -= 1.0f;
2177  ++j;
2178  }
2179 
2180  CLPixelType pixel = srcImage[x + y * imageWidth];
2181  float srcVal = dot(RGB, convert_float4(pixel));
2182  float mult = (srcVal - (sum / ((radius + 1) * (radius + 1)))) * (strength / 100.0f);
2183  mult = (srcVal + mult) / srcVal;
2184 
2185  pixel.x = ClampToQuantum(pixel.x * mult);
2186  pixel.y = ClampToQuantum(pixel.y * mult);
2187  pixel.z = ClampToQuantum(pixel.z * mult);
2188 
2189  dstImage[x + y * imageWidth] = pixel;
2190  }
2191  )
2192 
2193 /*
2194 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2195 % %
2196 % %
2197 % %
2198 % M o d u l a t e %
2199 % %
2200 % %
2201 % %
2202 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2203 */
2204 
2205  STRINGIFY(
2206 
2207  static inline void ConvertRGBToHSL(const CLQuantum red,const CLQuantum green, const CLQuantum blue,
2208  float *hue, float *saturation, float *lightness)
2209  {
2210  float
2211  c,
2212  tmax,
2213  tmin;
2214 
2215  /*
2216  Convert RGB to HSL colorspace.
2217  */
2218  tmax=MagickMax(QuantumScale*red,MagickMax(QuantumScale*green, QuantumScale*blue));
2219  tmin=MagickMin(QuantumScale*red,MagickMin(QuantumScale*green, QuantumScale*blue));
2220 
2221  c=tmax-tmin;
2222 
2223  *lightness=(tmax+tmin)/2.0;
2224  if (c <= 0.0)
2225  {
2226  *hue=0.0;
2227  *saturation=0.0;
2228  return;
2229  }
2230 
2231  if (tmax == (QuantumScale*red))
2232  {
2233  *hue=(QuantumScale*green-QuantumScale*blue)/c;
2234  if ((QuantumScale*green) < (QuantumScale*blue))
2235  *hue+=6.0;
2236  }
2237  else
2238  if (tmax == (QuantumScale*green))
2239  *hue=2.0+(QuantumScale*blue-QuantumScale*red)/c;
2240  else
2241  *hue=4.0+(QuantumScale*red-QuantumScale*green)/c;
2242 
2243  *hue*=60.0/360.0;
2244  if (*lightness <= 0.5)
2245  *saturation=c/(2.0*(*lightness));
2246  else
2247  *saturation=c/(2.0-2.0*(*lightness));
2248  }
2249 
2250  static inline void ConvertHSLToRGB(const float hue,const float saturation, const float lightness,
2251  CLQuantum *red,CLQuantum *green,CLQuantum *blue)
2252  {
2253  float
2254  b,
2255  c,
2256  g,
2257  h,
2258  tmin,
2259  r,
2260  x;
2261 
2262  /*
2263  Convert HSL to RGB colorspace.
2264  */
2265  h=hue*360.0;
2266  if (lightness <= 0.5)
2267  c=2.0*lightness*saturation;
2268  else
2269  c=(2.0-2.0*lightness)*saturation;
2270  tmin=lightness-0.5*c;
2271  h-=360.0*floor(h/360.0);
2272  h/=60.0;
2273  x=c*(1.0-fabs(h-2.0*floor(h/2.0)-1.0));
2274  switch ((int) floor(h) % 6)
2275  {
2276  case 0:
2277  {
2278  r=tmin+c;
2279  g=tmin+x;
2280  b=tmin;
2281  break;
2282  }
2283  case 1:
2284  {
2285  r=tmin+x;
2286  g=tmin+c;
2287  b=tmin;
2288  break;
2289  }
2290  case 2:
2291  {
2292  r=tmin;
2293  g=tmin+c;
2294  b=tmin+x;
2295  break;
2296  }
2297  case 3:
2298  {
2299  r=tmin;
2300  g=tmin+x;
2301  b=tmin+c;
2302  break;
2303  }
2304  case 4:
2305  {
2306  r=tmin+x;
2307  g=tmin;
2308  b=tmin+c;
2309  break;
2310  }
2311  case 5:
2312  {
2313  r=tmin+c;
2314  g=tmin;
2315  b=tmin+x;
2316  break;
2317  }
2318  default:
2319  {
2320  r=0.0;
2321  g=0.0;
2322  b=0.0;
2323  }
2324  }
2325  *red=ClampToQuantum(QuantumRange*r);
2326  *green=ClampToQuantum(QuantumRange*g);
2327  *blue=ClampToQuantum(QuantumRange*b);
2328  }
2329 
2330  static inline void ModulateHSL(const float percent_hue, const float percent_saturation,const float percent_lightness,
2331  CLQuantum *red,CLQuantum *green,CLQuantum *blue)
2332  {
2333  float
2334  hue,
2335  lightness,
2336  saturation;
2337 
2338  /*
2339  Increase or decrease color lightness, saturation, or hue.
2340  */
2341  ConvertRGBToHSL(*red,*green,*blue,&hue,&saturation,&lightness);
2342  hue+=0.5*(0.01*percent_hue-1.0);
2343  while (hue < 0.0)
2344  hue+=1.0;
2345  while (hue >= 1.0)
2346  hue-=1.0;
2347  saturation*=0.01*percent_saturation;
2348  lightness*=0.01*percent_lightness;
2349  ConvertHSLToRGB(hue,saturation,lightness,red,green,blue);
2350  }
2351 
2352  __kernel void Modulate(__global CLPixelType *im,
2353  const float percent_brightness,
2354  const float percent_hue,
2355  const float percent_saturation,
2356  const int colorspace)
2357  {
2358 
2359  const int x = get_global_id(0);
2360  const int y = get_global_id(1);
2361  const int columns = get_global_size(0);
2362  const int c = x + y * columns;
2363 
2364  CLPixelType pixel = im[c];
2365 
2366  CLQuantum
2367  blue,
2368  green,
2369  red;
2370 
2371  red=getRed(pixel);
2372  green=getGreen(pixel);
2373  blue=getBlue(pixel);
2374 
2375  switch (colorspace)
2376  {
2377  case HSLColorspace:
2378  default:
2379  {
2380  ModulateHSL(percent_hue, percent_saturation, percent_brightness,
2381  &red, &green, &blue);
2382  }
2383 
2384  }
2385 
2386  CLPixelType filteredPixel;
2387 
2388  setRed(&filteredPixel, red);
2389  setGreen(&filteredPixel, green);
2390  setBlue(&filteredPixel, blue);
2391  filteredPixel.w = pixel.w;
2392 
2393  im[c] = filteredPixel;
2394  }
2395  )
2396 
2397 /*
2398 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2399 % %
2400 % %
2401 % %
2402 % M o t i o n B l u r %
2403 % %
2404 % %
2405 % %
2406 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2407 */
2408 
2409  STRINGIFY(
2410  __kernel
2411  void MotionBlur(const __global CLPixelType *input, __global CLPixelType *output,
2412  const unsigned int imageWidth, const unsigned int imageHeight,
2413  const __global float *filter, const unsigned int width, const __global int2* offset,
2414  const float4 bias,
2415  const ChannelType channel, const unsigned int matte) {
2416 
2417  int2 currentPixel;
2418  currentPixel.x = get_global_id(0);
2419  currentPixel.y = get_global_id(1);
2420 
2421  if (currentPixel.x >= imageWidth
2422  || currentPixel.y >= imageHeight)
2423  return;
2424 
2425  float4 pixel;
2426  pixel.x = (float)bias.x;
2427  pixel.y = (float)bias.y;
2428  pixel.z = (float)bias.z;
2429  pixel.w = (float)bias.w;
2430 
2431  if (((channel & OpacityChannel) == 0) || (matte == 0)) {
2432 
2433  for (int i = 0; i < width; i++) {
2434  // only support EdgeVirtualPixelMethod through ClampToCanvas
2435  // TODO: implement other virtual pixel method
2436  int2 samplePixel = currentPixel + offset[i];
2437  samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
2438  samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
2439  CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
2440 
2441  pixel.x += (filter[i] * (float)samplePixelValue.x);
2442  pixel.y += (filter[i] * (float)samplePixelValue.y);
2443  pixel.z += (filter[i] * (float)samplePixelValue.z);
2444  pixel.w += (filter[i] * (float)samplePixelValue.w);
2445  }
2446 
2447  CLPixelType outputPixel;
2448  outputPixel.x = ClampToQuantum(pixel.x);
2449  outputPixel.y = ClampToQuantum(pixel.y);
2450  outputPixel.z = ClampToQuantum(pixel.z);
2451  outputPixel.w = ClampToQuantum(pixel.w);
2452  output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
2453  }
2454  else {
2455 
2456  float gamma = 0.0f;
2457  for (int i = 0; i < width; i++) {
2458  // only support EdgeVirtualPixelMethod through ClampToCanvas
2459  // TODO: implement other virtual pixel method
2460  int2 samplePixel = currentPixel + offset[i];
2461  samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
2462  samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
2463 
2464  CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
2465 
2466  float alpha = QuantumScale*(QuantumRange-samplePixelValue.w);
2467  float k = filter[i];
2468  pixel.x = pixel.x + k * alpha * samplePixelValue.x;
2469  pixel.y = pixel.y + k * alpha * samplePixelValue.y;
2470  pixel.z = pixel.z + k * alpha * samplePixelValue.z;
2471 
2472  pixel.w += k * alpha * samplePixelValue.w;
2473 
2474  gamma+=k*alpha;
2475  }
2476  gamma = PerceptibleReciprocal(gamma);
2477  pixel.xyz = gamma*pixel.xyz;
2478 
2479  CLPixelType outputPixel;
2480  outputPixel.x = ClampToQuantum(pixel.x);
2481  outputPixel.y = ClampToQuantum(pixel.y);
2482  outputPixel.z = ClampToQuantum(pixel.z);
2483  outputPixel.w = ClampToQuantum(pixel.w);
2484  output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
2485  }
2486  }
2487  )
2488 
2489 /*
2490 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2491 % %
2492 % %
2493 % %
2494 % R a d i a l B l u r %
2495 % %
2496 % %
2497 % %
2498 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2499 */
2500 
2501  STRINGIFY(
2502  __kernel void RadialBlur(const __global CLPixelType *im, __global CLPixelType *filtered_im,
2503  const float4 bias,
2504  const unsigned int channel, const unsigned int matte,
2505  const float2 blurCenter,
2506  __constant float *cos_theta, __constant float *sin_theta,
2507  const unsigned int cossin_theta_size)
2508  {
2509  const int x = get_global_id(0);
2510  const int y = get_global_id(1);
2511  const int columns = get_global_size(0);
2512  const int rows = get_global_size(1);
2513  unsigned int step = 1;
2514  float center_x = (float) x - blurCenter.x;
2515  float center_y = (float) y - blurCenter.y;
2516  float radius = hypot(center_x, center_y);
2517 
2518  //float blur_radius = hypot((float) columns/2.0f, (float) rows/2.0f);
2519  float blur_radius = hypot(blurCenter.x, blurCenter.y);
2520 
2521  if (radius > MagickEpsilon)
2522  {
2523  step = (unsigned int) (blur_radius / radius);
2524  if (step == 0)
2525  step = 1;
2526  if (step >= cossin_theta_size)
2527  step = cossin_theta_size-1;
2528  }
2529 
2530  float4 result;
2531  result.x = (float)bias.x;
2532  result.y = (float)bias.y;
2533  result.z = (float)bias.z;
2534  result.w = (float)bias.w;
2535  float normalize = 0.0f;
2536 
2537  if (((channel & OpacityChannel) == 0) || (matte == 0)) {
2538  for (unsigned int i=0; i<cossin_theta_size; i+=step)
2539  {
2540  result += convert_float4(im[
2541  ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns)+
2542  ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f, rows)*columns]);
2543  normalize += 1.0f;
2544  }
2545  normalize = PerceptibleReciprocal(normalize);
2546  result = result * normalize;
2547  }
2548  else {
2549  float gamma = 0.0f;
2550  for (unsigned int i=0; i<cossin_theta_size; i+=step)
2551  {
2552  float4 p = convert_float4(im[
2553  ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns)+
2554  ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f, rows)*columns]);
2555 
2556  float alpha = (float)(QuantumScale*(QuantumRange-p.w));
2557  result.x += alpha * p.x;
2558  result.y += alpha * p.y;
2559  result.z += alpha * p.z;
2560  result.w += p.w;
2561  gamma+=alpha;
2562  normalize += 1.0f;
2563  }
2564  gamma = PerceptibleReciprocal(gamma);
2565  normalize = PerceptibleReciprocal(normalize);
2566  result.x = gamma*result.x;
2567  result.y = gamma*result.y;
2568  result.z = gamma*result.z;
2569  result.w = normalize*result.w;
2570  }
2571  filtered_im[y * columns + x] = (CLPixelType) (ClampToQuantum(result.x), ClampToQuantum(result.y),
2572  ClampToQuantum(result.z), ClampToQuantum(result.w));
2573  }
2574  )
2575 
2576 /*
2577 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2578 % %
2579 % %
2580 % %
2581 % R e s i z e %
2582 % %
2583 % %
2584 % %
2585 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2586 */
2587 
2588  STRINGIFY(
2589  // Based on Box from resize.c
2590  float BoxResizeFilter(const float x)
2591  {
2592  return 1.0f;
2593  }
2594  )
2595 
2596  STRINGIFY(
2597  // Based on CubicBC from resize.c
2598  float CubicBC(const float x,const __global float* resizeFilterCoefficients)
2599  {
2600  /*
2601  Cubic Filters using B,C determined values:
2602  Mitchell-Netravali B = 1/3 C = 1/3 "Balanced" cubic spline filter
2603  Catmull-Rom B = 0 C = 1/2 Interpolatory and exact on linears
2604  Spline B = 1 C = 0 B-Spline Gaussian approximation
2605  Hermite B = 0 C = 0 B-Spline interpolator
2606 
2607  See paper by Mitchell and Netravali, Reconstruction Filters in Computer
2608  Graphics Computer Graphics, Volume 22, Number 4, August 1988
2609  http://www.cs.utexas.edu/users/fussell/courses/cs384g/lectures/mitchell/
2610  Mitchell.pdf.
2611 
2612  Coefficents are determined from B,C values:
2613  P0 = ( 6 - 2*B )/6 = coeff[0]
2614  P1 = 0
2615  P2 = (-18 +12*B + 6*C )/6 = coeff[1]
2616  P3 = ( 12 - 9*B - 6*C )/6 = coeff[2]
2617  Q0 = ( 8*B +24*C )/6 = coeff[3]
2618  Q1 = ( -12*B -48*C )/6 = coeff[4]
2619  Q2 = ( 6*B +30*C )/6 = coeff[5]
2620  Q3 = ( - 1*B - 6*C )/6 = coeff[6]
2621 
2622  which are used to define the filter:
2623 
2624  P0 + P1*x + P2*x^2 + P3*x^3 0 <= x < 1
2625  Q0 + Q1*x + Q2*x^2 + Q3*x^3 1 <= x < 2
2626 
2627  which ensures function is continuous in value and derivative (slope).
2628  */
2629  if (x < 1.0)
2630  return(resizeFilterCoefficients[0]+x*(x*
2631  (resizeFilterCoefficients[1]+x*resizeFilterCoefficients[2])));
2632  if (x < 2.0)
2633  return(resizeFilterCoefficients[3]+x*(resizeFilterCoefficients[4]+x*
2634  (resizeFilterCoefficients[5]+x*resizeFilterCoefficients[6])));
2635  return(0.0);
2636  }
2637  )
2638 
2639  STRINGIFY(
2640  float Sinc(const float x)
2641  {
2642  if (x != 0.0f)
2643  {
2644  const float alpha=(float) (MagickPI*x);
2645  return sinpi(x)/alpha;
2646  }
2647  return(1.0f);
2648  }
2649  )
2650 
2651  STRINGIFY(
2652  float Triangle(const float x)
2653  {
2654  /*
2655  1st order (linear) B-Spline, bilinear interpolation, Tent 1D filter, or
2656  a Bartlett 2D Cone filter. Also used as a Bartlett Windowing function
2657  for Sinc().
2658  */
2659  return ((x<1.0f)?(1.0f-x):0.0f);
2660  }
2661  )
2662 
2663 
2664  STRINGIFY(
2665  float Hanning(const float x)
2666  {
2667  /*
2668  Cosine window function:
2669  0.5+0.5*cos(pi*x).
2670  */
2671  const float cosine=cos((MagickPI*x));
2672  return(0.5f+0.5f*cosine);
2673  }
2674  )
2675 
2676  STRINGIFY(
2677  float Hamming(const float x)
2678  {
2679  /*
2680  Offset cosine window function:
2681  .54 + .46 cos(pi x).
2682  */
2683  const float cosine=cos((MagickPI*x));
2684  return(0.54f+0.46f*cosine);
2685  }
2686  )
2687 
2688  STRINGIFY(
2689  float Blackman(const float x)
2690  {
2691  /*
2692  Blackman: 2nd order cosine windowing function:
2693  0.42 + 0.5 cos(pi x) + 0.08 cos(2pi x)
2694 
2695  Refactored by Chantal Racette and Nicolas Robidoux to one trig call and
2696  five flops.
2697  */
2698  const float cosine=cos((MagickPI*x));
2699  return(0.34f+cosine*(0.5f+cosine*0.16f));
2700  }
2701  )
2702 
2703 
2704 
2705 
2706  STRINGIFY(
2707  static inline float applyResizeFilter(const float x, const ResizeWeightingFunctionType filterType, const __global float* filterCoefficients)
2708  {
2709  switch (filterType)
2710  {
2711  /* Call Sinc even for SincFast to get better precision on GPU
2712  and to avoid thread divergence. Sinc is pretty fast on GPU anyway...*/
2713  case SincWeightingFunction:
2715  return Sinc(x);
2717  return CubicBC(x,filterCoefficients);
2718  case BoxWeightingFunction:
2719  return BoxResizeFilter(x);
2721  return Triangle(x);
2723  return Hanning(x);
2725  return Hamming(x);
2727  return Blackman(x);
2728 
2729  default:
2730  return 0.0f;
2731  }
2732  }
2733  )
2734 
2735 
2736  STRINGIFY(
2737  static inline float getResizeFilterWeight(const __global float* resizeFilterCubicCoefficients, const ResizeWeightingFunctionType resizeFilterType
2738  , const ResizeWeightingFunctionType resizeWindowType
2739  , const float resizeFilterScale, const float resizeWindowSupport, const float resizeFilterBlur, const float x)
2740  {
2741  float scale;
2742  float xBlur = fabs(x/resizeFilterBlur);
2743  if (resizeWindowSupport < MagickEpsilon
2744  || resizeWindowType == BoxWeightingFunction)
2745  {
2746  scale = 1.0f;
2747  }
2748  else
2749  {
2750  scale = resizeFilterScale;
2751  scale = applyResizeFilter(xBlur*scale, resizeWindowType, resizeFilterCubicCoefficients);
2752  }
2753  float weight = scale * applyResizeFilter(xBlur, resizeFilterType, resizeFilterCubicCoefficients);
2754  return weight;
2755  }
2756 
2757  )
2758 
2759  ;
2760  const char* accelerateKernels2 =
2761 
2762  STRINGIFY(
2763 
2764  static inline unsigned int getNumWorkItemsPerPixel(const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
2765  return (numWorkItems/pixelPerWorkgroup);
2766  }
2767 
2768  // returns the index of the pixel for the current workitem to compute.
2769  // returns -1 if this workitem doesn't need to participate in any computation
2770  static inline int pixelToCompute(const unsigned itemID, const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
2771  const unsigned int numWorkItemsPerPixel = getNumWorkItemsPerPixel(pixelPerWorkgroup, numWorkItems);
2772  int pixelIndex = itemID/numWorkItemsPerPixel;
2773  pixelIndex = (pixelIndex<pixelPerWorkgroup)?pixelIndex:-1;
2774  return pixelIndex;
2775  }
2776 
2777  )
2778 
2779  STRINGIFY(
2780  __kernel __attribute__((reqd_work_group_size(256, 1, 1)))
2781  void ResizeHorizontalFilter(const __global CLPixelType* inputImage, const unsigned int inputColumns, const unsigned int inputRows, const unsigned int matte
2782  , const float xFactor, __global CLPixelType* filteredImage, const unsigned int filteredColumns, const unsigned int filteredRows
2783  , const int resizeFilterType, const int resizeWindowType
2784  , const __global float* resizeFilterCubicCoefficients
2785  , const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport, const float resizeFilterBlur
2786  , __local CLPixelType* inputImageCache, const int numCachedPixels, const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize
2787  , __local float4* outputPixelCache, __local float* densityCache, __local float* gammaCache) {
2788 
2789 
2790  // calculate the range of resized image pixels computed by this workgroup
2791  const unsigned int startX = get_group_id(0)*pixelPerWorkgroup;
2792  const unsigned int stopX = MagickMin(startX + pixelPerWorkgroup,filteredColumns);
2793  const unsigned int actualNumPixelToCompute = stopX - startX;
2794 
2795  // calculate the range of input image pixels to cache
2796  float scale = MagickMax(1.0f/xFactor+MagickEpsilon ,1.0f);
2797  const float support = MagickMax(scale*resizeFilterSupport,0.5f);
2798  scale = PerceptibleReciprocal(scale);
2799 
2800  const int cacheRangeStartX = MagickMax((int)((startX+0.5f)/xFactor+MagickEpsilon-support+0.5f),(int)(0));
2801  const int cacheRangeEndX = MagickMin((int)(cacheRangeStartX + numCachedPixels), (int)inputColumns);
2802 
2803  // cache the input pixels into local memory
2804  const unsigned int y = get_global_id(1);
2805  event_t e = async_work_group_copy(inputImageCache,inputImage+y*inputColumns+cacheRangeStartX,cacheRangeEndX-cacheRangeStartX,0);
2806  wait_group_events(1,&e);
2807 
2808  unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
2809  for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
2810  {
2811 
2812  const unsigned int chunkStartX = startX + chunk*pixelChunkSize;
2813  const unsigned int chunkStopX = MagickMin(chunkStartX + pixelChunkSize, stopX);
2814  const unsigned int actualNumPixelInThisChunk = chunkStopX - chunkStartX;
2815 
2816  // determine which resized pixel computed by this workitem
2817  const unsigned int itemID = get_local_id(0);
2818  const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(0));
2819 
2820  const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(0));
2821 
2822  float4 filteredPixel = (float4)0.0f;
2823  float density = 0.0f;
2824  float gamma = 0.0f;
2825  // -1 means this workitem doesn't participate in the computation
2826  if (pixelIndex != -1) {
2827 
2828  // x coordinated of the resized pixel computed by this workitem
2829  const int x = chunkStartX + pixelIndex;
2830 
2831  // calculate how many steps required for this pixel
2832  const float bisect = (x+0.5)/xFactor+MagickEpsilon;
2833  const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
2834  const unsigned int stop = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputColumns);
2835  const unsigned int n = stop - start;
2836 
2837  // calculate how many steps this workitem will contribute
2838  unsigned int numStepsPerWorkItem = n / numItems;
2839  numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
2840 
2841  const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
2842  if (startStep < n) {
2843  const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
2844 
2845  unsigned int cacheIndex = start+startStep-cacheRangeStartX;
2846  if (matte == 0) {
2847 
2848  for (unsigned int i = startStep; i < stopStep; i++,cacheIndex++) {
2849  float4 cp = convert_float4(inputImageCache[cacheIndex]);
2850 
2851  float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,(ResizeWeightingFunctionType)resizeFilterType
2852  , (ResizeWeightingFunctionType)resizeWindowType
2853  , resizeFilterScale, resizeFilterWindowSupport, resizeFilterBlur,scale*(start+i-bisect+0.5));
2854 
2855  filteredPixel += ((float4)weight)*cp;
2856  density+=weight;
2857  }
2858 
2859 
2860  }
2861  else {
2862  for (unsigned int i = startStep; i < stopStep; i++,cacheIndex++) {
2863  CLPixelType p = inputImageCache[cacheIndex];
2864 
2865  float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,(ResizeWeightingFunctionType)resizeFilterType
2866  , (ResizeWeightingFunctionType)resizeWindowType
2867  , resizeFilterScale, resizeFilterWindowSupport, resizeFilterBlur,scale*(start+i-bisect+0.5));
2868 
2869  float alpha = weight * QuantumScale * GetPixelAlpha(p);
2870  float4 cp = convert_float4(p);
2871 
2872  filteredPixel.x += alpha * cp.x;
2873  filteredPixel.y += alpha * cp.y;
2874  filteredPixel.z += alpha * cp.z;
2875  filteredPixel.w += weight * cp.w;
2876 
2877  density+=weight;
2878  gamma+=alpha;
2879  }
2880  }
2881  }
2882  }
2883 
2884  // initialize the accumulators to zero
2885  if (itemID < actualNumPixelInThisChunk) {
2886  outputPixelCache[itemID] = (float4)0.0f;
2887  densityCache[itemID] = 0.0f;
2888  if (matte != 0)
2889  gammaCache[itemID] = 0.0f;
2890  }
2891  barrier(CLK_LOCAL_MEM_FENCE);
2892 
2893  // accumulatte the filtered pixel value and the density
2894  for (unsigned int i = 0; i < numItems; i++) {
2895  if (pixelIndex != -1) {
2896  if (itemID%numItems == i) {
2897  outputPixelCache[pixelIndex]+=filteredPixel;
2898  densityCache[pixelIndex]+=density;
2899  if (matte!=0) {
2900  gammaCache[pixelIndex]+=gamma;
2901  }
2902  }
2903  }
2904  barrier(CLK_LOCAL_MEM_FENCE);
2905  }
2906 
2907  if (itemID < actualNumPixelInThisChunk) {
2908  if (matte==0) {
2909  float density = densityCache[itemID];
2910  float4 filteredPixel = outputPixelCache[itemID];
2911  if (density!= 0.0f && density != 1.0)
2912  {
2913  density = PerceptibleReciprocal(density);
2914  filteredPixel *= (float4)density;
2915  }
2916  filteredImage[y*filteredColumns+chunkStartX+itemID] = (CLPixelType) (ClampToQuantum(filteredPixel.x)
2917  , ClampToQuantum(filteredPixel.y)
2918  , ClampToQuantum(filteredPixel.z)
2919  , ClampToQuantum(filteredPixel.w));
2920  }
2921  else {
2922  float density = densityCache[itemID];
2923  float gamma = gammaCache[itemID];
2924  float4 filteredPixel = outputPixelCache[itemID];
2925 
2926  if (density!= 0.0f && density != 1.0) {
2927  density = PerceptibleReciprocal(density);
2928  filteredPixel *= (float4)density;
2929  gamma *= density;
2930  }
2931  gamma = PerceptibleReciprocal(gamma);
2932 
2933  CLPixelType fp;
2934  fp = (CLPixelType) ( ClampToQuantum(gamma*filteredPixel.x)
2935  , ClampToQuantum(gamma*filteredPixel.y)
2936  , ClampToQuantum(gamma*filteredPixel.z)
2937  , ClampToQuantum(filteredPixel.w));
2938 
2939  filteredImage[y*filteredColumns+chunkStartX+itemID] = fp;
2940 
2941  }
2942  }
2943 
2944  } // end of chunking loop
2945  }
2946  )
2947 
2948 
2949  STRINGIFY(
2950  __kernel __attribute__((reqd_work_group_size(1, 256, 1)))
2951  void ResizeVerticalFilter(const __global CLPixelType* inputImage, const unsigned int inputColumns, const unsigned int inputRows, const unsigned int matte
2952  , const float yFactor, __global CLPixelType* filteredImage, const unsigned int filteredColumns, const unsigned int filteredRows
2953  , const int resizeFilterType, const int resizeWindowType
2954  , const __global float* resizeFilterCubicCoefficients
2955  , const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport, const float resizeFilterBlur
2956  , __local CLPixelType* inputImageCache, const int numCachedPixels, const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize
2957  , __local float4* outputPixelCache, __local float* densityCache, __local float* gammaCache) {
2958 
2959 
2960  // calculate the range of resized image pixels computed by this workgroup
2961  const unsigned int startY = get_group_id(1)*pixelPerWorkgroup;
2962  const unsigned int stopY = MagickMin(startY + pixelPerWorkgroup,filteredRows);
2963  const unsigned int actualNumPixelToCompute = stopY - startY;
2964 
2965  // calculate the range of input image pixels to cache
2966  float scale = MagickMax(1.0f/yFactor+MagickEpsilon ,1.0f);
2967  const float support = MagickMax(scale*resizeFilterSupport,0.5f);
2968  scale = PerceptibleReciprocal(scale);
2969 
2970  const int cacheRangeStartY = MagickMax((int)((startY+0.5f)/yFactor+MagickEpsilon-support+0.5f),(int)(0));
2971  const int cacheRangeEndY = MagickMin((int)(cacheRangeStartY + numCachedPixels), (int)inputRows);
2972 
2973  // cache the input pixels into local memory
2974  const unsigned int x = get_global_id(0);
2975  event_t e = async_work_group_strided_copy(inputImageCache, inputImage+cacheRangeStartY*inputColumns+x, cacheRangeEndY-cacheRangeStartY, inputColumns, 0);
2976  wait_group_events(1,&e);
2977 
2978  unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
2979  for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
2980  {
2981 
2982  const unsigned int chunkStartY = startY + chunk*pixelChunkSize;
2983  const unsigned int chunkStopY = MagickMin(chunkStartY + pixelChunkSize, stopY);
2984  const unsigned int actualNumPixelInThisChunk = chunkStopY - chunkStartY;
2985 
2986  // determine which resized pixel computed by this workitem
2987  const unsigned int itemID = get_local_id(1);
2988  const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(1));
2989 
2990  const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(1));
2991 
2992  float4 filteredPixel = (float4)0.0f;
2993  float density = 0.0f;
2994  float gamma = 0.0f;
2995  // -1 means this workitem doesn't participate in the computation
2996  if (pixelIndex != -1) {
2997 
2998  // x coordinated of the resized pixel computed by this workitem
2999  const int y = chunkStartY + pixelIndex;
3000 
3001  // calculate how many steps required for this pixel
3002  const float bisect = (y+0.5)/yFactor+MagickEpsilon;
3003  const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
3004  const unsigned int stop = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputRows);
3005  const unsigned int n = stop - start;
3006 
3007  // calculate how many steps this workitem will contribute
3008  unsigned int numStepsPerWorkItem = n / numItems;
3009  numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
3010 
3011  const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
3012  if (startStep < n) {
3013  const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
3014 
3015  unsigned int cacheIndex = start+startStep-cacheRangeStartY;
3016  if (matte == 0) {
3017 
3018  for (unsigned int i = startStep; i < stopStep; i++,cacheIndex++) {
3019  float4 cp = convert_float4(inputImageCache[cacheIndex]);
3020 
3021  float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,(ResizeWeightingFunctionType)resizeFilterType
3022  , (ResizeWeightingFunctionType)resizeWindowType
3023  , resizeFilterScale, resizeFilterWindowSupport, resizeFilterBlur,scale*(start+i-bisect+0.5));
3024 
3025  filteredPixel += ((float4)weight)*cp;
3026  density+=weight;
3027  }
3028 
3029 
3030  }
3031  else {
3032  for (unsigned int i = startStep; i < stopStep; i++,cacheIndex++) {
3033  CLPixelType p = inputImageCache[cacheIndex];
3034 
3035  float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,(ResizeWeightingFunctionType)resizeFilterType
3036  , (ResizeWeightingFunctionType)resizeWindowType
3037  , resizeFilterScale, resizeFilterWindowSupport, resizeFilterBlur,scale*(start+i-bisect+0.5));
3038 
3039  float alpha = weight * QuantumScale * GetPixelAlpha(p);
3040  float4 cp = convert_float4(p);
3041 
3042  filteredPixel.x += alpha * cp.x;
3043  filteredPixel.y += alpha * cp.y;
3044  filteredPixel.z += alpha * cp.z;
3045  filteredPixel.w += weight * cp.w;
3046 
3047  density+=weight;
3048  gamma+=alpha;
3049  }
3050  }
3051  }
3052  }
3053 
3054  // initialize the accumulators to zero
3055  if (itemID < actualNumPixelInThisChunk) {
3056  outputPixelCache[itemID] = (float4)0.0f;
3057  densityCache[itemID] = 0.0f;
3058  if (matte != 0)
3059  gammaCache[itemID] = 0.0f;
3060  }
3061  barrier(CLK_LOCAL_MEM_FENCE);
3062 
3063  // accumulatte the filtered pixel value and the density
3064  for (unsigned int i = 0; i < numItems; i++) {
3065  if (pixelIndex != -1) {
3066  if (itemID%numItems == i) {
3067  outputPixelCache[pixelIndex]+=filteredPixel;
3068  densityCache[pixelIndex]+=density;
3069  if (matte!=0) {
3070  gammaCache[pixelIndex]+=gamma;
3071  }
3072  }
3073  }
3074  barrier(CLK_LOCAL_MEM_FENCE);
3075  }
3076 
3077  if (itemID < actualNumPixelInThisChunk) {
3078  if (matte==0) {
3079  float density = densityCache[itemID];
3080  float4 filteredPixel = outputPixelCache[itemID];
3081  if (density!= 0.0f && density != 1.0)
3082  {
3083  density = PerceptibleReciprocal(density);
3084  filteredPixel *= (float4)density;
3085  }
3086  filteredImage[(chunkStartY+itemID)*filteredColumns+x] = (CLPixelType) (ClampToQuantum(filteredPixel.x)
3087  , ClampToQuantum(filteredPixel.y)
3088  , ClampToQuantum(filteredPixel.z)
3089  , ClampToQuantum(filteredPixel.w));
3090  }
3091  else {
3092  float density = densityCache[itemID];
3093  float gamma = gammaCache[itemID];
3094  float4 filteredPixel = outputPixelCache[itemID];
3095 
3096  if (density!= 0.0f && density != 1.0) {
3097  density = PerceptibleReciprocal(density);
3098  filteredPixel *= (float4)density;
3099  gamma *= density;
3100  }
3101  gamma = PerceptibleReciprocal(gamma);
3102 
3103  CLPixelType fp;
3104  fp = (CLPixelType) ( ClampToQuantum(gamma*filteredPixel.x)
3105  , ClampToQuantum(gamma*filteredPixel.y)
3106  , ClampToQuantum(gamma*filteredPixel.z)
3107  , ClampToQuantum(filteredPixel.w));
3108 
3109  filteredImage[(chunkStartY+itemID)*filteredColumns+x] = fp;
3110 
3111  }
3112  }
3113 
3114  } // end of chunking loop
3115  }
3116  )
3117 
3118 /*
3119 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
3120 % %
3121 % %
3122 % %
3123 % U n s h a r p M a s k %
3124 % %
3125 % %
3126 % %
3127 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
3128 */
3129 
3130  STRINGIFY(
3131  __kernel void UnsharpMaskBlurColumn(const __global CLPixelType* inputImage,
3132  const __global float4 *blurRowData, __global CLPixelType *filtered_im,
3133  const unsigned int imageColumns, const unsigned int imageRows,
3134  __local float4* cachedData, __local float* cachedFilter,
3135  const ChannelType channel, const __global float *filter, const unsigned int width,
3136  const float gain, const float threshold)
3137  {
3138  const unsigned int radius = (width-1)/2;
3139 
3140  // cache the pixel shared by the workgroup
3141  const int groupX = get_group_id(0);
3142  const int groupStartY = get_group_id(1)*get_local_size(1) - radius;
3143  const int groupStopY = (get_group_id(1)+1)*get_local_size(1) + radius;
3144 
3145  if (groupStartY >= 0
3146  && groupStopY < imageRows) {
3147  event_t e = async_work_group_strided_copy(cachedData
3148  ,blurRowData+groupStartY*imageColumns+groupX
3149  ,groupStopY-groupStartY,imageColumns,0);
3150  wait_group_events(1,&e);
3151  }
3152  else {
3153  for (int i = get_local_id(1); i < (groupStopY - groupStartY); i+=get_local_size(1)) {
3154  cachedData[i] = blurRowData[ClampToCanvas(groupStartY+i,imageRows)*imageColumns+ groupX];
3155  }
3156  barrier(CLK_LOCAL_MEM_FENCE);
3157  }
3158  // cache the filter as well
3159  event_t e = async_work_group_copy(cachedFilter,filter,width,0);
3160  wait_group_events(1,&e);
3161 
3162  // only do the work if this is not a patched item
3163  //const int cy = get_group_id(1)*get_local_size(1)+get_local_id(1);
3164  const int cy = get_global_id(1);
3165 
3166  if (cy < imageRows) {
3167  float4 blurredPixel = (float4) 0.0f;
3168 
3169  int i = 0;
3170 
3171  \n #ifndef UFACTOR \n
3172  \n #define UFACTOR 8 \n
3173  \n #endif \n
3174 
3175  for ( ; i+UFACTOR < width; )
3176  {
3177  \n #pragma unroll UFACTOR \n
3178  for (int j=0; j < UFACTOR; j++, i++)
3179  {
3180  blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
3181  }
3182  }
3183 
3184  for ( ; i < width; i++)
3185  {
3186  blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
3187  }
3188 
3189  blurredPixel = floor((float4)(ClampToQuantum(blurredPixel.x), ClampToQuantum(blurredPixel.y)
3190  ,ClampToQuantum(blurredPixel.z), ClampToQuantum(blurredPixel.w)));
3191 
3192  float4 inputImagePixel = convert_float4(inputImage[cy*imageColumns+groupX]);
3193  float4 outputPixel = inputImagePixel - blurredPixel;
3194 
3195  float quantumThreshold = QuantumRange*threshold;
3196 
3197  int4 mask = isless(fabs(2.0f*outputPixel), (float4)quantumThreshold);
3198  outputPixel = select(inputImagePixel + outputPixel * gain, inputImagePixel, mask);
3199 
3200  //write back
3201  filtered_im[cy*imageColumns+groupX] = (CLPixelType) (ClampToQuantum(outputPixel.x), ClampToQuantum(outputPixel.y)
3202  ,ClampToQuantum(outputPixel.z), ClampToQuantum(outputPixel.w));
3203 
3204  }
3205  }
3206  )
3207 
3208 
3209 
3210  STRINGIFY(
3211  __kernel void UnsharpMask(__global CLPixelType *im, __global CLPixelType *filtered_im,
3212  __constant float *filter,
3213  const unsigned int width,
3214  const unsigned int imageColumns, const unsigned int imageRows,
3215  __local float4 *pixels,
3216  const float gain, const float threshold, const unsigned int justBlur)
3217  {
3218  const int x = get_global_id(0);
3219  const int y = get_global_id(1);
3220 
3221  const unsigned int radius = (width - 1) / 2;
3222 
3223  int row = y - radius;
3224  int baseRow = get_group_id(1) * get_local_size(1) - radius;
3225  int endRow = (get_group_id(1) + 1) * get_local_size(1) + radius;
3226 
3227  while (row < endRow) {
3228  int srcy = (row < 0) ? -row : row; // mirror pad
3229  srcy = (srcy >= imageRows) ? (2 * imageRows - srcy - 1) : srcy;
3230 
3231  float4 value = 0.0f;
3232 
3233  int ix = x - radius;
3234  int i = 0;
3235 
3236  while (i + 7 < width) {
3237  for (int j = 0; j < 8; ++j) { // unrolled
3238  int srcx = ix + j;
3239  srcx = (srcx < 0) ? -srcx : srcx;
3240  srcx = (srcx >= imageColumns) ? (2 * imageColumns - srcx - 1) : srcx;
3241  value += filter[i + j] * convert_float4(im[srcx + srcy * imageColumns]);
3242  }
3243  ix += 8;
3244  i += 8;
3245  }
3246 
3247  while (i < width) {
3248  int srcx = (ix < 0) ? -ix : ix; // mirror pad
3249  srcx = (srcx >= imageColumns) ? (2 * imageColumns - srcx - 1) : srcx;
3250  value += filter[i] * convert_float4(im[srcx + srcy * imageColumns]);
3251  ++i;
3252  ++ix;
3253  }
3254  pixels[(row - baseRow) * get_local_size(0) + get_local_id(0)] = value;
3255  row += get_local_size(1);
3256  }
3257 
3258 
3259  barrier(CLK_LOCAL_MEM_FENCE);
3260 
3261 
3262  const int px = get_local_id(0);
3263  const int py = get_local_id(1);
3264  const int prp = get_local_size(0);
3265  float4 value = (float4)(0.0f);
3266 
3267  int i = 0;
3268  while (i + 7 < width) { // unrolled
3269  value += (float4)(filter[i]) * pixels[px + (py + i) * prp];
3270  value += (float4)(filter[i]) * pixels[px + (py + i + 1) * prp];
3271  value += (float4)(filter[i]) * pixels[px + (py + i + 2) * prp];
3272  value += (float4)(filter[i]) * pixels[px + (py + i + 3) * prp];
3273  value += (float4)(filter[i]) * pixels[px + (py + i + 4) * prp];
3274  value += (float4)(filter[i]) * pixels[px + (py + i + 5) * prp];
3275  value += (float4)(filter[i]) * pixels[px + (py + i + 6) * prp];
3276  value += (float4)(filter[i]) * pixels[px + (py + i + 7) * prp];
3277  i += 8;
3278  }
3279  while (i < width) {
3280  value += (float4)(filter[i]) * pixels[px + (py + i) * prp];
3281  ++i;
3282  }
3283  if ((x < imageColumns) && (y < imageRows)) {
3284  if (justBlur == 0) { // apply sharpening
3285  float4 srcPixel = convert_float4(im[x + y * imageColumns]);
3286  float4 diff = srcPixel - value;
3287 
3288  float quantumThreshold = QuantumRange*threshold;
3289 
3290  int4 mask = isless(fabs(2.0f * diff), (float4)quantumThreshold);
3291  value = select(srcPixel + diff * gain, srcPixel, mask);
3292  }
3293  filtered_im[x + y * imageColumns] = (CLPixelType)(ClampToQuantum(value.s0), ClampToQuantum(value.s1), ClampToQuantum(value.s2), ClampToQuantum(value.s3));
3294  }
3295  }
3296  )
3297 
3298  STRINGIFY(
3299  __kernel __attribute__((reqd_work_group_size(64, 4, 1))) void WaveletDenoise(__global CLPixelType *srcImage, __global CLPixelType *dstImage,
3300  const float threshold,
3301  const int passes,
3302  const int imageWidth,
3303  const int imageHeight)
3304  {
3305  const int pad = (1 << (passes - 1));
3306  const int tileSize = 64;
3307  const int tileRowPixels = 64;
3308  const float noise[] = { 0.8002, 0.2735, 0.1202, 0.0585, 0.0291, 0.0152, 0.0080, 0.0044 };
3309 
3310  CLPixelType stage[16];
3311 
3312  local float buffer[64 * 64];
3313 
3314  int srcx = (get_group_id(0) + get_global_offset(0) / tileSize) * (tileSize - 2 * pad) - pad + get_local_id(0);
3315  int srcy = (get_group_id(1) + get_global_offset(1) / 4) * (tileSize - 2 * pad) - pad;
3316 
3317  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3318  stage[i / 4] = srcImage[mirrorTop(mirrorBottom(srcx), imageWidth) + (mirrorTop(mirrorBottom(srcy + i) , imageHeight)) * imageWidth];
3319  }
3320 
3321 
3322  for (int channel = 0; channel < 3; ++channel) {
3323  // Load LDS
3324  switch (channel) {
3325  case 0:
3326  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3327  buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[i / 4].s0);
3328  break;
3329  case 1:
3330  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3331  buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[i / 4].s1);
3332  break;
3333  case 2:
3334  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3335  buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[i / 4].s2);
3336  break;
3337  }
3338 
3339 
3340  // Process
3341 
3342  float tmp[16];
3343  float accum[16];
3344  float pixel;
3345 
3346  for (int pass = 0; pass < passes; ++pass) {
3347  const int radius = 1 << pass;
3348  const int x = get_local_id(0);
3349  const float thresh = threshold * noise[pass];
3350 
3351  if (pass == 0)
3352  accum[0] = accum[1] = accum[2] = accum[3] = accum[4] = accum[5] = accum[6] = accum[6] = accum[7] = accum[8] = accum[9] = accum[10] = accum[11] = accum[12] = accum[13] = accum[14] = accum[15] = 0.0f;
3353 
3354  // Snapshot input
3355 
3356  // Apply horizontal hat
3357  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3358  const int offset = i * tileRowPixels;
3359  if (pass == 0)
3360  tmp[i / 4] = buffer[x + offset]; // snapshot input on first pass
3361  pixel = 0.5f * tmp[i / 4] + 0.25 * (buffer[mirrorBottom(x - radius) + offset] + buffer[mirrorTop(x + radius, tileSize) + offset]);
3362  barrier(CLK_LOCAL_MEM_FENCE);
3363  buffer[x + offset] = pixel;
3364  }
3365  barrier(CLK_LOCAL_MEM_FENCE);
3366  // Apply vertical hat
3367  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3368  pixel = 0.5f * buffer[x + i * tileRowPixels] + 0.25 * (buffer[x + mirrorBottom(i - radius) * tileRowPixels] + buffer[x + mirrorTop(i + radius, tileRowPixels) * tileRowPixels]);
3369  float delta = tmp[i / 4] - pixel;
3370  tmp[i / 4] = pixel; // hold output in tmp until all workitems are done
3371  if (delta < -thresh)
3372  delta += thresh;
3373  else if (delta > thresh)
3374  delta -= thresh;
3375  else
3376  delta = 0;
3377  accum[i / 4] += delta;
3378 
3379  }
3380  barrier(CLK_LOCAL_MEM_FENCE);
3381  if (pass < passes - 1)
3382  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3383  buffer[x + i * tileRowPixels] = tmp[i / 4]; // store lowpass for next pass
3384  else // last pass
3385  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3386  accum[i / 4] += tmp[i / 4]; // add the lowpass signal back to output
3387  barrier(CLK_LOCAL_MEM_FENCE);
3388  }
3389 
3390  switch (channel) {
3391  case 0:
3392  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3393  stage[i / 4].s0 = ClampToQuantum(accum[i / 4]);
3394  break;
3395  case 1:
3396  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3397  stage[i / 4].s1 = ClampToQuantum(accum[i / 4]);
3398  break;
3399  case 2:
3400  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3401  stage[i / 4].s2 = ClampToQuantum(accum[i / 4]);
3402  break;
3403  }
3404 
3405  barrier(CLK_LOCAL_MEM_FENCE);
3406  }
3407 
3408  // Write from stage to output
3409 
3410  if ((get_local_id(0) >= pad) && (get_local_id(0) < tileSize - pad) && (srcx >= 0) && (srcx < imageWidth)) {
3411  //for (int i = pad + get_local_id(1); i < tileSize - pad; i += get_local_size(1)) {
3412  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3413  if ((i >= pad) && (i < tileSize - pad) && (srcy + i >= 0) && (srcy + i < imageHeight)) {
3414  dstImage[srcx + (srcy + i) * imageWidth] = stage[i / 4];
3415  }
3416  }
3417  }
3418  }
3419  )
3420  ;
3421 
3422 #endif // MAGICKCORE_OPENCL_SUPPORT
3423 
3424 #if defined(__cplusplus) || defined(c_plusplus)
3425 }
3426 #endif
3427 
3428 #endif // MAGICKCORE_ACCELERATE_PRIVATE_H
Definition: composite.h:91
Definition: composite.h:94
Definition: composite.h:65
Definition: colorspace.h:44
Definition: resize-private.h:31
Definition: colorspace.h:36
#define SigmaPoisson
Definition: resize-private.h:37
Definition: pixel.h:75
Definition: visual-effects.h:34
Definition: statistic.h:117
Definition: resize-private.h:33
Definition: magick-type.h:185
Definition: composite.h:75
Definition: pixel.h:72
Definition: colorspace.h:40
static void MagickPixelCompositeBlend(const MagickPixelPacket *p, const MagickRealType alpha, const MagickPixelPacket *q, const MagickRealType beta, MagickPixelPacket *composite)
Definition: composite-private.h:138
Definition: composite.h:31
Definition: composite.h:93
Definition: colorspace.h:45
Definition: colorspace.h:33
Definition: composite.h:80
#define SigmaRandom
Definition: composite.h:33
Definition: resize-private.h:40
Definition: composite.h:90
Definition: resize-private.h:29
static MagickRealType ColorDodge(const MagickRealType Sca, const MagickRealType Sa, const MagickRealType Dca, const MagickRealType Da)
Definition: composite.c:287
PixelIntensityMethod
Definition: pixel.h:67
Definition: magick-type.h:174
Definition: composite.h:95
Definition: colorspace.h:59
Definition: magick-type.h:180
Definition: composite.h:59
Definition: composite.h:89
Definition: magick-type.h:169
Definition: composite.h:27
Definition: colorspace.h:41
Definition: colorspace.h:37
static MagickRealType RoundToUnity(const MagickRealType value)
Definition: composite-private.h:33
Definition: composite.h:35
Definition: composite.h:87
#define MagickPI
Definition: image-private.h:40
Definition: colorspace.h:58
Definition: colorspace.h:50
static MagickRealType Hanning(const MagickRealType x, const ResizeFilter *magick_unused(resize_filter))
Definition: resize.c:282
Definition: colorspace.h:47
Definition: statistic.h:116
Definition: colorspace.h:31
#define MAGICKCORE_QUANTUM_DEPTH
Definition: magick-type.h:28
Definition: composite.h:53
Definition: colorspace.h:35
NoiseType
Definition: visual-effects.h:27
Definition: resize-private.h:38
Definition: pixel.h:77
#define MagickEpsilon
Definition: magick-type.h:115
#define SigmaLaplacian
MagickExport void ConvertRGBToHSL(const Quantum red, const Quantum green, const Quantum blue, double *hue, double *saturation, double *lightness)
Definition: gem.c:1127
Definition: magick-type.h:175
Definition: colorspace.h:48
static Quantum ClampToQuantum(const MagickRealType quantum)
Definition: quantum.h:88
Definition: statistic.h:118
Definition: magick-type.h:187
Definition: visual-effects.h:36
Definition: visual-effects.h:35
Definition: colorspace.h:52
Definition: composite.h:47
static MagickRealType Hamming(const MagickRealType x, const ResizeFilter *magick_unused(resize_filter))
Definition: resize.c:294
Definition: resize-private.h:41
Definition: composite.h:73
Definition: composite.h:29
Definition: composite.h:72
Definition: composite.h:42
Definition: colorspace.h:43
#define SigmaUniform
Definition: composite.h:97
static void ModulateHSL(const double percent_hue, const double percent_saturation, const double percent_lightness, Quantum *red, Quantum *green, Quantum *blue)
Definition: enhance.c:3584
Definition: colorspace.h:34
Definition: colorspace.h:57
Definition: resize-private.h:30
static double PerceptibleReciprocal(const double x)
Definition: pixel-accessor.h:124
Definition: composite.h:54
#define GetPixelAlpha(pixel)
Definition: pixel-accessor.h:36
Definition: composite.h:38
Definition: composite.h:68
Definition: composite.h:96
Definition: magick-type.h:171
Definition: composite.h:71
Definition: resize-private.h:32
Definition: composite.h:55
Definition: composite.h:56
#define SigmaGaussian
Definition: composite.h:69
Definition: pixel.h:71
static Quantum ApplyFunction(Quantum pixel, const MagickFunction function, const size_t number_parameters, const double *parameters, ExceptionInfo *exception)
Definition: statistic.c:1000
Definition: colorspace.h:38
Definition: pixel.h:70
Definition: composite.h:86
Definition: resize-private.h:36
Definition: colorspace.h:30
#define SigmaMultiplicativeGaussian
Definition: visual-effects.h:30
Definition: composite.h:49
Definition: composite.h:44
#define TauGaussian
MagickExport void ConvertRGBToHSB(const Quantum red, const Quantum green, const Quantum blue, double *hue, double *saturation, double *brightness)
Definition: gem.c:994
Definition: magick-type.h:173
static void Contrast(const int sign, Quantum *red, Quantum *green, Quantum *blue)
Definition: enhance.c:917
Definition: magick-type.h:188
Definition: composite.h:46
Definition: statistic.h:114
Definition: composite.h:28
Definition: magick-type.h:168
Definition: magick-type.h:177
Definition: colorspace.h:54
Definition: magick-type.h:176
Definition: resize-private.h:39
Definition: composite.h:78
Definition: resize-private.h:34
#define QuantumScale
Definition: magick-type.h:120
Definition: colorspace.h:55
Definition: composite.h:62
Definition: colorspace.h:39
#define MaxMap
Definition: magick-type.h:78
Definition: magick-type.h:184
#define MagickMax(x, y)
Definition: image-private.h:36
Definition: composite.h:98
Definition: composite.h:39
static void CompositeColorDodge(const MagickPixelPacket *p, const MagickPixelPacket *q, MagickPixelPacket *composite)
Definition: composite.c:324
MagickExport void ConvertHSBToRGB(const double hue, const double saturation, const double brightness, Quantum *red, Quantum *green, Quantum *blue)
Definition: gem.c:284
Definition: composite.h:45
ChannelType
Definition: magick-type.h:164
Definition: composite.h:70
Definition: colorspace.h:46
Definition: resize-private.h:28
Definition: composite.h:81
Definition: composite.h:41
Definition: composite.h:52
Definition: pixel.h:69
Definition: colorspace.h:49
MagickExport void ConvertHSLToRGB(const double hue, const double saturation, const double lightness, Quantum *red, Quantum *green, Quantum *blue)
Definition: gem.c:460
Definition: composite.h:77
Definition: colorspace.h:53
Definition: composite.h:61
Definition: magick-type.h:170
static void MagickPixelCompositePlus(const MagickPixelPacket *p, const MagickRealType alpha, const MagickPixelPacket *q, const MagickRealType beta, MagickPixelPacket *composite)
Definition: composite-private.h:111
Definition: composite.h:76
Definition: visual-effects.h:32
Definition: magick-type.h:166
Definition: colorspace.h:28
Definition: resize-private.h:42
Definition: composite.h:50
Definition: composite.h:36
Definition: composite.h:43
MagickExport MagickRealType GetPixelIntensity(const Image *image, const PixelPacket *magick_restrict pixel)
Definition: pixel.c:2292
Definition: visual-effects.h:31
static MagickRealType Sinc(const MagickRealType, const ResizeFilter *)
Definition: composite.h:37
Definition: composite.h:60
Definition: statistic.h:115
ResizeWeightingFunctionType
Definition: resize-private.h:25
static MagickRealType Blackman(const MagickRealType x, const ResizeFilter *magick_unused(resize_filter))
Definition: resize.c:148
Definition: colorspace.h:56
#define MagickMin(x, y)
Definition: image-private.h:37
ColorspaceType
Definition: colorspace.h:25
Definition: composite.h:32
Definition: colorspace.h:29
Definition: composite.h:88
Definition: colorspace.h:42
#define SigmaImpulse
Definition: composite.h:48
Definition: composite.h:64
Definition: magick-type.h:172
Definition: colorspace.h:51
CompositeOperator
Definition: composite.h:25
Definition: composite.h:79
Definition: colorspace.h:62
Definition: magick-type.h:179
Definition: colorspace.h:32
Definition: pixel.h:78
Definition: composite.h:66
Definition: composite.h:30
Definition: colorspace.h:60
Definition: magick-type.h:167
Definition: composite.h:63
Definition: composite.h:58
Definition: composite.h:92
Definition: magick-type.h:186
Definition: composite.h:34
Definition: visual-effects.h:29
static MagickRealType CubicBC(const MagickRealType x, const ResizeFilter *resize_filter)
Definition: resize.c:205
Definition: resize-private.h:27
Definition: composite.h:74
Definition: colorspace.h:27
MagickFunction
Definition: statistic.h:112
Definition: composite.h:40
Definition: composite.h:67
Definition: resize-private.h:35
#define QuantumRange
Definition: magick-type.h:86
static MagickRealType Triangle(const MagickRealType x, const ResizeFilter *magick_unused(resize_filter))
Definition: resize.c:505
Definition: pixel.h:73
Definition: composite.h:51
Definition: magick-type.h:178
Definition: composite.h:57
Definition: visual-effects.h:33