////////////////////////////////////////////////////////
//
// GEM - Graphics Environment for Multimedia
//
// zmoelnig@iem.kug.ac.at
//
// Implementation file
//
//    Copyright (c) 1997-1998 Mark Danks.
//    Copyright (c) Günther Geiger.
//    Copyright (c) 2001-2011 IOhannes m zmölnig. forum::für::umläute. IEM. zmoelnig@iem.at
//    Copyright (c) 2002 James Tittle & Chris Clepper
//    For information on usage and redistribution, and for a DISCLAIMER OF ALL
//    WARRANTIES, see the file, "GEM.LICENSE.TERMS" in this distribution.
//
/////////////////////////////////////////////////////////

#include "pix_convolve.h"
19
#include "Gem/Exception.h"
20
#include "Utils/Functions.h"
21

22
// Declares the object creator for pix_convolve with two float creation
// arguments, matching the (fRow, fCol) constructor below.
// NOTE(review): A_DEFFLOAT presumably makes omitted arguments default to 0,
// which the constructor then rejects — confirm against the macro definition.
CPPEXTERN_NEW_WITH_TWO_ARGS(pix_convolve, t_floatarg, A_DEFFLOAT, t_floatarg, A_DEFFLOAT);
23 24 25 26 27 28 29 30 31

/////////////////////////////////////////////////////////
//
// pix_convolve
//
/////////////////////////////////////////////////////////
// Constructor
//
/////////////////////////////////////////////////////////
32 33 34 35 36
// Builds a convolution object with an fRow x fCol coefficient matrix.
//
// fRow / fCol are truncated to int; both must be positive and odd, otherwise
// a GemException is thrown. The matrix is initialised to all zeros with a
// single 255 at its centre, and two inlets are created: a "float" inlet
// routed to "ft1" and a "list" inlet routed to "matrix".
pix_convolve :: pix_convolve(t_floatarg fRow, t_floatarg fCol) :
  m_imatrix(NULL),
  m_irange(255),
  m_rows(0), m_cols(0),
  m_chroma(0)
{
  const int row = static_cast<int>(fRow);
  const int col = static_cast<int>(fCol);

  // zero and negative sizes are both unusable (a negative size would also
  // produce a bogus allocation length below)
  if (row <= 0 || col <= 0) {
    throw(GemException("matrix must have some dimension"));
  }

  // an even-sized matrix has no centre element
  if (!(row % 2) || !(col % 2)) {
    throw(GemException("matrix must have odd dimensions"));
  }

  m_rows = row;
  m_cols = col;

  const int count = m_rows * m_cols;
  m_imatrix = new signed short[count];

  // zero out the matrix
  for (int i = 0; i < count; i++) {
    m_imatrix[i] = 0;
  }

  // Default centre coefficient. The processing code addresses the matrix as
  // m_imatrix[matY * m_rows + matX] with matY < m_cols and matX < m_rows, so
  // the centre lives at (m_cols/2, m_rows/2). The previous "+ 1" on both
  // terms selected the last cell for 3x3 and wrote out of bounds for 1x1.
  m_imatrix[(m_cols / 2) * m_rows + (m_rows / 2)] = 255;

  inlet_new(this->x_obj, &this->x_obj->ob_pd, gensym("float"), gensym("ft1"));
  inlet_new(this->x_obj, &this->x_obj->ob_pd, gensym("list"), gensym("matrix"));
}

/////////////////////////////////////////////////////////
// Destructor
//
/////////////////////////////////////////////////////////
// Releases the coefficient matrix allocated by the constructor.
pix_convolve :: ~pix_convolve()
{
  // delete[] on a null pointer is a no-op, so no explicit guard is needed
  delete [] m_imatrix;
}

/////////////////////////////////////////////////////////
// processImage
//
/////////////////////////////////////////////////////////
78
// 3x3 convolution for images that are addressed one int per pixel.
//
// image   : destination, written in place
// tempImg : source snapshot that the kernel reads from
//
// For every pixel that is not in the first/last column (and not in the first
// or last row, by virtue of the index range), three channels are replaced by
// sum(m_imatrix[k] * neighbour[k]) * m_irange >> 16, clamped with CLAMP.
// The channel range is 0..2, or 1..3 when __APPLE__ is defined.
void pix_convolve :: calculateRGBA3x3(imageStruct &image,imageStruct &tempImg)
{
  const int width   = tempImg.xsize;
  const int height  = tempImg.ysize;
  const int csize   = tempImg.csize;
  const int lastPos = width*height - width - 1;   // exclusive upper bound

  // the source is walked in whole pixels (int-sized steps)
  int *srcPixels = reinterpret_cast<int*>(tempImg.data);
  unsigned char *dstBytes = reinterpret_cast<unsigned char*>(image.data);

#ifndef __APPLE__
  const int firstChannel = 0;
  const int endChannel   = 3;
#else
  const int firstChannel = 1;
  const int endChannel   = 4;
#endif

  for (int pos = width + 1; pos < lastPos; pos++) {
    const int column = pos % width;
    if (column == 0 || column == width - 1) {
      continue;   // leave the left and right border columns untouched
    }

    // left-most pixel of each of the three kernel rows
    int *kernelRow[3];
    kernelRow[0] = srcPixels + pos - width - 1;
    kernelRow[1] = srcPixels + pos - 1;
    kernelRow[2] = srcPixels + pos + width - 1;

    for (int channel = firstChannel; channel < endChannel; channel++) {
      int sum = 0;
      for (int r = 0; r < 3; r++) {
        for (int c = 0; c < 3; c++) {
          const unsigned char *pixel =
            reinterpret_cast<unsigned char*>(kernelRow[r] + c);
          sum += m_imatrix[r*3 + c] * static_cast<int>(pixel[channel]);
        }
      }
      sum *= m_irange;
      sum >>= 16;
      dstBytes[pos*csize + channel] = CLAMP(sum);
    }
  }
}

zmoelnig's avatar
zmoelnig committed
141
// Convolves an image in place with the m_rows x m_cols matrix.
//
// The image is first copied into tempImg so the kernel reads unmodified
// data. A 3x3 matrix is handed to calculateRGBA3x3(); any other size uses the
// generic loop below, which leaves a border of half the matrix size
// untouched and skips channel 0 of every pixel.
void pix_convolve :: processRGBAImage(imageStruct &image)
{
  image.copy2Image(&tempImg);

  if (m_rows == 3 && m_cols == 3) {
    calculateRGBA3x3(image,tempImg);
    return;
  }

  const int csize     = tempImg.csize;
  const int rowStride = tempImg.xsize * csize;
  const int halfX     = m_rows / 2;
  const int halfY     = m_cols / 2;
  const int endX      = tempImg.xsize - halfX;
  const int endY      = tempImg.ysize - halfY;
  // byte distance from the kernel's centre back to its top-left tap
  const int originShift = halfY * rowStride + halfX * csize;

  for (int y = halfY; y < endY; y++) {
    for (int x = halfX; x < endX; x++) {
      const int dstPos = x * csize + y * rowStride;
      const int srcPos = dstPos - originShift;

      // channel 0 is skipped (the original comment calls it the alpha value)
      for (int c = 1; c < csize; c++) {
        int acc = 0;
        for (int ky = 0; ky < m_cols; ky++) {
          const int srcRow = ky * rowStride + srcPos + c;
          const int matRow = ky * m_rows;
          for (int kx = 0; kx < m_rows; kx++) {
            // each product is scaled down by 256 before being accumulated
            acc += (tempImg.data[srcRow + kx * csize] *
                    m_imatrix[matRow + kx])>>8;
          }
        }
        // m_irange is not applied in this generic path
        image.data[dstPos + c] = CLAMP(acc);
      }
    }
  }
}

zmoelnig's avatar
zmoelnig committed
192 193 194 195 196 197 198 199 200 201 202 203 204 205 206
// Convolves a single-channel image in place with the m_rows x m_cols matrix.
//
// The image is copied into tempImg, then every pixel outside a border of half
// the matrix size is replaced by the clamped sum of
// (source * coefficient) >> 8 over the kernel window.
void pix_convolve :: processGrayImage(imageStruct &image)
{
  const int csize = image.csize;
  image.copy2Image(&tempImg);

  const int rowStride = tempImg.xsize * csize;
  const int halfX     = m_rows / 2;
  const int halfY     = m_cols / 2;
  const int endX      = tempImg.xsize - halfX;
  const int endY      = tempImg.ysize - halfY;
  // distance from the kernel's centre back to its top-left tap
  const int originShift = halfY * rowStride + halfX * csize;

  for (int y = halfY; y < endY; y++) {
    const int rowStart = y * rowStride;

    for (int x = halfX; x < endX; x++) {
      // x is used directly as a byte offset here (not multiplied by csize)
      const int srcPos = x + rowStart - originShift;

      int acc = 0;
      for (int ky = 0; ky < m_cols; ky++) {
        const int srcRow = ky * rowStride + srcPos;
        const int matRow = ky * m_rows;
        for (int kx = 0; kx < m_rows; kx++) {
          acc += (tempImg.data[srcRow + kx] *
                  m_imatrix[matRow + kx])>>8;
        }
      }
      // m_irange is not applied in this path
      image.data[x + rowStart] = CLAMP(acc);
    }
  }
}
226

227 228
// Convolves the byte at offset 1 of every pixel (the luma byte, going by the
// original "skip the UV" comment) with the m_rows x m_cols matrix.
//
// The image is copied into tempImg first. On big-endian builds a 3x3 matrix
// is delegated to calculate3x3YUV(). Otherwise each pixel outside a border of
// half the matrix size gets its offset-1 byte replaced by the clamped kernel
// sum; when m_chroma is zero the byte at offset 0 is additionally set to 128.
void pix_convolve :: processYUVImage(imageStruct &image)
{
  image.copy2Image(&tempImg);

  //quick fix for Intel 3x3YUV problems
#ifdef __BIG_ENDIAN__
  if (m_rows == 3 && m_cols == 3) {
    calculate3x3YUV(image,tempImg);
    return;
  }
#endif

  const int csize     = tempImg.csize;
  const int rowStride = tempImg.xsize * csize;
  const int halfX     = m_rows / 2;
  const int halfY     = m_cols / 2;
  const int endX      = tempImg.xsize - halfX;
  const int endY      = tempImg.ysize - halfY;
  const int originShift = halfY * rowStride + halfX * csize;

  const bool keepChroma = (m_chroma != 0);
  const int  lumaOffset = 1;   // the only byte of each pixel that is filtered

  for (int y = halfY; y < endY; y++) {
    for (int x = halfX; x < endX; x++) {
      const int dstPos = x * csize + y * rowStride;
      const int srcPos = dstPos - originShift + lumaOffset;

      int acc = 0;
      for (int ky = 0; ky < m_cols; ky++) {
        const int srcRow = ky * rowStride + srcPos;
        const int matRow = ky * m_rows;
        for (int kx = 0; kx < m_rows; kx++) {
          acc += (tempImg.data[srcRow + kx * csize] *
                  m_imatrix[matRow + kx])>>8;
        }
      }
      image.data[dstPos + lumaOffset] = CLAMP(acc);

      if (!keepChroma) {
        image.data[dstPos + lumaOffset - 1] = 128;  //remove the U+V
      }
    }
  }
}
cclepper's avatar
cclepper committed
311

312
//make two functions - one for chroma one without
cclepper's avatar
cclepper committed
313 314
// 3x3 convolution for 2-byte-per-pixel images (addressed as one short per
// pixel), writing the result to byte 1 of each destination pixel.
//
// image   : destination, written in place
// tempImg : source snapshot that the kernel reads from
//
// When __VEC__ is defined the work is delegated to calculate3x3YUVAltivec().
// Otherwise one of two scalar loops runs, selected by m_chroma:
//   m_chroma != 0 : byte 0 of each destination pixel is left untouched
//   m_chroma == 0 : byte 0 of each destination pixel is forced to 128
// Both loops scale the kernel sum by m_irange and shift right by 16.
//
// Changes from the previous revision: the `register` storage class (ill-formed
// since C++17) is gone, the unused locals `size` and `length` are removed, and
// the two byte-identical arms of an `#ifdef i386` are merged. The arithmetic
// is unchanged.
void pix_convolve :: calculate3x3YUV(imageStruct &image,imageStruct &tempImg)
{
#ifdef __VEC__
  calculate3x3YUVAltivec(image,tempImg);
  return;
#else

  int i;
  int j;
  int k;
  // NOTE(review): these are one less than the image dimensions, yet xsize is
  // also used below as the distance between rows — confirm this is intended.
  int xsize =  tempImg.xsize -1;
  int ysize =  tempImg.ysize -1;

  short* src = (short*) tempImg.data;
  short* dest = (short*)image.data;
  int mat1,mat2,mat3,mat4,mat5,mat6,mat7,mat8,mat9;
  int res1,res2,res3,res4,res5,res6,res7,res8,res9;
  int range;

  mat1 = m_imatrix[0];
  mat2 = m_imatrix[1];
  mat3 = m_imatrix[2];
  mat4 = m_imatrix[3];
  mat5 = m_imatrix[4];
  mat6 = m_imatrix[5];
  mat7 = m_imatrix[6];
  mat8 = m_imatrix[7];
  mat9 = m_imatrix[8];
  range =m_irange;

  if (m_chroma) {
    i = xsize;

    // Sliding 3x3 window of samples; each short read from src is narrowed to
    // unsigned char on assignment.
    unsigned char val1 = 0;
    unsigned char val2 = src[i-xsize+1];
    unsigned char val3 = src[i-xsize+3];
    unsigned char val4 = src[i-1];
    unsigned char val5 = src[i+1];
    unsigned char val6 = src[i+3];
    unsigned char val7 = src[i+xsize-1];
    unsigned char val8 = src[i+xsize+1];
    unsigned char val9 = src[i+xsize+3];

    //unroll this 2x to fill the registers? (matrix*y1*y2= 9*9*9 =27)
    //messed up looking on x86
    i=xsize+2;

    for (k=1; k<ysize; k++) {
      for (j=1; j<xsize; j++) {
        //load furthest value first...the rest should be in cache
        val7 = val8;
        val8 = val9;
        val9 = src[i+xsize+3]; //this will come from main mem
        val1 = val2;
        val2 = val3;
        val3 = src[i-xsize+3]; //should be in cache from previous pass
        val4 = val5;
        val5 = val6;
        val6 = src[i+3];

        res1 = mat1*static_cast<int>(val1);
        res2 = mat2*static_cast<int>(val2);
        res3 = mat3*static_cast<int>(val3);
        res4 = mat4*static_cast<int>(val4);
        res5 = mat5*static_cast<int>(val5);
        res6 = mat6*static_cast<int>(val6);
        res7 = mat7*static_cast<int>(val7);
        res8 = mat8*static_cast<int>(val8);
        res9 = mat9*static_cast<int>(val9);

        res1 += res2 + res3;
        res4 += res5 + res6;
        res7 += res8 + res9;
        res1 += res4 + res7;

        res1*=range;
        res1>>=16;
        ((unsigned char*)dest)[i*2+1] = CLAMP(res1);
        i++;
      }
      // restart at the beginning of row k (the window is not re-primed)
      i=k*tempImg.xsize;
    }
  } else {
    i = xsize;

    // Sliding 3x3 window of pixel pointers.
    short* val1 = NULL;
    short* val2 = src+i-xsize;
    short* val3 = src+i-xsize+1;
    short* val4 = src+i-1;
    short* val5 = src+i;
    short* val6 = src+i+1;
    short* val7 = src+i+xsize-1;
    short* val8 = src+i+xsize;
    short* val9 = src+i+xsize+1;

    for (k=1; k<ysize; k++) {
      for (j=1; j<xsize; j++) {
        val1 = val2;
        val2 = val3;
        val3 = src+i-xsize+1;
        val4 = val5;
        val5 = val6;
        val6 = src+i+1;
        val7 = val8;
        val8 = val9;
        val9 = src+i+xsize+1;

        // NOTE(review): the byte index j is the column counter of the loop
        // above, not a channel index. An earlier, now removed, inner channel
        // loop also used j, so this looks like a leftover; the write below
        // targets byte 1. Behaviour is kept as-is — confirm before changing.
        res1 = mat1*static_cast<int>(reinterpret_cast<unsigned char*>(val1)[j]);
        res2 = mat2*static_cast<int>(reinterpret_cast<unsigned char*>(val2)[j]);
        res3 = mat3*static_cast<int>(reinterpret_cast<unsigned char*>(val3)[j]);
        res4 = mat4*static_cast<int>(reinterpret_cast<unsigned char*>(val4)[j]);
        res5 = mat5*static_cast<int>(reinterpret_cast<unsigned char*>(val5)[j]);
        res6 = mat6*static_cast<int>(reinterpret_cast<unsigned char*>(val6)[j]);
        res7 = mat7*static_cast<int>(reinterpret_cast<unsigned char*>(val7)[j]);
        res8 = mat8*static_cast<int>(reinterpret_cast<unsigned char*>(val8)[j]);
        res9 = mat9*static_cast<int>(reinterpret_cast<unsigned char*>(val9)[j]);
        res1 += res2 + res3;
        res4 += res5 + res6;
        res7 += res8 + res9;
        res1 += res4 + res7;
        res1*=range;
        res1>>=16;
        ((unsigned char*)dest)[i*2+1] = CLAMP(res1);
        ((unsigned char*)dest)[i*2] = 128;   // neutralise byte 0 of the pixel
        i++;
      }
      // restart at the beginning of row k (the window is not re-primed)
      i=k*tempImg.xsize;
    }
  }
#endif
}

cclepper's avatar
cclepper committed
489
//too many temps for all the registers - reuse some
490 491
// AltiVec implementation of the 3x3 convolution for "U Y V Y" byte-ordered
// images. The whole body is compiled only when __VEC__ is defined; otherwise
// the function does nothing.
//
// image   : destination, written in place
// tempImg : source snapshot that the kernel reads from
//
// Per 16-byte vector: the odd bytes (Y) of the nine neighbouring vectors are
// widened to shorts, multiplied by the nine matrix coefficients, summed with
// saturation, shifted right by 8, packed, and re-interleaved with the even
// bytes (U/V) of the centre vector. The first two rows and the last row are
// not processed.
//
// Changes from the previous revision: the `register` storage class (ill-formed
// since C++17) is gone, and the vectors `range`, `uvnone` and `uv128`, which
// were computed but never read, are removed. As before, m_irange and m_chroma
// have no effect in this path.
void pix_convolve :: calculate3x3YUVAltivec(imageStruct &image,imageStruct &tempImg)
{
#ifdef __VEC__
  int h,w,width,i;
  int xsize =  (tempImg.xsize)*2;   // row length in bytes

  width = (tempImg.xsize)/8;        // 16-byte vectors per row
  //format is U Y V Y

  // scratch unions used to move scalars into vector registers
  union
  {
    short   elements[8];
    vector  signed short v;
  }shortBuffer;

  union
  {
    unsigned int    elements[4];
    vector  unsigned int v;
  }intBuffer;

  vector unsigned char one;
  vector signed short mat1,mat2,mat3,mat4,mat5,mat6,mat7,mat8,mat9;
  vector unsigned char  val1,val2,val3,val4,val5,val6,val7,val8,val9;
  vector signed int  res1,res2,res3,res4,res5,res6,res7,res8,res9;
  vector signed int  yhi,ylo;
  vector signed int  res1a,res2a,res3a,res4a,res5a,res6a,res7a,res8a,res9a;
  vector unsigned int bitshift;
  vector signed short y1,y2,y3,y4,y5,y6,y7,y8,y9,yres,uvres,hiImage,loImage;
  unsigned char *dst =  (unsigned char*) image.data;
  unsigned char *src =  (unsigned char*) tempImg.data;

  one =  vec_splat_u8( 1 );

  // shift count (8) replicated across the vector
  intBuffer.elements[0] = 8;
  //Load it into the vector unit
  bitshift = intBuffer.v;
  bitshift = (vector unsigned int)vec_splat((vector unsigned int)bitshift,0);

  //load the matrix values into vectors
  shortBuffer.elements[0] = m_imatrix[0];
  mat1 = shortBuffer.v;
  mat1 = (vector signed short)vec_splat((vector signed short)mat1,0);

  shortBuffer.elements[0] = m_imatrix[1];
  mat2 = shortBuffer.v;
  mat2 = (vector signed short)vec_splat((vector signed short)mat2,0);

  shortBuffer.elements[0] = m_imatrix[2];
  mat3 = shortBuffer.v;
  mat3 = (vector signed short)vec_splat((vector signed short)mat3,0);

  shortBuffer.elements[0] = m_imatrix[3];
  mat4 = shortBuffer.v;
  mat4 = (vector signed short)vec_splat((vector signed short)mat4,0);

  shortBuffer.elements[0] = m_imatrix[4];
  mat5 = shortBuffer.v;
  mat5 = (vector signed short)vec_splat((vector signed short)mat5,0);

  shortBuffer.elements[0] = m_imatrix[5];
  mat6 = shortBuffer.v;
  mat6 = (vector signed short)vec_splat((vector signed short)mat6,0);

  shortBuffer.elements[0] = m_imatrix[6];
  mat7 = shortBuffer.v;
  mat7 = (vector signed short)vec_splat((vector signed short)mat7,0);

  shortBuffer.elements[0] = m_imatrix[7];
  mat8 = shortBuffer.v;
  mat8 = (vector signed short)vec_splat((vector signed short)mat8,0);

  shortBuffer.elements[0] = m_imatrix[8];
  mat9 = shortBuffer.v;
  mat9 = (vector signed short)vec_splat((vector signed short)mat9,0);

#ifndef PPC970
  UInt32                      prefetchSize = GetPrefetchConstant( 16, 1, 256 );
  vec_dst( src, prefetchSize, 0 );
  vec_dst( dst, prefetchSize, 0 );
#endif

  i = xsize*2;

  //need to treat the first rows as a special case for accuracy and keep it from crashing
  //or just skip the first 2 rows ;)

  for ( h=2; h<image.ysize-1; h++) {
    // i+=2; //this gets rid of the echoes but kills the vertical edge-detects???
    i+=8;
    for (w=0; w<width-1; w++) {
#ifndef PPC970
      vec_dst( src, prefetchSize, 0 );
      vec_dst( dst, prefetchSize, 1 );
#endif

      // nine neighbouring vectors: previous/current/next row, each at
      // byte offsets -2, 0 and +2
      val1 = vec_ld(0,src+(i-xsize-2));//this might crash?
      val2 = vec_ld(0,src+(i-xsize));
      val3 = vec_ld(0,src+(i-xsize+2));
      val4 = vec_ld(0,src+(i-2));
      val5 = vec_ld(0,src+i);
      val6 = vec_ld(0,src+(i+2));
      val7 = vec_ld(0,src+(i+xsize-2));
      val8 = vec_ld(0,src+(i+xsize));
      val9 = vec_ld(0,src+(i+xsize+2));

      //extract the Y for processing
      y1 = (vector signed short)vec_mulo((vector unsigned char)one,(vector unsigned char)val1);
      y2 = (vector signed short)vec_mulo((vector unsigned char)one,(vector unsigned char)val2);
      y3 = (vector signed short)vec_mulo((vector unsigned char)one,(vector unsigned char)val3);
      y4 = (vector signed short)vec_mulo((vector unsigned char)one,(vector unsigned char)val4);
      y5 = (vector signed short)vec_mulo((vector unsigned char)one,(vector unsigned char)val5);
      y6 = (vector signed short)vec_mulo((vector unsigned char)one,(vector unsigned char)val6);
      y7 = (vector signed short)vec_mulo((vector unsigned char)one,(vector unsigned char)val7);
      y8 = (vector signed short)vec_mulo((vector unsigned char)one,(vector unsigned char)val8);
      y9 = (vector signed short)vec_mulo((vector unsigned char)one,(vector unsigned char)val9);

      // even bytes of the centre vector are carried through unfiltered
      uvres = (vector signed short)vec_mule((vector unsigned char)one,(vector unsigned char)val5);

      //mult the Y by the matrix coefficient
      res1 = vec_mulo(mat1,y1);
      res2 = vec_mulo(mat2,y2);
      res3 = vec_mulo(mat3,y3);
      res4 = vec_mulo(mat4,y4);
      res5 = vec_mulo(mat5,y5);
      res6 = vec_mulo(mat6,y6);
      res7 = vec_mulo(mat7,y7);
      res8 = vec_mulo(mat8,y8);
      res9 = vec_mulo(mat9,y9);

      res1a = vec_mule(mat1,y1);
      res2a = vec_mule(mat2,y2);
      res3a = vec_mule(mat3,y3);
      res4a = vec_mule(mat4,y4);
      res5a = vec_mule(mat5,y5);
      res6a = vec_mule(mat6,y6);
      res7a = vec_mule(mat7,y7);
      res8a = vec_mule(mat8,y8);
      res9a = vec_mule(mat9,y9);

      //sum the results - these are only 1 cycle ops so no dependency issues
      res1 = vec_adds(res1,res2); //1+2
      res3 = vec_adds(res3,res4);//3+4
      res5 = vec_adds(res5,res6);//5+6
      res7 = vec_adds(res7,res8);//7+8
      res1 = vec_adds(res1,res3);//(1+2)+(3+4)
      res7 = vec_adds(res7,res9);//7+8+9
      res1 = vec_adds(res1,res5);//(1+2)+(3+4)+(5+6)
      res1 = vec_adds(res1,res7);//(1+2)+(3+4)+(5+6)+(7+8+9)

      res1a = vec_adds(res1a,res2a); //1+2
      res3a = vec_adds(res3a,res4a);//3+4
      res5a = vec_adds(res5a,res6a);//5+6
      res7a = vec_adds(res7a,res8a);//7+8
      res1a = vec_adds(res1a,res3a);//(1+2)+(3+4)
      res7a = vec_adds(res7a,res9a);//7+8+9
      res1a = vec_adds(res1a,res5a);//(1+2)+(3+4)+(5+6)
      res1a = vec_adds(res1a,res7a);//(1+2)+(3+4)+(5+6)+(7+8+9)

      //do the bitshift on the results here??
      res1 = vec_sra(res1,bitshift);
      res1a = vec_sra(res1a,bitshift);

      //pack back to one short vector??
      yhi = vec_mergeh(res1a,res1);
      ylo = vec_mergel(res1a,res1);

      yres = vec_packs(yhi,ylo);

      //combine with the UV
      //vec_mergel + vec_mergeh Y and UV
      hiImage =  vec_mergeh(uvres,yres);
      loImage =  vec_mergel(uvres,yres);

      val1 = vec_packsu(hiImage,loImage);
      vec_st(val1,0,dst+i);
      i+=16;
    }
    i = h * xsize;
#ifndef PPC970
    vec_dss( 0 );
    vec_dss( 1 );
#endif
  }  /*end of working altivec function */

#endif
}
cclepper's avatar
cclepper committed
696

697 698 699 700 701 702
/////////////////////////////////////////////////////////
// rangeMess
//
/////////////////////////////////////////////////////////
void pix_convolve :: rangeMess(float range)
{
703
    m_irange = (int)(range*255.f);
704 705 706 707 708 709 710 711 712 713 714
    setPixModified();
}

/////////////////////////////////////////////////////////
// matrixMess
//
/////////////////////////////////////////////////////////
void pix_convolve :: matrixMess(int argc, t_atom *argv)
{
    if (argc != m_cols * m_rows)
    {
715 716
        error("matrix size not correct");
        return;
717
    }
cclepper's avatar
cclepper committed
718

ggeiger's avatar
ggeiger committed
719
    int i;
dheck's avatar
dheck committed
720
    for (i = 0; i < argc; i++) m_imatrix[i] = (int)(atom_getfloat(&argv[i])*255.);
721

cclepper's avatar
cclepper committed
722

723 724 725 726 727 728 729 730 731
    setPixModified();
}

/////////////////////////////////////////////////////////
// static member function
//
/////////////////////////////////////////////////////////
// Registers the message handlers of this class on classPtr:
//   "matrix" (any arguments) -> matrixMessCallback
//   "ft1"    (one float)     -> rangeMessCallback
//   "chroma" (one float)     -> chromaMessCallback
void pix_convolve :: obj_setupCallback(t_class *classPtr)
{
  t_method onMatrix = reinterpret_cast<t_method>(&pix_convolve::matrixMessCallback);
  t_method onRange  = reinterpret_cast<t_method>(&pix_convolve::rangeMessCallback);
  t_method onChroma = reinterpret_cast<t_method>(&pix_convolve::chromaMessCallback);

  class_addmethod(classPtr, onMatrix, gensym("matrix"), A_GIMME, A_NULL);
  class_addmethod(classPtr, onRange,  gensym("ft1"),    A_FLOAT, A_NULL);
  class_addmethod(classPtr, onChroma, gensym("chroma"), A_FLOAT, A_NULL);
}
void pix_convolve :: matrixMessCallback(void *data, t_symbol *, int argc, t_atom *argv)
{
    GetMyClass(data)->matrixMess(argc, argv);
}
743
void pix_convolve :: rangeMessCallback(void *data, t_float range)
744
{
zmoelnig's avatar
zmoelnig committed
745
    GetMyClass(data)->rangeMess(range);
746
}
cclepper's avatar
cclepper committed
747

748
void pix_convolve :: chromaMessCallback(void *data, t_float value)
cclepper's avatar
cclepper committed
749
{
zmoelnig's avatar
zmoelnig committed
750
    GetMyClass(data)->m_chroma=static_cast<int>(value);
cclepper's avatar
cclepper committed
751
}