Commit 503e2fed authored by cclepper's avatar cclepper
Browse files

more Altivec tweaks


git-svn-id: https://pd-gem.svn.sourceforge.net/svnroot/pd-gem/trunk/Gem@449 49b2a0ca-c541-4d49-a7ef-d785e4371512
parent 01c5e9ea
......@@ -90,7 +90,7 @@ void pix_convolve :: calculateRGBA3x3(imageStruct &image,imageStruct &tempImg)
int* src = (int*) tempImg.data;
int* dest = (int*)image.data;
// MMXSTART;
//unroll this to do R G B in one pass?? (too many registers?)
i = xsize;
int* val1 = 0;
......@@ -391,8 +391,9 @@ i=xsize+1;
register short* val8 = src+i+xsize; //val8 = src[i+xsize];
register short* val9 = src+i+xsize+1; //val9 = src[i+xsize+1];*/
//int res;
for (i=xsize+1;i<size;i++) {
// for (i=xsize+1;i<size;i++) {
for (k=1;k<ysize;k++) {
for (j=1;j<xsize;j++) {
val1 = val2;
val2 = val3;
val3 = src+i-xsize+1;
......@@ -403,13 +404,13 @@ i=xsize+1;
val8 = val9;
val9 = src+i+xsize+1;
if (i%xsize == 0 || i%xsize == xsize-1) continue;
/* if (i%xsize == 0 || i%xsize == xsize-1) continue;
#ifndef MACOSX
for (j=0;j<3;j++)
#else
for (j=1;j<3;j+=2)
#endif
{
{ */
res1 = mat1*(int)((unsigned char*)val1)[j];
res2 = mat2*(int)((unsigned char*)val2)[j];
......@@ -428,30 +429,31 @@ i=xsize+1;
res1>>=16;
// ((unsigned char*)dest)[i*2] = 128;
// ((unsigned char*)dest)[i*2+2] = 128;
((unsigned char*)dest)[i*2+j] = CLAMP(res1);
}
((unsigned char*)dest)[i*2+1] = CLAMP(res1);
// }
((unsigned char*)dest)[i*2] = 128;
((unsigned char*)dest)[i*2+2] = 128;
// ((unsigned char*)dest)[i*2+2] = 128;
i++;
}
i=k*tempImg.xsize;
}
}
#endif
}
//too many temps for all the registers - reuse some
void pix_convolve :: calculate3x3YUVAltivec(imageStruct &image,imageStruct &tempImg)
{
#ifdef ALTIVEC
int h,w,width,i;
int xsize = (tempImg.xsize)*2;
// int xsize = (tempImg.xsize-1);
width = (tempImg.xsize)/8;
//format is U Y V Y
// post("pix_convolve : m_irange %d",m_irange);
union
{
//unsigned int i;
short elements[8];
//vector signed char v;
vector short v;
}shortBuffer;
......@@ -464,17 +466,15 @@ void pix_convolve :: calculate3x3YUVAltivec(imageStruct &image,imageStruct &temp
vector unsigned char one;
vector signed short mat1,mat2,mat3,mat4,mat5,mat6,mat7,mat8,mat9;
vector unsigned char val1,val2,val3,val4,val5,val6,val7,val8,val9;
vector signed int res1,res2,res3,res4,res5,res6,res7,res8,res9;
register vector signed int res1,res2,res3,res4,res5,res6,res7,res8,res9;
vector signed int yhi,ylo;
vector signed int res1a,res2a,res3a,res4a,res5a,res6a,res7a,res8a,res9a;
register vector signed int res1a,res2a,res3a,res4a,res5a,res6a,res7a,res8a,res9a;
vector unsigned int bitshift;
vector signed short y1,y2,y3,y4,y5,y6,y7,y8,y9,yres,uvres,hiImage,loImage;
vector signed short range,uvnone;
// vector unsigned char *dst = (vector unsigned char*) image.data;
unsigned char *dst = (unsigned char*) image.data;
register vector signed short y1,y2,y3,y4,y5,y6,y7,y8,y9,yres,uvres,hiImage,loImage;
vector signed short range,uvnone,uv128;
unsigned char *dst = (unsigned char*) image.data;
unsigned char *src = tempImg.data;
// short* src = (short*) tempImg.data;
// unsigned char *temp;
one = vec_splat_u8( 1 );
......@@ -528,59 +528,38 @@ void pix_convolve :: calculate3x3YUVAltivec(imageStruct &image,imageStruct &temp
mat9 = shortBuffer.v;
mat9 = (vector signed short)vec_splat((vector signed short)mat9,0);
shortBuffer.elements[0] = 128;
uv128 = shortBuffer.v;
uv128 = (vector signed short)vec_splat((vector signed short)uv128,0);
UInt32 prefetchSize = GetPrefetchConstant( 16, 1, 256 );
vec_dst( src, prefetchSize, 0 );
vec_dst( dst, prefetchSize, 0 );
i = 0;
h =0;
w = 0;
// dst = i;
i = xsize+2;
// i = xsize;
//load our initial values
/* val1 = 0;
val2 = vec_ld(0,src+i-xsize);
val3 = vec_ld(0,src+i-xsize+2);
val4 = vec_ld(0,src+i-2);
val5 = vec_ld(0,src+i);
val6 = vec_ld(0,src+i+2);
val7 = vec_ld(0,src+i+xsize-2);
val8 = vec_ld(0,src+i+xsize);
val9 = vec_ld(0,src+i+xsize+2); */
// post("pix_convolve: h %d w %d total pixels %d ",h,w,i);
for ( h=1; h<image.ysize-1; h++){
for (w=1; w<width; w++)
i = xsize*2;
//need to treat the first rows as a special case for accuracy and keep it from crashing
//or just skip the first 2 rows ;)
for ( h=2; h<image.ysize-1; h++){
i+=2; //this gets rid of the echoes but kills the vertical edge-detects???
for (w=0; w<width; w++)
{
vec_dst( src, prefetchSize, 0 );
vec_dst( dst, prefetchSize, 1 );
/*
//swap around the pixels for this pass
val7 = val8;
val8 = val9;
//temp =src+i+xsize+2;
val9 = vec_ld(0,src+i+xsize+2); //this will come from main mem
val1 = val2;
val2 = val3;
//src+=i-xsize+3;
// temp = src+i-xsize+2;
val3 = vec_ld(0,src+i-xsize+2); //should be in cache from previous pass
val4 = val5;
val5 = val6;
// temp = src+i+2;
val6 = vec_ld(0,src+i+2); */
val1 = vec_ld(0,src+i-xsize-2);
val2 = vec_ld(0,src+i-xsize);
val3 = vec_ld(0,src+i-xsize+2);
val4 = vec_ld(0,src+i-2);
val5 = vec_ld(0,src+i);
val6 = vec_ld(0,src+i+2);
val7 = vec_ld(0,src+i+xsize-2);
val8 = vec_ld(0,src+i+xsize);
val9 = vec_ld(0,src+i+xsize+2);
val1 = vec_ld(0,src+(i-xsize-2));//this might crash?
val2 = vec_ld(0,src+(i-xsize));
val3 = vec_ld(0,src+(i-xsize+2));
val4 = vec_ld(0,src+(i-2));
val5 = vec_ld(0,src+i);
val6 = vec_ld(0,src+(i+2));
val7 = vec_ld(0,src+(i+xsize-2));
val8 = vec_ld(0,src+(i+xsize));
val9 = vec_ld(0,src+(i+xsize+2));
//extract the Y for processing
y1 = (vector signed short)vec_mulo((vector unsigned char)one,(vector unsigned char)val1);
......@@ -616,7 +595,7 @@ void pix_convolve :: calculate3x3YUVAltivec(imageStruct &image,imageStruct &temp
res8a = vec_mule(mat8,y8);
res9a = vec_mule(mat9,y9);
//sum the results
//sum the results - these are only 1 cycle ops so no dependency issues
res1 = vec_adds(res1,res2); //1+2
res3 = vec_adds(res3,res4);//3+4
res5 = vec_adds(res5,res6);//5+6
......@@ -636,15 +615,6 @@ void pix_convolve :: calculate3x3YUVAltivec(imageStruct &image,imageStruct &temp
res1a = vec_adds(res1a,res7a);//(1+2)+(3+4)+(5+6)+(7+8+9)
/* //pack back to one short vector??
yhi = vec_mergeh(res1a,res1);
ylo = vec_mergel(res1a,res1);
yres = vec_packs(yhi,ylo);
//back to ints
res1 = vec_mulo(yres,range);
res1a = vec_mule(yres,range); */
//do the bitshift on the results here??
res1 = vec_sra(res1,bitshift);
res1a = vec_sra(res1a,bitshift);
......@@ -659,25 +629,19 @@ void pix_convolve :: calculate3x3YUVAltivec(imageStruct &image,imageStruct &temp
//vec_mergel + vec_mergeh Y and UV
hiImage = vec_mergeh(uvres,yres);
loImage = vec_mergel(uvres,yres);
// hiImage = vec_mergeh(uvnone,yres);
// loImage = vec_mergel(uvnone,yres);
// dst[0] = vec_packsu(hiImage,loImage);
val1 = vec_packsu(hiImage,loImage);
vec_st(val1,0,dst+i);
i+=16;
// dst+=16;
}
i = h * xsize;
vec_dss( 0 );
vec_dss( 1 );
// i=(h+1)*(xsize+2);
// post("pix_convolve: h %d w %d total pixels %d ",h,w,i);
//dst+=16;
//i+=16;
// dst++;
} /*end of working altivec function */
#endif
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment