00001 #ifndef __Float4Vector_H__ 00002 #define __Float4Vector_H__ 00003 00004 00005 00024 #include <stdio.h> 00025 #include <math.h> 00026 00027 #include "sseUtil.h" 00028 00029 00030 class Float4Vector 00031 { 00032 private: 00033 00034 00035 int dataSize4; 00036 int dataSize; 00037 00038 float* palign; 00039 00040 float* data; 00041 float* data1; 00042 00043 public: 00044 00045 Float4Vector(int size); 00046 Float4Vector(int size, float adata); 00047 Float4Vector(const Float4Vector& v); 00048 Float4Vector operator=(const Float4Vector& v); 00049 00050 ~Float4Vector(){ data=0; delete data1; delete[] palign; } 00051 00052 00053 inline int size(){ return dataSize; } 00054 inline int sizeData(){ return dataSize4; } 00055 00056 void load(Float4Vector* v); 00057 00058 inline float* getData(){return data;} 00059 00060 inline float* get0(){return data;} 00061 inline float* get(){return data1;} 00062 00063 // base 0 00064 inline void set0(int line, int index, float value){ data[(index << 2)+line]= value; } 00065 inline float get0(int line, int index){ return data[(index << 2)+line]; } 00066 00067 // base 1 00068 inline void set(int line, int index, float value){ data1[(index << 2)+line]= value; } 00069 inline float get(int line, int index){ return data1[(index << 2)+line]; } 00070 00071 void setAll(float value); 00072 00073 // computations 00074 float vT_v(int line); 00075 float norme2(int line); 00076 float sum(int line); 00077 00078 float minimum(int line); 00079 float maximum(int line); 00080 float mean(int line){ return sum(line)/dataSize; } 00081 float sigma(int line); 00082 00083 00084 inline SSE4 vT_v() 00085 { 00086 SSE4 val; 00087 xorps_r2r(xmm0, xmm0); // 0 00088 00089 for(int i=0; i<dataSize4; i+=4) 00090 { 00091 movaps_m2r(data[i],xmm1); // get 4 data 00092 mulps_r2r(xmm1, xmm1); // sqr 00093 addps_r2r(xmm1, xmm0); // sum of sqr 00094 } 00095 00096 movaps_r2m(xmm0, val.m); // get 4 sum of square in one 00097 00098 return val; 00099 } 00100 00101 inline SSE4 norme2() 00102 { 00103 SSE4 val; 00104 xorps_r2r(xmm0, xmm0); // 0 00105 00106 for(int i=0; i<dataSize4; i+=4) 00107 { 00108 movaps_m2r(data[i], xmm1); // get 4 data 00109 00110 mulps_r2r(xmm1, xmm1); // sqr 00111 addps_r2r(xmm1, xmm0); // sum of sqr 00112 } 00113 00114 sqrtps_r2r(xmm0,xmm0); // norme 2 00115 movaps_r2m(xmm0, val.m); // get 4 results 00116 00117 return val; 00118 } 00119 00120 inline SSE4 sum() 00121 { 00122 SSE4 val; 00123 xorps_r2r(xmm0, xmm0); // 0 00124 00125 00126 for(int i=0; i<dataSize4; i+=4) 00127 { 00128 movaps_m2r(data[i],xmm1); // get 4 data 00129 addps_r2r(xmm1, xmm0); // sum 00130 } 00131 00132 movaps_r2m(xmm0, val.m); // get 4 sum 00133 00134 return val; 00135 } 00136 00137 inline SSE4 minimum() 00138 { 00139 SSE4 val; 00140 movaps_m2r(data[0], xmm0); 00141 00142 for(int i=4; i<dataSize4; i+=4) 00143 { 00144 movaps_m2r(data[i],xmm1); // get 4 data 00145 minps_r2r(xmm1, xmm0); // min 00146 } 00147 00148 movaps_r2m(xmm0, val.m); // get result 00149 00150 return val; 00151 } 00152 00153 inline SSE4 maximum() 00154 { 00155 SSE4 val; 00156 movaps_m2r(data[0], xmm0); 00157 00158 for(int i=4; i<dataSize4; i+=4) 00159 { 00160 movaps_m2r(data[i],xmm1); // get 4 data 00161 maxps_r2r(xmm1, xmm0); // min 00162 } 00163 00164 movaps_r2m(xmm0, val.m); // get result 00165 00166 return val; 00167 } 00168 00169 inline SSE4 mean() 00170 { 00171 SSE4 val; 00172 xorps_r2r(xmm0, xmm0); // EX 00173 00174 val.f[0]= dataSize; 00175 val.f[1]= dataSize; 00176 val.f[2]= dataSize; 00177 val.f[3]= dataSize; 00178 movaps_m2r(val.m, xmm4); 00179 00180 for(int i=0; i<dataSize4; i+=4) 00181 { 00182 movaps_m2r(data[i],xmm2); // get 4 data x 00183 00184 addps_r2r(xmm2, xmm0); // sum 00185 } 00186 00187 divps_r2r(xmm4, xmm0); // EX 00188 00189 movaps_r2m(xmm0, val.m); // get 4 sum of square in one 00190 00191 return val; 00192 } 00193 00194 inline SSE4 sigma() 00195 { 00196 SSE4 val; 00197 xorps_r2r(xmm0, xmm0); // EX 00198 xorps_r2r(xmm1, xmm1); // EX2 00199 00200 val.f[0]= dataSize; 00201 val.f[1]= dataSize; 00202 val.f[2]= dataSize; 00203 val.f[3]= dataSize; 00204 movaps_m2r(val.m, xmm4); 00205 00206 for(int i=0; i<dataSize4; i+=4) 00207 { 00208 movaps_m2r(data[i],xmm2); // get 4 data x 00209 00210 movaps_r2r(xmm2, xmm3); 00211 mulps_r2r(xmm2, xmm3); // x2 00212 00213 addps_r2r(xmm2, xmm0); // sum 00214 addps_r2r(xmm3, xmm1); // sum of sqr 00215 } 00216 00217 divps_r2r(xmm4, xmm0); // EX 00218 divps_r2r(xmm4, xmm1); // EX2 00219 00220 mulps_r2r(xmm0, xmm0); // EX 00221 subps_r2r(xmm0, xmm0); 00222 sqrtps_r2r(xmm0,xmm0); // 4 sigma 00223 00224 movaps_r2m(xmm0, val.m); // get 4 sum of square in one 00225 00226 return val; 00227 } 00228 00229 00230 // math vectorial op 00231 Float4Vector sqrt() 00232 { 00233 Float4Vector v(dataSize); 00234 00235 #ifndef SSE_USED 00236 for(int i=0; i<dataSize; i++) 00237 for(int line=0; line<4; line++) 00238 v.set0(line, i, ::sqrt(data[(i << 2)+line]) ); 00239 #else 00240 float* vdata= v.getData(); 00241 00242 for(int i=0; i<dataSize4; i+=4) 00243 { 00244 movaps_m2r(data[i],xmm1); 00245 sqrtps_r2r(xmm1, xmm0); 00246 movaps_r2m(xmm0, vdata[i]); 00247 } 00248 #endif 00249 00250 return v; 00251 } 00252 00253 Float4Vector pow(float pow); 00254 00255 Float4Vector sqr() 00256 { 00257 Float4Vector v(dataSize); 00258 00259 #ifndef SSE_USED 00260 for(int i=0; i<dataSize; i++) 00261 for(int line=0; line<4; line++) 00262 v.set0(line, i, data[(i << 2)+line]*data[(i << 2)+line] ); 00263 #else 00264 float* vdata= v.getData(); 00265 00266 for(int i=0; i<dataSize4; i+=4) 00267 { 00268 movaps_m2r(data[i],xmm1); 00269 movaps_r2r(xmm1, xmm0); 00270 mulps_r2r(xmm1, xmm0); 00271 movaps_r2m(xmm0, vdata[i]); 00272 } 00273 #endif 00274 00275 return v; 00276 } 00277 00278 Float4Vector exp(); 00279 Float4Vector log(); 00280 Float4Vector log(float base); 00281 00282 Float4Vector sin(); 00283 Float4Vector cos(); 00284 Float4Vector tan(); 00285 00286 00287 // vector single operations 00288 void operator+=(Float4Vector& v) 00289 { 00290 #ifndef SSE_USED 00291 for(int i=0; i<dataSize; i++) 00292 for(int line=0; line<4; line++) 00293 data[(i << 2)+line] += v.get0(line,i); 00294 #else 00295 float* vdata= v.getData(); 00296 00297 for(int i=0; i<dataSize4; i+=4) 00298 { 00299 movaps_m2r(data[i], xmm0); 00300 movaps_m2r(vdata[i],xmm1); 00301 addps_r2r(xmm1, xmm0); 00302 movaps_r2m(xmm0, data[i]); 00303 } 00304 #endif 00305 } 00306 00307 void operator-=(Float4Vector& v) 00308 { 00309 #ifndef SSE_USED 00310 for(int i=0; i<dataSize; i++) 00311 for(int line=0; line<4; line++) 00312 data[(i << 2)+line] -= v.get0(line,i); 00313 #else 00314 float* vdata= v.getData(); 00315 00316 for(int i=0; i<dataSize4; i+=4) 00317 { 00318 movaps_m2r(data[i], xmm0); 00319 movaps_m2r(vdata[i],xmm1); 00320 subps_r2r(xmm1, xmm0); 00321 movaps_r2m(xmm0, data[i]); 00322 } 00323 #endif 00324 } 00325 00326 void operator*=(Float4Vector& v) 00327 { 00328 #ifndef SSE_USED 00329 for(int i=0; i<dataSize; i++) 00330 for(int line=0; line<4; line++) 00331 data[(i << 2)+line] *= v.get0(line,i); 00332 #else 00333 float* vdata= v.getData(); 00334 00335 for(int i=0; i<dataSize4; i+=4) 00336 { 00337 movaps_m2r(data[i], xmm0); 00338 movaps_m2r(vdata[i],xmm1); 00339 mulps_r2r(xmm1, xmm0); 00340 movaps_r2m(xmm0, data[i]); 00341 } 00342 #endif 00343 } 00344 00345 void operator/=(Float4Vector& v) 00346 { 00347 #ifndef SSE_USED 00348 for(int i=0; i<dataSize; i++) 00349 for(int line=0; line<4; line++) 00350 data[(i << 2)+line] /= v.get0(line,i); 00351 #else 00352 float* vdata= v.getData(); 00353 00354 for(int i=0; i<dataSize4; i+=4) 00355 { 00356 movaps_m2r(data[i], xmm0); 00357 movaps_m2r(vdata[i],xmm1); 00358 divps_r2r(xmm1, xmm0); 00359 movaps_r2m(xmm0, data[i]); 00360 } 00361 #endif 00362 } 00363 00364 // friends 00365 00366 friend Float4Vector operator+(Float4Vector& v, float a); // v<op>a 00367 friend Float4Vector operator+(float a, Float4Vector& v); // a<op>v 00368 friend Float4Vector operator-(Float4Vector& v, float a); // v<op>a 00369 friend Float4Vector operator-(float a, Float4Vector& v); // a<op>v 00370 friend Float4Vector operator*(Float4Vector& v, float a); // v<op>a 00371 friend Float4Vector operator*(float a, Float4Vector& v); // a<op>v 00372 friend Float4Vector operator/(Float4Vector& v, float a); // v<op>a 00373 00374 friend Float4Vector operator+(Float4Vector& v1, Float4Vector& v2); // v1<op>v2 00375 friend Float4Vector operator-(Float4Vector& v1, Float4Vector& v2); // v1<op>v2 00376 00377 void output(); 00378 void output(FILE* file); 00379 }; 00380 00381 /* 00382 static inline void Float4Vector_c_norme2_4(float* dataIn, int dataSize4, float** val) 00383 { 00384 xorps_r2r(xmm0, xmm0); // 0 00385 00386 for(int i=0; i<dataSize4; i+=4) 00387 { 00388 movaps_m2r(dataIn[i], xmm1); // get 4 data 00389 00390 // mulps_r2r(xmm1, xmm1); // sqr 00391 addps_r2r(xmm1, xmm0); // sum of sqr 00392 } 00393 00394 // sqrtps_r2r(xmm0,xmm0); // norme 2 00395 // movaps_r2m(xmm0, *val[0]); // get 4 sum of square in one 00396 } 00397 */ 00398 #endif 00399 00400