Main Page   Class Hierarchy   Alphabetical List   Compound List   File List   Compound Members   Related Pages  

F4ThomasLS.hpp

00001 #ifndef __F4ThomasLS_H__
00002 #define __F4ThomasLS_H__
00003 
00004 
00005 
00039 #include "AbsF4TriDiagonalLS.hpp"
00040 
00041 #include <math.h>
00042 #include "sseUtil.h"
00043 
00044 
00045 class F4ThomasLS : public AbsF4TriDiagonalLS
00046 {
00047 private:
00048                                                 
00049 Float4Vector*                   uOut;   
00050 Float4Vector*                   dOut;
00051 Float4Vector*                   lOut;   
00052 
00053 Float4Vector*                   y;
00054 
00055 protected:
00056 
00057 // for parallel executions
00059 inline void computeLRdecomposition()
00060         {
00061 /*      
00062         for(int line=0; line<4; line++)
00063                 {
00064                 dOut->set0(line, 0, A->getD0(line, 0) );
00065 
00066                 for(int j=1; j<N; j++)
00067                         {
00068                         lOut->set(line, j, A->getL(line, j)/dOut->get(line, j) );
00069         
00070                         dOut->set0(line, j, A->getD0(line, j) - lOut->get(line, j)*A->getU(line,j) );
00071                         }
00072                 }
00073 */
00074         float *dout, *ad0, *al, *au, *lout;
00075 //      int j4=4;
00076 
00077 
00078         dout= dOut->get0();
00079         ad0= A->getD0();
00080         al= A->getL();
00081         au= A->getU();
00082         lout= lOut->get(); 
00083 
00084 
00085 /*      prefetcht0(al[j4]);                             // use one time
00086         prefetcht0(ad0[j4]);                    // use one time
00087         prefetcht0(au[j4]);                             // use one time
00088 */
00089         movaps_m2r(ad0[0], xmm0);               // dout0
00090         movaps_r2m(xmm0, dout[0]);      
00091 
00092         for(int j=4; j<N4; j+=4) // , j4+=4)
00093                 {
00094 /*              prefetcht0(al[j4]);                             // use one time
00095                 prefetcht0(ad0[j4]);                            // use one time
00096                 prefetcht0(au[j4]);                             // use one time
00097 */
00098                 movaps_m2r(al[j], xmm1);                // al
00099 //              divps_r2r(xmm0, xmm1);                  // al/dout
00100                 rcpps_r2r(xmm0, xmm3);                  // dout -> 1/dout  division optimisation
00101                 mulps_r2r(xmm3, xmm1);                  // al*(1/dout)
00102                 movaps_r2m(xmm1, lout[j]);              // lout
00103 
00104                 movaps_m2r(ad0[j], xmm0);               // ad0
00105                 movaps_m2r(au[j], xmm2);                // au
00106                 mulps_r2r(xmm2, xmm1);                  // lout*au
00107                 subps_r2r(xmm1, xmm0);                  // ad0 - lout*au
00108                 movaps_r2m(xmm0, dout[j]);
00109                 }
00110         }
00111 
00113 inline void forwardSubstitution()
00114         {
00115 /*      
00116         for(int line=0; line<4; line++)
00117                 {
00118                 y->set0(line, 0, b->get0(line, 0) );
00119 
00120                 for(int j=1; j<N; j++)
00121                         y->set0(line, j, b->get0(line, j) - lOut->get(line, j)*y->get(line, j) );
00122                 }
00123 */
00124         float *yv, *bv, *lout;
00125 //      int j4=4;
00126 
00127         yv= y->get0();
00128         bv= b->get0();
00129         lout= lOut->get();
00130 
00131 /*      prefetcht0(lout[j4]);                   // use one time
00132         prefetcht0(bv[j4]);                             // use one time
00133 */
00134         movaps_m2r(bv[0], xmm0);                // b0
00135         movaps_r2m(xmm0, yv[0]);                // y0
00136 
00137         for(int j=4; j<N4; j+=4)  // , j4+=4)
00138                 {
00139 /*              prefetcht0(lout[j4]);                   // use one time
00140                 prefetcht0(bv[j4]);                     // use one time
00141 */
00142                 movaps_m2r(bv[j], xmm1);        // bj
00143                 movaps_m2r(lout[j], xmm2);      // lout
00144                 mulps_r2r(xmm2, xmm0);          // lout*y
00145                 subps_r2r(xmm0, xmm1);          // bj - lout*y
00146                 movaps_r2r(xmm1, xmm0);
00147                 movaps_r2m(xmm0, yv[j]);        // yj
00148                 }
00149         }
00150 
00152 inline void backwardSubstitution()
00153         {
00154 /*      for(int line=0; line<4; line++)
00155                 {
00156                 x->set(line, N, y->get(line, N)/dOut->get(line, N) );
00157 
00158                 for(int j=N-1; j>0; j--)
00159                         x->set(line, j, (y->get(line,j) - A->getU(line,j)*x->get0(line,j))/dOut->get(line,j) );
00160                 }
00161 */
00162         float *xv, *yv, *dout, *au;
00163 //      int j4=N4-4;
00164 
00165         xv= x->get();
00166         yv= y->get();
00167         dout= dOut->get();
00168         au= A->getU();
00169 
00170         prefetcht1(au[N4-4]);                   // use one time
00171 /*      prefetcht0(au[j4]);                     // use one time
00172         prefetcht0(dout[j4]);                   // use one time
00173         prefetcht0(yv[j4]);                             // use one time
00174 */
00175         movaps_m2r(yv[N4], xmm0);               // y
00176         movaps_m2r(dout[N4], xmm1);             // dout
00177 //      divps_r2r(xmm1, xmm0);                  // y/dout
00178         rcpps_r2r(xmm1, xmm4);                  // dout -> 1/dout  division optimisation
00179         mulps_r2r(xmm4, xmm0);                  // y*(1/dout)
00180 
00181         movaps_r2m(xmm0, xv[N4]);               // x
00182 
00183         for(int j=N4-4; j>0; j -=4) // , j4-=4)
00184                 {
00185                 prefetcht1(au[j-4]);                    // use one time
00186 /*              prefetcht0(au[j4]);                     // use one time
00187                 prefetcht0(dout[j4]);                   // use one time
00188                 prefetcht0(yv[j4]);                     // use one time
00189 */
00190                 movaps_m2r(yv[j], xmm1);        // y
00191                 movaps_m2r(au[j], xmm2);        // au
00192                 movaps_m2r(dout[j], xmm3);      // dout
00193 
00194                 mulps_r2r(xmm0, xmm2);          // x*au
00195                 subps_r2r(xmm2, xmm1);          // y - x*au
00196 //              divps_r2r(xmm3, xmm1);          // (y - x*au)/dout
00197                 rcpps_r2r(xmm3, xmm4);                  // dout -> 1/dout  division optimisation
00198                 mulps_r2r(xmm4, xmm1);          // (y - x*au)*(1/dout)
00199         
00200                 movaps_r2r(xmm1, xmm0);
00201                 movaps_r2m(xmm0, xv[j]);        // x
00202                 }
00203         }               
00204 
00205 // combined LR/forward  reducing data flow
00206 // for parallel executions
00208 inline void computeLR_forward()
00209         {
00210 /*      
00211         for(int line=0; line<4; line++)
00212                 {
00213                 dOut->set0(line, 0, A->getD0(line, 0) );
00214                 y->set0(line, 0, b->get0(line, 0) );
00215 
00216                 for(int j=1; j<N; j++)
00217                         {
00218                         lOut->set(line, j, A->getL(line, j)/dOut->get(line, j) );
00219                         dOut->set0(line, j, A->getD0(line, j) - lOut->get(line, j)*A->getU(line,j) );
00220 
00221                         y->set0(line, j, b->get0(line, j) - lOut->get(line, j)*y->get(line, j) );
00222                         }
00223                 }
00224 
00225 */
00226         float *dout, *ad0, *al, *au,  *yv, *bv;  // lout is not needed
00227 //      int j4=4;
00228 
00229         dout= dOut->get0();
00230         ad0= A->getD0();
00231         al= A->getL();
00232         au= A->getU();
00233 //      lout= lOut->get();                              // not needed 
00234         yv= y->get0();
00235         bv= b->get0();
00236 
00237         prefetcht1(bv[4]);                              // use one time
00238         prefetcht1(ad0[4]);                             // use one time
00239 /*      prefetcht0(al[j4]);                             // use one time
00240         prefetcht0(ad0[j4]);                    // use one time
00241         prefetcht0(au[j4]);                             // use one time
00242         prefetcht0(bv[j4]);                             // use one time
00243 */
00244         movaps_m2r(ad0[0], xmm0);               // dout0
00245         movaps_r2m(xmm0, dout[0]);      
00246 
00247         movaps_m2r(bv[0], xmm4);                // b0
00248         movaps_r2m(xmm4, yv[0]);                // y0
00249 
00250         for(int j=4; j<N4; j+=4) // , j4+=4)
00251                 {
00252                 prefetcht1(bv[j+4]);                            // use one time
00253                 prefetcht1(ad0[j+4]);                           // use one time
00254                 // load next data in Lx
00255 /*              prefetcht0(al[j4]);                     // use one time
00256                 prefetcht0(ad0[j4]);                    // use one time
00257                 prefetcht0(au[j4]);                     // use one time
00258                 prefetcht0(bv[j4]);                     // use one time
00259 */
00260                 movaps_m2r(bv[j], xmm5);        // bj
00261 
00262                 movaps_m2r(al[j], xmm1);                // al
00263 //              divps_r2r(xmm0, xmm1);                  // al/dout
00264                 rcpps_r2r(xmm0, xmm3);                  // dout -> 1/dout  division optimisation
00265                 mulps_r2r(xmm3, xmm1);                  // al*(1/dout)
00266 //              movaps_r2m(xmm1, lout[j]);              // lout                         NOT NEEDED anymore in results
00267 
00268                 movaps_m2r(ad0[j], xmm0);               // ad0
00269                 movaps_m2r(au[j], xmm2);                // au
00270                 mulps_r2r(xmm1, xmm2);                  // lout*au
00271                 subps_r2r(xmm2, xmm0);                  // ad0 - lout*au
00272                 movaps_r2m(xmm0, dout[j]);
00273 
00274                 mulps_r2r(xmm1, xmm4);          // lout*y
00275                 subps_r2r(xmm4, xmm5);          // bj - lout*y
00276                 movaps_r2r(xmm5, xmm4);
00277                 movaps_r2m(xmm4, yv[j]);        // yj
00278                 }
00279         }
00280 
00281 public:
00282         
00283 F4ThomasLS(int asize):AbsF4TriDiagonalLS(asize)
00284         {
00285         uOut= new Float4Vector(N-1);
00286         dOut= new Float4Vector(N);
00287         lOut= new Float4Vector(N-1);
00288 
00289         y= new Float4Vector(N);
00290         }
00291 
00292 F4ThomasLS(int asize, int lines):AbsF4TriDiagonalLS(asize, lines)
00293         {
00294         uOut= new Float4Vector(N-1);
00295         dOut= new Float4Vector(N);
00296         lOut= new Float4Vector(N-1);
00297 
00298         y= new Float4Vector(N);
00299         }
00300 
00301 virtual ~F4ThomasLS()
00302         {
00303         delete uOut; 
00304         delete dOut; 
00305         delete lOut; 
00306         delete y;
00307         }
00308 
00309 virtual void loadA(F4TriDiagonalMatrix* m);
00310 virtual void loadMatrix(Float4Vector* u1, Float4Vector* d1, Float4Vector* l1);
00311 virtual void loadX(Float4Vector* data);
00312 virtual void loadB(Float4Vector* data);
00313 
00314 
00315 inline void solve()
00316         {
00317 //      computeLRdecomposition();
00318 //      forwardSubstitution();
00319 
00320         computeLR_forward();                    // combined optimisation
00321         backwardSubstitution();
00322         }
00323         
00324 virtual void output()
00325         {
00326         printf("F4Thomas ");
00327         AbsF4TriDiagonalLS::output();
00328         }
00329 
00330 virtual void output(FILE* file)
00331         {
00332         fprintf(file, "F4Thomas ");
00333         AbsF4TriDiagonalLS::output(file);
00334         }
00335 
00336 };
00337 
00338 #endif
00339 
00340          
SourceForge.net Logo
Restoreinpaint sourceforge project `C++/Java Image Processing, Restoration, Inpainting Project'.

Bernard De Cuyper: Open Project Leader: Concept, design and development.
Bernard De Cuyper & Eddy Fraiha 2002, 2003. Bernard De Cuyper 2004. Open and free, for friendly usage only.
Modifications on Belgium ground of this piece of artistic work, by governement institutions or companies, must be notified to Bernard De Cuyper.
bern_bdc@hotmail.com