44 #ifndef KLU2_TSOLVE_HPP 
   45 #define KLU2_TSOLVE_HPP 
   47 #include "klu2_internal.h" 
   50 template <
typename Entry, 
typename Int>
 
   54     KLU_symbolic<Entry, Int> *Symbolic,
 
   55     KLU_numeric<Entry, Int> *Numeric,
 
   68     KLU_common<Entry, Int> *Common
 
   71     Entry x [4], offik, s ;
 
   73     Entry *Offx, *X, *Bz, *Udiag ;
 
   74     Int *Q, *R, *Pnum, *Offp, *Offi, *Lip, *Uip, *Llen, *Ulen ;
 
   76     Int k1, k2, nk, k, block, pend, n, p, nblocks, chunk, nr, i ;
 
   86     if (Numeric == NULL || Symbolic == NULL || d < Symbolic->n || nrhs < 0 ||
 
   89         Common->status = KLU_INVALID ;
 
   92     Common->status = KLU_OK ;
 
  100     nblocks = Symbolic->nblocks ;
 
  108     ASSERT (nblocks == Numeric->nblocks) ;
 
  109     Pnum = Numeric->Pnum ;
 
  110     Offp = Numeric->Offp ;
 
  111     Offi = Numeric->Offi ;
 
  112     Offx = (Entry *) Numeric->Offx ;
 
  115     Llen = Numeric->Llen ;
 
  117     Ulen = Numeric->Ulen ;
 
  118     LUbx = (Unit **) Numeric->LUbx ;
 
  119     Udiag = (Entry *) Numeric->Udiag ;
 
  122     X = (Entry *) Numeric->Xwork ;
 
  123     ASSERT (KLU_valid (n, Offp, Offi, Offx)) ;
 
  129     for (chunk = 0 ; chunk < nrhs ; chunk += 4)
 
  136         nr = MIN (nrhs - chunk, 4) ;
 
  147                 for (k = 0 ; k < n ; k++)
 
  155                 for (k = 0 ; k < n ; k++)
 
  159                     X [2*k + 1] = Bz [i + d  ] ;
 
  165                 for (k = 0 ; k < n ; k++)
 
  169                     X [3*k + 1] = Bz [i + d  ] ;
 
  170                     X [3*k + 2] = Bz [i + d*2] ;
 
  176                 for (k = 0 ; k < n ; k++)
 
  180                     X [4*k + 1] = Bz [i + d  ] ;
 
  181                     X [4*k + 2] = Bz [i + d*2] ;
 
  182                     X [4*k + 3] = Bz [i + d*3] ;
 
  192         for (block = 0 ; block < nblocks ; block++)
 
  202             PRINTF ((
"tsolve %d, k1 %d k2-1 %d nk %d\n", block, k1,k2-1,nk)) ;
 
  215                         for (k = k1 ; k < k2 ; k++)
 
  218                             for (p = Offp [k] ; p < pend ; p++)
 
  223                                     MULT_SUB_CONJ (X [k], X [Offi [p]],
 
  229                                     MULT_SUB (X [k], Offx [p], X [Offi [p]]) ;
 
  237                         for (k = k1 ; k < k2 ; k++)
 
  241                             x [1] = X [2*k + 1] ;
 
  242                             for (p = Offp [k] ; p < pend ; p++)
 
  248                                     CONJ (offik, Offx [p]) ;
 
  255                                 MULT_SUB (x [0], offik, X [2*i]) ;
 
  256                                 MULT_SUB (x [1], offik, X [2*i + 1]) ;
 
  259                             X [2*k + 1] = x [1] ;
 
  265                         for (k = k1 ; k < k2 ; k++)
 
  269                             x [1] = X [3*k + 1] ;
 
  270                             x [2] = X [3*k + 2] ;
 
  271                             for (p = Offp [k] ; p < pend ; p++)
 
  277                                     CONJ (offik, Offx [p]) ;
 
  284                                 MULT_SUB (x [0], offik, X [3*i]) ;
 
  285                                 MULT_SUB (x [1], offik, X [3*i + 1]) ;
 
  286                                 MULT_SUB (x [2], offik, X [3*i + 2]) ;
 
  289                             X [3*k + 1] = x [1] ;
 
  290                             X [3*k + 2] = x [2] ;
 
  296                         for (k = k1 ; k < k2 ; k++)
 
  300                             x [1] = X [4*k + 1] ;
 
  301                             x [2] = X [4*k + 2] ;
 
  302                             x [3] = X [4*k + 3] ;
 
  303                             for (p = Offp [k] ; p < pend ; p++)
 
  309                                     CONJ(offik, Offx [p]) ;
 
  316                                 MULT_SUB (x [0], offik, X [4*i]) ;
 
  317                                 MULT_SUB (x [1], offik, X [4*i + 1]) ;
 
  318                                 MULT_SUB (x [2], offik, X [4*i + 2]) ;
 
  319                                 MULT_SUB (x [3], offik, X [4*i + 3]) ;
 
  322                             X [4*k + 1] = x [1] ;
 
  323                             X [4*k + 2] = x [2] ;
 
  324                             X [4*k + 3] = x [3] ;
 
  339                     CONJ (s, Udiag [k1]) ;
 
  350                         DIV (X [k1], X [k1], s) ;
 
  354                         DIV (X [2*k1], X [2*k1], s) ;
 
  355                         DIV (X [2*k1 + 1], X [2*k1 + 1], s) ;
 
  359                         DIV (X [3*k1], X [3*k1], s) ;
 
  360                         DIV (X [3*k1 + 1], X [3*k1 + 1], s) ;
 
  361                         DIV (X [3*k1 + 2], X [3*k1 + 2], s) ;
 
  365                         DIV (X [4*k1], X [4*k1], s) ;
 
  366                         DIV (X [4*k1 + 1], X [4*k1 + 1], s) ;
 
  367                         DIV (X [4*k1 + 2], X [4*k1 + 2], s) ;
 
  368                         DIV (X [4*k1 + 3], X [4*k1 + 3], s) ;
 
  375                 KLU_utsolve (nk, Uip + k1, Ulen + k1, LUbx [block],
 
  381                 KLU_ltsolve (nk, Lip + k1, Llen + k1, LUbx [block], nr,
 
  402                     for (k = 0 ; k < n ; k++)
 
  404                         Bz  [Pnum [k]] = X [k] ;
 
  410                     for (k = 0 ; k < n ; k++)
 
  414                         Bz  [i + d  ] = X [2*k + 1] ;
 
  420                     for (k = 0 ; k < n ; k++)
 
  424                         Bz  [i + d  ] = X [3*k + 1] ;
 
  425                         Bz  [i + d*2] = X [3*k + 2] ;
 
  431                     for (k = 0 ; k < n ; k++)
 
  435                         Bz  [i + d  ] = X [4*k + 1] ;
 
  436                         Bz  [i + d*2] = X [4*k + 2] ;
 
  437                         Bz  [i + d*3] = X [4*k + 3] ;
 
  451                     for (k = 0 ; k < n ; k++)
 
  453                         SCALE_DIV_ASSIGN (Bz [Pnum [k]], X [k], Rs [k]) ;
 
  459                     for (k = 0 ; k < n ; k++)
 
  463                         SCALE_DIV_ASSIGN (Bz [i], X [2*k], rs) ;
 
  464                         SCALE_DIV_ASSIGN (Bz [i + d], X [2*k + 1], rs) ;
 
  470                     for (k = 0 ; k < n ; k++)
 
  474                         SCALE_DIV_ASSIGN (Bz [i], X [3*k], rs) ;
 
  475                         SCALE_DIV_ASSIGN (Bz [i + d], X [3*k + 1], rs) ;
 
  476                         SCALE_DIV_ASSIGN (Bz [i + d*2], X [3*k + 2], rs) ;
 
  482                     for (k = 0 ; k < n ; k++)
 
  486                         SCALE_DIV_ASSIGN (Bz [i], X [4*k], rs) ;
 
  487                         SCALE_DIV_ASSIGN (Bz [i + d], X [4*k + 1], rs) ;
 
  488                         SCALE_DIV_ASSIGN (Bz [i + d*2], X [4*k + 2], rs) ;
 
  489                         SCALE_DIV_ASSIGN (Bz [i + d*3], X [4*k + 3], rs) ;
 
  506 template <
typename Entry, 
typename Int>
 
  510     KLU_symbolic<Entry, Int> *Symbolic,
 
  511     KLU_numeric<Entry, Int> *Numeric,
 
  526     KLU_common<Entry, Int> *Common
 
  529     Entry x [4], offik, s ;
 
  531     Entry *Offx, *X, *Bz, *Udiag ;
 
  532     Int *Q, *R, *Pnum, *Offp, *Offi, *Lip, *Uip, *Llen, *Ulen ;
 
  534     Int k1, k2, nk, k, block, pend, n, p, nblocks, chunk, nr, i ;
 
  544     if (Numeric == NULL || Symbolic == NULL || d < Symbolic->n || nrhs < 0 ||
 
  545         B == NULL || Xout == NULL)
 
  547         Common->status = KLU_INVALID ;
 
  550     Common->status = KLU_OK ;
 
  558     nblocks = Symbolic->nblocks ;
 
  566     ASSERT (nblocks == Numeric->nblocks) ;
 
  567     Pnum = Numeric->Pnum ;
 
  568     Offp = Numeric->Offp ;
 
  569     Offi = Numeric->Offi ;
 
  570     Offx = (Entry *) Numeric->Offx ;
 
  573     Llen = Numeric->Llen ;
 
  575     Ulen = Numeric->Ulen ;
 
  576     LUbx = (Unit **) Numeric->LUbx ;
 
  577     Udiag = (Entry *) Numeric->Udiag ;
 
  580     X = (Entry *) Numeric->Xwork ;
 
  581     ASSERT (KLU_valid (n, Offp, Offi, Offx)) ;
 
  587     for (chunk = 0 ; chunk < nrhs ; chunk += 4)
 
  594         nr = MIN (nrhs - chunk, 4) ;
 
  605                 for (k = 0 ; k < n ; k++)
 
  613                 for (k = 0 ; k < n ; k++)
 
  617                     X [2*k + 1] = Bz [i + d  ] ;
 
  623                 for (k = 0 ; k < n ; k++)
 
  627                     X [3*k + 1] = Bz [i + d  ] ;
 
  628                     X [3*k + 2] = Bz [i + d*2] ;
 
  634                 for (k = 0 ; k < n ; k++)
 
  638                     X [4*k + 1] = Bz [i + d  ] ;
 
  639                     X [4*k + 2] = Bz [i + d*2] ;
 
  640                     X [4*k + 3] = Bz [i + d*3] ;
 
  650         for (block = 0 ; block < nblocks ; block++)
 
  660             PRINTF ((
"tsolve %d, k1 %d k2-1 %d nk %d\n", block, k1,k2-1,nk)) ;
 
  673                         for (k = k1 ; k < k2 ; k++)
 
  676                             for (p = Offp [k] ; p < pend ; p++)
 
  681                                     MULT_SUB_CONJ (X [k], X [Offi [p]],
 
  687                                     MULT_SUB (X [k], Offx [p], X [Offi [p]]) ;
 
  695                         for (k = k1 ; k < k2 ; k++)
 
  699                             x [1] = X [2*k + 1] ;
 
  700                             for (p = Offp [k] ; p < pend ; p++)
 
  706                                     CONJ (offik, Offx [p]) ;
 
  713                                 MULT_SUB (x [0], offik, X [2*i]) ;
 
  714                                 MULT_SUB (x [1], offik, X [2*i + 1]) ;
 
  717                             X [2*k + 1] = x [1] ;
 
  723                         for (k = k1 ; k < k2 ; k++)
 
  727                             x [1] = X [3*k + 1] ;
 
  728                             x [2] = X [3*k + 2] ;
 
  729                             for (p = Offp [k] ; p < pend ; p++)
 
  735                                     CONJ (offik, Offx [p]) ;
 
  742                                 MULT_SUB (x [0], offik, X [3*i]) ;
 
  743                                 MULT_SUB (x [1], offik, X [3*i + 1]) ;
 
  744                                 MULT_SUB (x [2], offik, X [3*i + 2]) ;
 
  747                             X [3*k + 1] = x [1] ;
 
  748                             X [3*k + 2] = x [2] ;
 
  754                         for (k = k1 ; k < k2 ; k++)
 
  758                             x [1] = X [4*k + 1] ;
 
  759                             x [2] = X [4*k + 2] ;
 
  760                             x [3] = X [4*k + 3] ;
 
  761                             for (p = Offp [k] ; p < pend ; p++)
 
  767                                     CONJ(offik, Offx [p]) ;
 
  774                                 MULT_SUB (x [0], offik, X [4*i]) ;
 
  775                                 MULT_SUB (x [1], offik, X [4*i + 1]) ;
 
  776                                 MULT_SUB (x [2], offik, X [4*i + 2]) ;
 
  777                                 MULT_SUB (x [3], offik, X [4*i + 3]) ;
 
  780                             X [4*k + 1] = x [1] ;
 
  781                             X [4*k + 2] = x [2] ;
 
  782                             X [4*k + 3] = x [3] ;
 
  797                     CONJ (s, Udiag [k1]) ;
 
  808                         DIV (X [k1], X [k1], s) ;
 
  812                         DIV (X [2*k1], X [2*k1], s) ;
 
  813                         DIV (X [2*k1 + 1], X [2*k1 + 1], s) ;
 
  817                         DIV (X [3*k1], X [3*k1], s) ;
 
  818                         DIV (X [3*k1 + 1], X [3*k1 + 1], s) ;
 
  819                         DIV (X [3*k1 + 2], X [3*k1 + 2], s) ;
 
  823                         DIV (X [4*k1], X [4*k1], s) ;
 
  824                         DIV (X [4*k1 + 1], X [4*k1 + 1], s) ;
 
  825                         DIV (X [4*k1 + 2], X [4*k1 + 2], s) ;
 
  826                         DIV (X [4*k1 + 3], X [4*k1 + 3], s) ;
 
  833                 KLU_utsolve (nk, Uip + k1, Ulen + k1, LUbx [block],
 
  839                 KLU_ltsolve (nk, Lip + k1, Llen + k1, LUbx [block], nr,
 
  861                     for (k = 0 ; k < n ; k++)
 
  863                         Xout  [Pnum [k]] = X [k] ;
 
  869                     for (k = 0 ; k < n ; k++)
 
  872                         Xout  [i      ] = X [2*k    ] ;
 
  873                         Xout  [i + d  ] = X [2*k + 1] ;
 
  879                     for (k = 0 ; k < n ; k++)
 
  882                         Xout  [i      ] = X [3*k    ] ;
 
  883                         Xout  [i + d  ] = X [3*k + 1] ;
 
  884                         Xout  [i + d*2] = X [3*k + 2] ;
 
  890                     for (k = 0 ; k < n ; k++)
 
  893                         Xout  [i      ] = X [4*k    ] ;
 
  894                         Xout  [i + d  ] = X [4*k + 1] ;
 
  895                         Xout  [i + d*2] = X [4*k + 2] ;
 
  896                         Xout  [i + d*3] = X [4*k + 3] ;
 
  910                     for (k = 0 ; k < n ; k++)
 
  912                         SCALE_DIV_ASSIGN (Xout [Pnum [k]], X [k], Rs [k]) ;
 
  918                     for (k = 0 ; k < n ; k++)
 
  922                         SCALE_DIV_ASSIGN (Xout [i], X [2*k], rs) ;
 
  923                         SCALE_DIV_ASSIGN (Xout [i + d], X [2*k + 1], rs) ;
 
  929                     for (k = 0 ; k < n ; k++)
 
  933                         SCALE_DIV_ASSIGN (Xout [i], X [3*k], rs) ;
 
  934                         SCALE_DIV_ASSIGN (Xout [i + d], X [3*k + 1], rs) ;
 
  935                         SCALE_DIV_ASSIGN (Xout [i + d*2], X [3*k + 2], rs) ;
 
  941                     for (k = 0 ; k < n ; k++)
 
  945                         SCALE_DIV_ASSIGN (Xout [i], X [4*k], rs) ;
 
  946                         SCALE_DIV_ASSIGN (Xout [i + d], X [4*k + 1], rs) ;
 
  947                         SCALE_DIV_ASSIGN (Xout [i + d*2], X [4*k + 2], rs) ;
 
  948                         SCALE_DIV_ASSIGN (Xout [i + d*3], X [4*k + 3], rs) ;