diff -pruN 6.1.0+dfsg1-1/CMakeLists.txt 6.1.1+dfsg1-1/CMakeLists.txt
--- 6.1.0+dfsg1-1/CMakeLists.txt	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/CMakeLists.txt	2019-02-08 16:30:10.000000000 +0000
@@ -11,7 +11,7 @@ cmake_minimum_required(VERSION 2.8.12 FA
 project(SuperLU_DIST C CXX)
 set(VERSION_MAJOR "6")
 set(VERSION_MINOR "1")
-set(VERSION_BugFix "0")
+set(VERSION_BugFix "1")
 set(PROJECT_VERSION ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_BugFix})
 
 list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
diff -pruN 6.1.0+dfsg1-1/debian/changelog 6.1.1+dfsg1-1/debian/changelog
--- 6.1.0+dfsg1-1/debian/changelog	2018-12-11 05:49:38.000000000 +0000
+++ 6.1.1+dfsg1-1/debian/changelog	2019-02-10 00:48:14.000000000 +0000
@@ -1,3 +1,11 @@
+superlu-dist (6.1.1+dfsg1-1) unstable; urgency=medium
+
+  * New upstream release.
+  * Standards-Version: 4.3.0
+  * debhelper compatibility level 12
+
+ -- Drew Parsons <dparsons@debian.org>  Sun, 10 Feb 2019 11:48:14 +1100
+
 superlu-dist (6.1.0+dfsg1-1) unstable; urgency=medium
 
   * New upstream release.
diff -pruN 6.1.0+dfsg1-1/debian/compat 6.1.1+dfsg1-1/debian/compat
--- 6.1.0+dfsg1-1/debian/compat	2018-12-11 05:49:38.000000000 +0000
+++ 6.1.1+dfsg1-1/debian/compat	1970-01-01 00:00:00.000000000 +0000
@@ -1 +0,0 @@
-11
diff -pruN 6.1.0+dfsg1-1/debian/control 6.1.1+dfsg1-1/debian/control
--- 6.1.0+dfsg1-1/debian/control	2018-12-11 05:49:38.000000000 +0000
+++ 6.1.1+dfsg1-1/debian/control	2019-02-10 00:48:14.000000000 +0000
@@ -2,13 +2,13 @@ Source: superlu-dist
 Priority: optional
 Maintainer: Debian Science Maintainers <debian-science-maintainers@lists.alioth.debian.org>
 Uploaders: Drew Parsons <dparsons@debian.org>
-Build-Depends: debhelper (>= 11),
+Build-Depends: debhelper-compat (= 12),
 	cmake (>= 2.8.12),
 	mpi-default-dev, mpi-default-bin,
 	gfortran,
 	libblas-dev | libopenblas-dev | libatlas-base-dev | libblas.so,
 	libcombblas-dev (>= 1.6.2-3)
-Standards-Version: 4.2.1
+Standards-Version: 4.3.0
 Section: libs
 Homepage: http://crd-legacy.lbl.gov/~xiaoye/SuperLU/#superlu_dist
 Vcs-Git: https://salsa.debian.org/science-team/superlu-dist.git
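
Note on the two packaging hunks above (annotation, not part of the patch): dropping debian/compat and switching the build-dependency to debhelper-compat (= 12) are halves of the same migration — the compat level is now declared once in debian/control, so the old file has to go in the same upload. A minimal sketch of checking the migrated tree; the grep/test commands are illustrative and not taken from this package:

    grep 'debhelper-compat' debian/control   # expect: Build-Depends: debhelper-compat (= 12),
    test ! -e debian/compat && echo "debian/compat dropped, as expected"
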
diff -pruN 6.1.0+dfsg1-1/run_cmake_build.sh 6.1.1+dfsg1-1/run_cmake_build.sh
--- 6.1.0+dfsg1-1/run_cmake_build.sh	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/run_cmake_build.sh	2019-02-08 16:30:10.000000000 +0000
@@ -43,10 +43,10 @@ THISHOST=`hostname -s`
 echo "host: $THISHOST"
 if [ "$THISHOST" == "ssg1" ]
 then
-  rm -fr ssg1-build; mkdir ssg1-build; cd ssg1-build;
-  export PARMETIS_ROOT=~/lib/static/parmetis-4.0.3 
-#  rm -fr int64-build; mkdir int64-build; cd int64-build;
-#  export PARMETIS_ROOT=~/lib/static/64-bit/parmetis-4.0.3 
+#  rm -fr ssg1-build; mkdir ssg1-build; cd ssg1-build;
+#  export PARMETIS_ROOT=~/lib/static/parmetis-4.0.3 
+  rm -fr int64-build; mkdir int64-build; cd int64-build;
+  export PARMETIS_ROOT=~/lib/static/64-bit/parmetis-4.0.3 
   export PARMETIS_BUILD_DIR=${PARMETIS_ROOT}/build/Linux-x86_64
   echo "ParMetis root: $PARMETIS_ROOT"
   cmake .. \
@@ -62,9 +62,9 @@ OOT}/Applications/BipartiteMatchings" \
     -DTPL_ENABLE_COMBBLASLIB=OFF \
     -DTPL_ENABLE_LAPACKLIB=OFF \
     -DBUILD_SHARED_LIBS=OFF \
+    -DXSDK_INDEX_SIZE=64 \
     -DCMAKE_INSTALL_PREFIX=.
 fi
-#    -DXSDK_INDEX_SIZE=64 \
 #   -DTPL_ENABLE_PARMETISLIB=OFF
 #    -DCMAKE_CXX_FLAGS="-std=c++11" \
 
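
The run_cmake_build.sh hunks above switch the maintainer's ssg1 build to the 64-bit-index (int64) configuration: the 32-bit ParMetis lines are commented out, the 64-bit ParMetis tree is selected instead, and -DXSDK_INDEX_SIZE=64 moves from the commented-out trailer into the live cmake invocation. A minimal sketch of the resulting configure step, abbreviated to the flags visible in this hunk and reusing the script's own paths:

    # int64 build: 64-bit ParMetis plus 64-bit SuperLU_DIST indices
    rm -fr int64-build; mkdir int64-build; cd int64-build
    export PARMETIS_ROOT=~/lib/static/64-bit/parmetis-4.0.3
    export PARMETIS_BUILD_DIR=${PARMETIS_ROOT}/build/Linux-x86_64
    cmake .. \
        -DTPL_ENABLE_COMBBLASLIB=OFF \
        -DTPL_ENABLE_LAPACKLIB=OFF \
        -DBUILD_SHARED_LIBS=OFF \
        -DXSDK_INDEX_SIZE=64 \
        -DCMAKE_INSTALL_PREFIX=.
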
diff -pruN 6.1.0+dfsg1-1/SRC/ddistribute.c 6.1.1+dfsg1-1/SRC/ddistribute.c
--- 6.1.0+dfsg1-1/SRC/ddistribute.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/ddistribute.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,16 +1,16 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
 
-/*! @file 
+/*! @file
  * \brief Distribute the matrix onto the 2D process mesh.
  *
  * <pre>
@@ -28,10 +28,10 @@ at the top-level directory.
  * Purpose
  * =======
  *   Distribute the matrix onto the 2D process mesh.
- * 
+ *
  * Arguments
  * =========
- * 
+ *
  * fact (input) fact_t
  *        Specifies whether or not the L and U structures will be re-used.
  *        = SamePattern_SameRowPerm: L and U structures are input, and
@@ -60,22 +60,22 @@ at the top-level directory.
  */
 
 float
-ddistribute(fact_t fact, int_t n, SuperMatrix *A, 
+ddistribute(fact_t fact, int_t n, SuperMatrix *A,
             Glu_freeable_t *Glu_freeable,
 	    LUstruct_t *LUstruct, gridinfo_t *grid)
 {
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
     LocalLU_t *Llu = LUstruct->Llu;
-    int_t bnnz, fsupc, fsupc1, i, ii, irow, istart, j, ib, jb, jj, k, k1, 
+    int_t bnnz, fsupc, fsupc1, i, ii, irow, istart, j, ib, jb, jj, k, k1,
           len, len1, nsupc;
 	int_t lib;  /* local block row number */
-	int_t nlb;  /* local block rows*/		  
+	int_t nlb;  /* local block rows*/
     int_t ljb;  /* local block column number */
     int_t nrbl; /* number of L blocks in current block column */
     int_t nrbu; /* number of U blocks in current block column */
     int_t gb;   /* global block number; 0 < gb <= nsuper */
     int_t lb;   /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */
-	int_t ub,gik,iklrow,fnz;    
+	int_t ub,gik,iklrow,fnz;
 	int iam, jbrow, kcol, krow, mycol, myrow, pc, pr;
     int_t mybufmax[NBUFFERS];
     NCPformat *Astore;
@@ -83,31 +83,31 @@ ddistribute(fact_t fact, int_t n, SuperM
     int_t *asub;
     int_t *xa_begin, *xa_end;
     int_t *xsup = Glu_persist->xsup;    /* supernode and column mapping */
-    int_t *supno = Glu_persist->supno;   
+    int_t *supno = Glu_persist->supno;
     int_t *lsub, *xlsub, *usub, *usub1, *xusub;
     int_t nsupers;
     int_t next_lind;      /* next available position in index[*] */
     int_t next_lval;      /* next available position in nzval[*] */
     int_t *index;         /* indices consist of headers and row subscripts */
-	int_t *index_srt;         /* indices consist of headers and row subscripts */     
+	int_t *index_srt;         /* indices consist of headers and row subscripts */
 	int   *index1;        /* temporary pointer to array of int */
     double *lusup, *lusup_srt, *uval; /* nonzero values in L and U */
     double **Lnzval_bc_ptr;  /* size ceil(NSUPERS/Pc) */
     int_t  **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */
-	int_t   **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc)                 */		    
-	int_t   *Unnz; /* size ceil(NSUPERS/Pc)                 */			
+	int_t   **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc)                 */
+	int_t   *Unnz; /* size ceil(NSUPERS/Pc)                 */
     double **Unzval_br_ptr;  /* size ceil(NSUPERS/Pr) */
     int_t  **Ufstnz_br_ptr;  /* size ceil(NSUPERS/Pr) */
 	BcTree  *LBtree_ptr;       /* size ceil(NSUPERS/Pc)                */
 	RdTree  *LRtree_ptr;		  /* size ceil(NSUPERS/Pr)                */
 	BcTree  *UBtree_ptr;       /* size ceil(NSUPERS/Pc)                */
-	RdTree  *URtree_ptr;		  /* size ceil(NSUPERS/Pr)                */	
+	RdTree  *URtree_ptr;		  /* size ceil(NSUPERS/Pr)                */
 	int msgsize;
 
     int_t  *Urbs,*Urbs1; /* Number of row blocks in each block column of U. */
     Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
-    int_t  **Ucb_valptr;      /* Vertical linked list pointing to Unzval[] */  	
-	
+    int_t  **Ucb_valptr;      /* Vertical linked list pointing to Unzval[] */
+
     /*-- Counts to be used in factorization. --*/
     int  *ToRecv, *ToSendD, **ToSendR;
 
@@ -123,7 +123,7 @@ ddistribute(fact_t fact, int_t n, SuperM
     int_t  **bsendx_plist; /* Column process list to send down Xk.   */
     int_t  nbrecvx = 0;    /* Number of Xk I will receive.           */
     int_t  nbsendx = 0;    /* Number of Xk I will send               */
-    int_t  *ilsum;         /* starting position of each supernode in 
+    int_t  *ilsum;         /* starting position of each supernode in
 			      the full array (local)                 */
 
     /*-- Auxiliary arrays; freed on return --*/
@@ -143,7 +143,7 @@ ddistribute(fact_t fact, int_t n, SuperM
 	int_t *idxs;
 	int_t **nzrows;
 	double rseed;
-	int rank_cnt,rank_cnt_ref,Root;    	
+	int rank_cnt,rank_cnt_ref,Root;
     double *dense, *dense_col; /* SPA */
     double zero = 0.0;
     int_t  ldaspa;     /* LDA of SPA */
@@ -154,18 +154,18 @@ ddistribute(fact_t fact, int_t n, SuperM
     int_t *frecv, *brecv, *lloc;
     double **Linv_bc_ptr;  /* size ceil(NSUPERS/Pc) */
     double **Uinv_bc_ptr;  /* size ceil(NSUPERS/Pc) */
-    double *SeedSTD_BC,*SeedSTD_RD;				 
+    double *SeedSTD_BC,*SeedSTD_RD;
     int_t idx_indx,idx_lusup;
     int_t nbrow;
     int_t  ik, il, lk, rel, knsupc, idx_r;
     int_t  lptr1_tmp, idx_i, idx_v,m, uu;
     int_t nub;
-    int tag;		
-	
+    int tag;
+
 #if ( PRNTlevel>=1 )
     int_t nLblocks = 0, nUblocks = 0;
 #endif
-#if ( PROFlevel>=1 ) 
+#if ( PROFlevel>=1 )
     double t, t_u, t_l;
     int_t u_blks;
 #endif
@@ -214,7 +214,7 @@ ddistribute(fact_t fact, int_t n, SuperM
 	Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
 	Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
 	Unzval_br_ptr = Llu->Unzval_br_ptr;
-	Unnz = Llu->Unnz;	
+	Unnz = Llu->Unnz;
 
 	mem_use += 2.0*nrbu*iword + ldaspa*sp_ienv_dist(3)*dword;
 
@@ -321,9 +321,9 @@ ddistribute(fact_t fact, int_t n, SuperM
 			   t_l, t_u, u_blks, nrbu);
 #endif
 
-    } else { 
+    } else {
         /* --------------------------------------------------
-         * FIRST TIME CREATING THE L AND U DATA STRUCTURE. 
+         * FIRST TIME CREATING THE L AND U DATA STRUCTURE.
          * -------------------------------------------------- */
 
 #if ( PROFlevel>=1 )
@@ -336,7 +336,7 @@ ddistribute(fact_t fact, int_t n, SuperM
 	xlsub = Glu_freeable->xlsub;
 	usub = Glu_freeable->usub;    /* compressed U subscripts */
 	xusub = Glu_freeable->xusub;
-    
+
 	if ( !(ToRecv = SUPERLU_MALLOC(nsupers * sizeof(int))) )
 	    ABORT("Malloc fails for ToRecv[].");
 	for (i = 0; i < nsupers; ++i) ToRecv[i] = 0;
@@ -355,12 +355,12 @@ ddistribute(fact_t fact, int_t n, SuperM
 	k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
 
 	/* Pointers to the beginning of each block row of U. */
-	if ( !(Unzval_br_ptr = 
+	if ( !(Unzval_br_ptr =
                (double**)SUPERLU_MALLOC(k * sizeof(double*))) )
 	    ABORT("Malloc fails for Unzval_br_ptr[].");
 	if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
 	    ABORT("Malloc fails for Ufstnz_br_ptr[].");
-	
+
 	if ( !(ToSendD = SUPERLU_MALLOC(k * sizeof(int))) )
 	    ABORT("Malloc fails for ToSendD[].");
 	for (i = 0; i < k; ++i) ToSendD[i] = NO;
@@ -393,13 +393,13 @@ ddistribute(fact_t fact, int_t n, SuperM
 		ilsum[lb + 1] = ilsum[lb] + i;
 	    }
 	}
-	
-            
+
+
 	/* ------------------------------------------------------------
 	   COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U.
 	   THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U).
 	   ------------------------------------------------------------*/
-	
+
 	/* Loop through each supernode column. */
 	for (jb = 0; jb < nsupers; ++jb) {
 	    pc = PCOL( jb, grid );
@@ -436,7 +436,7 @@ ddistribute(fact_t fact, int_t n, SuperM
 		} /* for i ... */
 	    } /* for j ... */
 	} /* for jb ... */
-	
+
 	/* Set up the initial pointers for each block row in U. */
 	nrbu = CEILING( nsupers, grid->nprow );/* Number of local block rows */
 	for (lb = 0; lb < nrbu; ++lb) {
@@ -494,7 +494,7 @@ ddistribute(fact_t fact, int_t n, SuperM
 	    ABORT("Calloc fails for fmod[].");
 	if ( !(bmod = intCalloc_dist(k)) )
 	    ABORT("Calloc fails for bmod[].");
-#if ( PRNTlevel>=1 )	
+#if ( PRNTlevel>=1 )
 	mem_use += 6.0*k*iword + ldaspa*sp_ienv_dist(3)*dword;
 #endif
 	k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
@@ -506,26 +506,26 @@ ddistribute(fact_t fact, int_t n, SuperM
 	    ABORT("Malloc fails for Lrowind_bc_ptr[].");
 	Lrowind_bc_ptr[k-1] = NULL;
 
-	if ( !(Lindval_loc_bc_ptr = 
+	if ( !(Lindval_loc_bc_ptr =
 				(int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
 		ABORT("Malloc fails for Lindval_loc_bc_ptr[].");
 	Lindval_loc_bc_ptr[k-1] = NULL;
 
-	if ( !(Linv_bc_ptr = 
+	if ( !(Linv_bc_ptr =
 				(double**)SUPERLU_MALLOC(k * sizeof(double*))) ) {
 		fprintf(stderr, "Malloc fails for Linv_bc_ptr[].");
-	}  
-	if ( !(Uinv_bc_ptr = 
+	}
+	if ( !(Uinv_bc_ptr =
 				(double**)SUPERLU_MALLOC(k * sizeof(double*))) ) {
 		fprintf(stderr, "Malloc fails for Uinv_bc_ptr[].");
-	}  
+	}
 	Linv_bc_ptr[k-1] = NULL;
-	Uinv_bc_ptr[k-1] = NULL;	
-	
-	if ( !(Unnz = 
+	Uinv_bc_ptr[k-1] = NULL;
+
+	if ( !(Unnz =
 			(int_t*)SUPERLU_MALLOC(k * sizeof(int_t))) )
-	ABORT("Malloc fails for Unnz[].");	
-	
+	ABORT("Malloc fails for Unnz[].");
+
 	/* These lists of processes will be used for triangular solves. */
 	if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
 	    ABORT("Malloc fails for fsendx_plist[].");
@@ -556,7 +556,7 @@ ddistribute(fact_t fact, int_t n, SuperM
 		fsupc = FstBlockC( jb );
 		nsupc = SuperSize( jb );
 		ljb = LBj( jb, grid ); /* Local block number */
-		
+
 		/* Scatter A into SPA. */
 		for (j = fsupc, dense_col = dense; j < FstBlockC( jb+1 ); ++j){
 		    for (i = xa_begin[j]; i < xa_end[j]; ++i) {
@@ -601,7 +601,7 @@ ddistribute(fact_t fact, int_t n, SuperM
 			    index = Ufstnz_br_ptr[lb];
 			    uval = Unzval_br_ptr[lb];
 			    fsupc1 = FstBlockC( gb+1 );
-			    if (rb_marker[lb] <= jb) { /* First time see 
+			    if (rb_marker[lb] <= jb) { /* First time see
 							  the block       */
 				rb_marker[lb] = jb + 1;
 				Urb_indptr[lb] = Urb_fstnz[lb];;
@@ -686,15 +686,15 @@ ddistribute(fact_t fact, int_t n, SuperM
 		} /* for i ... */
 
 		if ( nrbl ) { /* Do not ensure the blocks are sorted! */
-		    /* Set up the initial pointers for each block in 
+		    /* Set up the initial pointers for each block in
 		       index[] and nzval[]. */
 		    /* Add room for descriptors */
 		    len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
-			if ( !(index = intMalloc_dist(len1)) ) 
-				ABORT("Malloc fails for index[]");												 			 
+			if ( !(index = intMalloc_dist(len1)) )
+				ABORT("Malloc fails for index[]");
 			if (!(lusup = (double*)SUPERLU_MALLOC(len*nsupc * sizeof(double))))
-				ABORT("Malloc fails for lusup[]");			
-			if ( !(Lindval_loc_bc_ptr[ljb] = intCalloc_dist(nrbl*3) )) 
+				ABORT("Malloc fails for lusup[]");
+			if ( !(Lindval_loc_bc_ptr[ljb] = intCalloc_dist(nrbl*3) ))
 				ABORT("Malloc fails for Lindval_loc_bc_ptr[ljb][]");
 			if (!(Linv_bc_ptr[ljb] = (double*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(double))))
 				ABORT("Malloc fails for Linv_bc_ptr[ljb][]");
@@ -713,10 +713,10 @@ ddistribute(fact_t fact, int_t n, SuperM
 			len = Lrb_length[lb];
 			Lindval_loc_bc_ptr[ljb][k] = lb;
 			Lindval_loc_bc_ptr[ljb][k+nrbl] = next_lind;
-			Lindval_loc_bc_ptr[ljb][k+nrbl*2] = next_lval;				
+			Lindval_loc_bc_ptr[ljb][k+nrbl*2] = next_lval;
 			Lrb_length[lb] = 0;  /* Reset vector of block length */
 			index[next_lind++] = gb; /* Descriptor */
-			index[next_lind++] = len; 
+			index[next_lind++] = len;
 			Lrb_indptr[lb] = next_lind;
 			Lrb_valptr[lb] = next_lval;
 			next_lind += len;
@@ -744,7 +744,7 @@ ddistribute(fact_t fact, int_t n, SuperM
 			}
 		    } /* for i ... */
 			Lrowind_bc_ptr[ljb] = index;
-			Lnzval_bc_ptr[ljb] = lusup; 
+			Lnzval_bc_ptr[ljb] = lusup;
 
 
 			/* sort Lindval_loc_bc_ptr[ljb], Lrowind_bc_ptr[ljb] and Lnzval_bc_ptr[ljb] here*/
@@ -754,15 +754,15 @@ ddistribute(fact_t fact, int_t n, SuperM
 					uu=nrbl-2;
 					lloc = &Lindval_loc_bc_ptr[ljb][1];
 				}else{
-					uu=nrbl-1;	
+					uu=nrbl-1;
 					lloc = Lindval_loc_bc_ptr[ljb];
-				}	
-				quickSortM(lloc,0,uu,nrbl,0,3);	
+				}
+				quickSortM(lloc,0,uu,nrbl,0,3);
 			}
 
 
-			if ( !(index_srt = intMalloc_dist(len1)) ) 
-				ABORT("Malloc fails for index_srt[]");				
+			if ( !(index_srt = intMalloc_dist(len1)) )
+				ABORT("Malloc fails for index_srt[]");
 			if (!(lusup_srt = (double*)SUPERLU_MALLOC(len*nsupc * sizeof(double))))
 				ABORT("Malloc fails for lusup_srt[]");
 
@@ -777,26 +777,26 @@ ddistribute(fact_t fact, int_t n, SuperM
 					index_srt[idx_indx++] = index[Lindval_loc_bc_ptr[ljb][i+nrbl]+jj];
 				}
 
-				Lindval_loc_bc_ptr[ljb][i+nrbl] = idx_indx - LB_DESCRIPTOR - nbrow; 
+				Lindval_loc_bc_ptr[ljb][i+nrbl] = idx_indx - LB_DESCRIPTOR - nbrow;
 
 				for (jj=0;jj<nbrow;jj++){
 					k=idx_lusup;
 					k1=Lindval_loc_bc_ptr[ljb][i+nrbl*2]+jj;
-					for (j = 0; j < nsupc; ++j) {				
+					for (j = 0; j < nsupc; ++j) {
 						lusup_srt[k] = lusup[k1];
 						k += len;
 						k1 += len;
-					}	
+					}
 					idx_lusup++;
-				}				
-				Lindval_loc_bc_ptr[ljb][i+nrbl*2] = idx_lusup - nbrow;	
+				}
+				Lindval_loc_bc_ptr[ljb][i+nrbl*2] = idx_lusup - nbrow;
 			}
 
 			SUPERLU_FREE(lusup);
 			SUPERLU_FREE(index);
 
 			Lrowind_bc_ptr[ljb] = index_srt;
-			Lnzval_bc_ptr[ljb] = lusup_srt; 			
+			Lnzval_bc_ptr[ljb] = lusup_srt;
 
 			// if(ljb==0)
 			// for (jj=0;jj<nrbl*3;jj++){
@@ -805,15 +805,15 @@ ddistribute(fact_t fact, int_t n, SuperM
 			// }
 			// for (jj=0;jj<nrbl;jj++){
 			// printf("iam %5d Lindval %5d\n",iam, index[Lindval_loc_bc_ptr[ljb][jj+nrbl]]);
-			// fflush(stdout);			
+			// fflush(stdout);
 
-			// }	
+			// }
 		} else {
 		    Lrowind_bc_ptr[ljb] = NULL;
 		    Lnzval_bc_ptr[ljb] = NULL;
 			Linv_bc_ptr[ljb] = NULL;
 			Uinv_bc_ptr[ljb] = NULL;
-			Lindval_loc_bc_ptr[ljb] = NULL;			
+			Lindval_loc_bc_ptr[ljb] = NULL;
 		} /* if nrbl ... */
 #if ( PROFlevel>=1 )
 		t_l += SuperLU_timer_() - t;
@@ -823,7 +823,7 @@ ddistribute(fact_t fact, int_t n, SuperM
 	} /* for jb ... */
 
 	/////////////////////////////////////////////////////////////////
-	
+
 	/* Set up additional pointers for the index and value arrays of U.
 	   nub is the number of local block columns. */
 	nub = CEILING( nsupers, grid->npcol); /* Number of local block columns. */
@@ -837,7 +837,7 @@ ddistribute(fact_t fact, int_t n, SuperM
 		ABORT("Malloc fails for Ucb_valptr[]");
 	nlb = CEILING( nsupers, grid->nprow ); /* Number of local block rows. */
 
-	/* Count number of row blocks in a block column. 
+	/* Count number of row blocks in a block column.
 	   One pass of the skeleton graph of U. */
 	for (lk = 0; lk < nlb; ++lk) {
 		usub1 = Ufstnz_br_ptr[lk];
@@ -876,20 +876,20 @@ ddistribute(fact_t fact, int_t n, SuperM
 
 				Ucb_indptr[ljb][Urbs1[ljb]].indpos = i;
 				Ucb_valptr[ljb][Urbs1[ljb]] = j;
-				
+
 				++Urbs1[ljb];
 				j += usub1[i+1];
 				i += UB_DESCRIPTOR + SuperSize( k );
 			}
 		}
-	}				
-	
+	}
+
 
-/* Count the nnzs per block column */	
+/* Count the nnzs per block column */
 	for (lb = 0; lb < nub; ++lb) {
 		Unnz[lb] = 0;
 		k = lb * grid->npcol + mycol;/* Global block number, column-wise. */
-		knsupc = SuperSize( k );	
+		knsupc = SuperSize( k );
 		for (ub = 0; ub < Urbs[lb]; ++ub) {
 			ik = Ucb_indptr[lb][ub].lbnum; /* Local block number, row-wise. */
 			i = Ucb_indptr[lb][ub].indpos; /* Start of the block in usub[]. */
@@ -903,40 +903,40 @@ ddistribute(fact_t fact, int_t n, SuperM
 				}
 			} /* for jj ... */
 		}
-	}			
-	
+	}
+
 	/////////////////////////////////////////////////////////////////
 
 #if ( PROFlevel>=1 )
 		t = SuperLU_timer_();
-#endif				
+#endif
 	/* construct the Bcast tree for L ... */
 
 	k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
 	if ( !(LBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) )
 		ABORT("Malloc fails for LBtree_ptr[].");
 	if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) )
-		ABORT("Calloc fails for ActiveFlag[].");	
+		ABORT("Calloc fails for ActiveFlag[].");
 	if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) )
-		ABORT("Malloc fails for ranks[].");	
+		ABORT("Malloc fails for ranks[].");
 	if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) )
-		ABORT("Malloc fails for SeedSTD_BC[].");	
+		ABORT("Malloc fails for SeedSTD_BC[].");
+
 
-		
 	for (i=0;i<k;i++){
-		SeedSTD_BC[i]=rand();		
+		SeedSTD_BC[i]=rand();
 	}
 
-	MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_BC[0],k,MPI_DOUBLE,MPI_MAX,grid->cscp.comm);					  
+	MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_BC[0],k,MPI_DOUBLE,MPI_MAX,grid->cscp.comm);
 
 	for (ljb = 0; ljb <k ; ++ljb) {
 		LBtree_ptr[ljb]=NULL;
-	}			
-	
+	}
+
 
 	if ( !(ActiveFlagAll = intMalloc_dist(grid->nprow*k)) )
-		ABORT("Calloc fails for ActiveFlag[].");				
-	for (j=0;j<grid->nprow*k;++j)ActiveFlagAll[j]=3*nsupers;	
+		ABORT("Calloc fails for ActiveFlag[].");
+	for (j=0;j<grid->nprow*k;++j)ActiveFlagAll[j]=3*nsupers;
 	for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */
 		jb = mycol+ljb*grid->npcol;  /* not sure */
 		if(jb<nsupers){
@@ -952,10 +952,10 @@ ddistribute(fact_t fact, int_t n, SuperM
 			ActiveFlagAll[pr+ljb*grid->nprow]=SUPERLU_MIN(ActiveFlagAll[pr+ljb*grid->nprow],gb);
 		} /* for j ... */
 		}
-	}			
-	
+	}
+
 	for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */
-		
+
 		jb = mycol+ljb*grid->npcol;  /* not sure */
 		if(jb<nsupers){
 		pc = PCOL( jb, grid );
@@ -964,19 +964,19 @@ ddistribute(fact_t fact, int_t n, SuperM
 		for (j=0;j<grid->nprow;++j)ActiveFlag[j+grid->nprow]=j;
 		for (j=0;j<grid->nprow;++j)ranks[j]=-1;
 
-		Root=-1; 
-		Iactive = 0;				
+		Root=-1;
+		Iactive = 0;
 		for (j=0;j<grid->nprow;++j){
 			if(ActiveFlag[j]!=3*nsupers){
 			gb = ActiveFlag[j];
 			pr = PROW( gb, grid );
 			if(gb==jb)Root=pr;
-			if(myrow==pr)Iactive=1;		
-			}					
+			if(myrow==pr)Iactive=1;
+			}
 		}
-		
 
-		quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,0,2);	
+
+		quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,0,2);
 
 		if(Iactive==1){
 			// printf("jb %5d damn\n",jb);
@@ -989,7 +989,7 @@ ddistribute(fact_t fact, int_t n, SuperM
 					ranks[rank_cnt]=ActiveFlag[j+grid->nprow];
 					++rank_cnt;
 				}
-			}		
+			}
 
 			if(rank_cnt>1){
 
@@ -999,7 +999,7 @@ ddistribute(fact_t fact, int_t n, SuperM
 				// rseed=rand();
 				// rseed=1.0;
 				msgsize = SuperSize( jb );
-				LBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'d');  	
+				LBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'d');
 				BcTree_SetTag(LBtree_ptr[ljb],BC_L,'d');
 
 				// printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt);
@@ -1010,15 +1010,15 @@ ddistribute(fact_t fact, int_t n, SuperM
 				// fflush(stdout);
 				// }
 
-				// #if ( PRNTlevel>=1 )		
+				// #if ( PRNTlevel>=1 )
 				if(Root==myrow){
 					rank_cnt_ref=1;
 					for (j = 0; j < grid->nprow; ++j) {
-						if ( fsendx_plist[ljb][j] != EMPTY ) {	
-							++rank_cnt_ref;		
+						if ( fsendx_plist[ljb][j] != EMPTY ) {
+							++rank_cnt_ref;
 						}
 					}
-					assert(rank_cnt==rank_cnt_ref);		
+					assert(rank_cnt==rank_cnt_ref);
 
 					// printf("Partial Bcast Procs: col%7d np%4d\n",jb,rank_cnt);
 
@@ -1027,27 +1027,27 @@ ddistribute(fact_t fact, int_t n, SuperM
 					// // printf("\n");
 				}
 				// #endif
-			}	
+			}
 		}
 		}
 	}
 
-	
+
 	SUPERLU_FREE(ActiveFlag);
 	SUPERLU_FREE(ActiveFlagAll);
 	SUPERLU_FREE(ranks);
 	SUPERLU_FREE(SeedSTD_BC);
-	
-	
+
+
 #if ( PROFlevel>=1 )
 t = SuperLU_timer_() - t;
 if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t);
-#endif			
+#endif
 
 
 #if ( PROFlevel>=1 )
 		t = SuperLU_timer_();
-#endif			
+#endif
 	/* construct the Reduce tree for L ... */
 	/* the following is used as reference */
 	nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */
@@ -1076,24 +1076,24 @@ if ( !iam) printf(".. Construct Bcast tr
 	if ( !(LRtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) )
 		ABORT("Malloc fails for LRtree_ptr[].");
 	if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) )
-		ABORT("Calloc fails for ActiveFlag[].");	
+		ABORT("Calloc fails for ActiveFlag[].");
 	if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) )
-		ABORT("Malloc fails for ranks[].");	
+		ABORT("Malloc fails for ranks[].");
 
 	// if ( !(idxs = intCalloc_dist(nsupers)) )
-		// ABORT("Calloc fails for idxs[].");	
+		// ABORT("Calloc fails for idxs[].");
 
 	// if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) )
 		// ABORT("Malloc fails for nzrows[].");
 
 	if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) )
-		ABORT("Malloc fails for SeedSTD_RD[].");	
+		ABORT("Malloc fails for SeedSTD_RD[].");
 
 	for (i=0;i<k;i++){
-		SeedSTD_RD[i]=rand();		
+		SeedSTD_RD[i]=rand();
 	}
 
-	MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_RD[0],k,MPI_DOUBLE,MPI_MAX,grid->rscp.comm);					  
+	MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_RD[0],k,MPI_DOUBLE,MPI_MAX,grid->rscp.comm);
 
 
 	// for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */
@@ -1119,11 +1119,11 @@ if ( !iam) printf(".. Construct Bcast tr
 		LRtree_ptr[lib]=NULL;
 	}
 
-	
+
 	if ( !(ActiveFlagAll = intMalloc_dist(grid->npcol*k)) )
-		ABORT("Calloc fails for ActiveFlagAll[].");				
-	for (j=0;j<grid->npcol*k;++j)ActiveFlagAll[j]=-3*nsupers;	
-				
+		ABORT("Calloc fails for ActiveFlagAll[].");
+	for (j=0;j<grid->npcol*k;++j)ActiveFlagAll[j]=-3*nsupers;
+
 	for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */
 		fsupc = FstBlockC( jb );
 		pc = PCOL( jb, grid );
@@ -1138,7 +1138,7 @@ if ( !iam) printf(".. Construct Bcast tr
 		}
 	}
 
-	
+
 	for (lib=0;lib<k;++lib){
 		ib = myrow+lib*grid->nprow;  /* not sure */
 		if(ib<nsupers){
@@ -1146,19 +1146,19 @@ if ( !iam) printf(".. Construct Bcast tr
 			for (j=0;j<grid->npcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];;
 			for (j=0;j<grid->npcol;++j)ActiveFlag[j+grid->npcol]=j;
 			for (j=0;j<grid->npcol;++j)ranks[j]=-1;
-			Root=-1; 
-			Iactive = 0;				
+			Root=-1;
+			Iactive = 0;
 
 			for (j=0;j<grid->npcol;++j){
 				if(ActiveFlag[j]!=-3*nsupers){
 				jb = ActiveFlag[j];
 				pc = PCOL( jb, grid );
 				if(jb==ib)Root=pc;
-				if(mycol==pc)Iactive=1;		
-				}					
+				if(mycol==pc)Iactive=1;
+				}
 			}
-		
-		
+
+
 			quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,1,2);
 
 			if(Iactive==1){
@@ -1174,7 +1174,7 @@ if ( !iam) printf(".. Construct Bcast tr
 				if(rank_cnt>1){
 
 					for (ii=0;ii<rank_cnt;ii++)   // use global ranks rather than local ranks
-						ranks[ii] = PNUM( pr, ranks[ii], grid );		
+						ranks[ii] = PNUM( pr, ranks[ii], grid );
 
 					// rseed=rand();
 					// rseed=1.0;
@@ -1182,7 +1182,7 @@ if ( !iam) printf(".. Construct Bcast tr
 
 					// if(ib==0){
 
-					LRtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'d');  	
+					LRtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'d');
 					RdTree_SetTag(LRtree_ptr[lib], RD_L,'d');
 					// }
 
@@ -1194,7 +1194,7 @@ if ( !iam) printf(".. Construct Bcast tr
 					// if(iam==15 || iam==3){
 					// printf("iam %5d rtree lk %5d tag %5d root %5d\n",iam,lib,ib,RdTree_IsRoot(LRtree_ptr[lib],'d'));
 					// fflush(stdout);
-					// }		
+					// }
 
 
 					// #if ( PRNTlevel>=1 )
@@ -1205,10 +1205,10 @@ if ( !iam) printf(".. Construct Bcast tr
 					// // // for(j=0;j<rank_cnt;++j)printf("%4d",ranks[j]);
 					// // printf("\n");
 					// }
-					// #endif		
+					// #endif
 				}
-			}				
-		}	
+			}
+		}
 	}
 
 	SUPERLU_FREE(mod_bit);
@@ -1217,9 +1217,9 @@ if ( !iam) printf(".. Construct Bcast tr
 
 	SUPERLU_FREE(ActiveFlag);
 	SUPERLU_FREE(ActiveFlagAll);
-	SUPERLU_FREE(ranks);	
-	// SUPERLU_FREE(idxs);	 
-	SUPERLU_FREE(SeedSTD_RD);	
+	SUPERLU_FREE(ranks);
+	// SUPERLU_FREE(idxs);
+	SUPERLU_FREE(SeedSTD_RD);
 	// for(i=0;i<nsupers;++i){
 		// if(nzrows[i])SUPERLU_FREE(nzrows[i]);
 	// }
@@ -1230,11 +1230,11 @@ if ( !iam) printf(".. Construct Bcast tr
 #if ( PROFlevel>=1 )
 t = SuperLU_timer_() - t;
 if ( !iam) printf(".. Construct Reduce tree for L: %.2f\t\n", t);
-#endif					
+#endif
 
 #if ( PROFlevel>=1 )
 	t = SuperLU_timer_();
-#endif	
+#endif
 
 	/* construct the Bcast tree for U ... */
 
@@ -1242,27 +1242,27 @@ if ( !iam) printf(".. Construct Reduce t
 	if ( !(UBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) )
 		ABORT("Malloc fails for UBtree_ptr[].");
 	if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) )
-		ABORT("Calloc fails for ActiveFlag[].");	
+		ABORT("Calloc fails for ActiveFlag[].");
 	if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) )
-		ABORT("Malloc fails for ranks[].");	
+		ABORT("Malloc fails for ranks[].");
 	if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) )
-		ABORT("Malloc fails for SeedSTD_BC[].");	
+		ABORT("Malloc fails for SeedSTD_BC[].");
 
 	for (i=0;i<k;i++){
-		SeedSTD_BC[i]=rand();		
+		SeedSTD_BC[i]=rand();
 	}
 
-	MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_BC[0],k,MPI_DOUBLE,MPI_MAX,grid->cscp.comm);					  
+	MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_BC[0],k,MPI_DOUBLE,MPI_MAX,grid->cscp.comm);
 
 
 	for (ljb = 0; ljb <k ; ++ljb) {
 		UBtree_ptr[ljb]=NULL;
-	}	
+	}
 
 	if ( !(ActiveFlagAll = intMalloc_dist(grid->nprow*k)) )
-		ABORT("Calloc fails for ActiveFlagAll[].");				
-	for (j=0;j<grid->nprow*k;++j)ActiveFlagAll[j]=-3*nsupers;	
-	
+		ABORT("Calloc fails for ActiveFlagAll[].");
+	for (j=0;j<grid->nprow*k;++j)ActiveFlagAll[j]=-3*nsupers;
+
 	for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */
 		jb = mycol+ljb*grid->npcol;  /* not sure */
 		if(jb<nsupers){
@@ -1279,21 +1279,21 @@ if ( !iam) printf(".. Construct Reduce t
 				pr = PROW( gb, grid );
 				ActiveFlagAll[pr+ljb*grid->nprow]=SUPERLU_MAX(ActiveFlagAll[pr+ljb*grid->nprow],gb);
 			// printf("gb:%5d jb: %5d nsupers: %5d\n",gb,jb,nsupers);
-			// fflush(stdout);								
+			// fflush(stdout);
 				//if(gb==jb)Root=pr;
 			}
-			
-			
+
+
 		}
 		pr = PROW( jb, grid ); // take care of diagonal node stored as L
 		// printf("jb %5d current: %5d",jb,ActiveFlagAll[pr+ljb*grid->nprow]);
 		// fflush(stdout);
-		ActiveFlagAll[pr+ljb*grid->nprow]=SUPERLU_MAX(ActiveFlagAll[pr+ljb*grid->nprow],jb);	
+		ActiveFlagAll[pr+ljb*grid->nprow]=SUPERLU_MAX(ActiveFlagAll[pr+ljb*grid->nprow],jb);
 		}
-	}	
-		
-		
-		
+	}
+
+
+
 	for (ljb = 0; ljb < k; ++ljb) { /* for each block column ... */
 		jb = mycol+ljb*grid->npcol;  /* not sure */
 		if(jb<nsupers){
@@ -1304,18 +1304,18 @@ if ( !iam) printf(".. Construct Reduce t
 		for (j=0;j<grid->nprow;++j)ActiveFlag[j+grid->nprow]=j;
 		for (j=0;j<grid->nprow;++j)ranks[j]=-1;
 
-		Root=-1; 
-		Iactive = 0;				
+		Root=-1;
+		Iactive = 0;
 		for (j=0;j<grid->nprow;++j){
 			if(ActiveFlag[j]!=-3*nsupers){
 			gb = ActiveFlag[j];
 			pr = PROW( gb, grid );
 			if(gb==jb)Root=pr;
-			if(myrow==pr)Iactive=1;		
+			if(myrow==pr)Iactive=1;
 			}
-		}						
-		
-		quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,1,2);	
+		}
+
+		quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,1,2);
 	// printf("jb: %5d Iactive %5d\n",jb,Iactive);
 	// fflush(stdout);
 		if(Iactive==1){
@@ -1329,7 +1329,7 @@ if ( !iam) printf(".. Construct Reduce t
 					ranks[rank_cnt]=ActiveFlag[j+grid->nprow];
 					++rank_cnt;
 				}
-			}		
+			}
 	// printf("jb: %5d rank_cnt %5d\n",jb,rank_cnt);
 	// fflush(stdout);
 			if(rank_cnt>1){
@@ -1339,42 +1339,42 @@ if ( !iam) printf(".. Construct Reduce t
 				// rseed=rand();
 				// rseed=1.0;
 				msgsize = SuperSize( jb );
-				UBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'d');  	
+				UBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'d');
 				BcTree_SetTag(UBtree_ptr[ljb],BC_U,'d');
 
 				// printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt);
 				// fflush(stdout);
-				
+
 				if(Root==myrow){
 				rank_cnt_ref=1;
 				for (j = 0; j < grid->nprow; ++j) {
 					// printf("ljb %5d j %5d nprow %5d\n",ljb,j,grid->nprow);
 					// fflush(stdout);
-					if ( bsendx_plist[ljb][j] != EMPTY ) {	
-						++rank_cnt_ref;		
+					if ( bsendx_plist[ljb][j] != EMPTY ) {
+						++rank_cnt_ref;
 					}
 				}
 				// printf("ljb %5d rank_cnt %5d rank_cnt_ref %5d\n",ljb,rank_cnt,rank_cnt_ref);
-				// fflush(stdout);								
-				assert(rank_cnt==rank_cnt_ref);		
-				}						
+				// fflush(stdout);
+				assert(rank_cnt==rank_cnt_ref);
+				}
 			}
 		}
 		}
-	}	
+	}
 	SUPERLU_FREE(ActiveFlag);
 	SUPERLU_FREE(ActiveFlagAll);
-	SUPERLU_FREE(ranks);				
-	SUPERLU_FREE(SeedSTD_BC);				
-		
+	SUPERLU_FREE(ranks);
+	SUPERLU_FREE(SeedSTD_BC);
+
 #if ( PROFlevel>=1 )
 t = SuperLU_timer_() - t;
 if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t);
-#endif					
+#endif
 
 #if ( PROFlevel>=1 )
 		t = SuperLU_timer_();
-#endif					
+#endif
 	/* construct the Reduce tree for U ... */
 	/* the following is used as reference */
 	nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */
@@ -1403,46 +1403,46 @@ if ( !iam) printf(".. Construct Bcast tr
 	if ( !(URtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) )
 		ABORT("Malloc fails for URtree_ptr[].");
 	if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) )
-		ABORT("Calloc fails for ActiveFlag[].");	
+		ABORT("Calloc fails for ActiveFlag[].");
 	if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) )
-		ABORT("Malloc fails for ranks[].");	
+		ABORT("Malloc fails for ranks[].");
 
 	// if ( !(idxs = intCalloc_dist(nsupers)) )
-		// ABORT("Calloc fails for idxs[].");	
+		// ABORT("Calloc fails for idxs[].");
 
 	// if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) )
 		// ABORT("Malloc fails for nzrows[].");
 
 	if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) )
-		ABORT("Malloc fails for SeedSTD_RD[].");	
+		ABORT("Malloc fails for SeedSTD_RD[].");
 
 	for (i=0;i<k;i++){
-		SeedSTD_RD[i]=rand();		
+		SeedSTD_RD[i]=rand();
 	}
 
-	MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_RD[0],k,MPI_DOUBLE,MPI_MAX,grid->rscp.comm);					  
+	MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_RD[0],k,MPI_DOUBLE,MPI_MAX,grid->rscp.comm);
 
 
 	// for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */
 		// fsupc = FstBlockC( jb );
-		// len=0;  
+		// len=0;
 		// for (j = fsupc; j < FstBlockC( jb+1 ); ++j) {
 			// istart = xusub[j];
 			// /* NOTE: Only the first nonzero index of the segment
 			   // is stored in usub[]. */
-			// len +=  xusub[j+1] - xusub[j];  
-		// }	
-				
+			// len +=  xusub[j+1] - xusub[j];
+		// }
+
 		// idxs[jb] = len-1;
 
 		// if(len>0){
 			// if ( !(nzrows[jb] = intMalloc_dist(len)) )
 				// ABORT("Malloc fails for nzrows[jb]");
-			
+
 			// fsupc = FstBlockC( jb );
-			
-			// len=0; 
-			
+
+			// len=0;
+
 			// for (j = fsupc; j < FstBlockC( jb+1 ); ++j) {
 				// istart = xusub[j];
 				// /* NOTE: Only the first nonzero index of the segment
@@ -1452,28 +1452,28 @@ if ( !iam) printf(".. Construct Bcast tr
 					// nzrows[jb][len]=irow;
 					// len++;
 				// }
-			// }	
+			// }
 			// quickSort(nzrows[jb],0,len-1,0);
 		// }
 		// else{
 			// nzrows[jb] = NULL;
 		// }
 	// }
-	
+
 
 	for (lib = 0; lib <k ; ++lib) {
 		URtree_ptr[lib]=NULL;
 	}
 
-	
+
 	if ( !(ActiveFlagAll = intMalloc_dist(grid->npcol*k)) )
-		ABORT("Calloc fails for ActiveFlagAll[].");				
-	for (j=0;j<grid->npcol*k;++j)ActiveFlagAll[j]=3*nsupers;	
-				
+		ABORT("Calloc fails for ActiveFlagAll[].");
+	for (j=0;j<grid->npcol*k;++j)ActiveFlagAll[j]=3*nsupers;
+
 	for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */
 		fsupc = FstBlockC( jb );
 		pc = PCOL( jb, grid );
-		
+
 		fsupc = FstBlockC( jb );
 		for (j = fsupc; j < FstBlockC( jb+1 ); ++j) {
 			istart = xusub[j];
@@ -1486,17 +1486,17 @@ if ( !iam) printf(".. Construct Bcast tr
 				if ( myrow == pr ) { /* Block row ib in my process row */
 					lib = LBi( ib, grid ); /* Local block number */
 					ActiveFlagAll[pc+lib*grid->npcol]=SUPERLU_MIN(ActiveFlagAll[pc+lib*grid->npcol],jb);
-				}						
+				}
 			}
 		}
-		
+
 		pr = PROW( jb, grid );
 		if ( myrow == pr ) { /* Block row ib in my process row */
 			lib = LBi( jb, grid ); /* Local block number */
 			ActiveFlagAll[pc+lib*grid->npcol]=SUPERLU_MIN(ActiveFlagAll[pc+lib*grid->npcol],jb);
-		}					
+		}
 	}
-		
+
 
 	for (lib=0;lib<k;++lib){
 		ib = myrow+lib*grid->nprow;  /* not sure */
@@ -1505,18 +1505,18 @@ if ( !iam) printf(".. Construct Bcast tr
 			for (j=0;j<grid->npcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];;
 			for (j=0;j<grid->npcol;++j)ActiveFlag[j+grid->npcol]=j;
 			for (j=0;j<grid->npcol;++j)ranks[j]=-1;
-			Root=-1; 
-			Iactive = 0;				
+			Root=-1;
+			Iactive = 0;
 
 			for (j=0;j<grid->npcol;++j){
 				if(ActiveFlag[j]!=3*nsupers){
 				jb = ActiveFlag[j];
 				pc = PCOL( jb, grid );
 				if(jb==ib)Root=pc;
-				if(mycol==pc)Iactive=1;		
-				}					
+				if(mycol==pc)Iactive=1;
+				}
 			}
-			
+
 			quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,0,2);
 
 			if(Iactive==1){
@@ -1532,7 +1532,7 @@ if ( !iam) printf(".. Construct Bcast tr
 				if(rank_cnt>1){
 
 					for (ii=0;ii<rank_cnt;ii++)   // use global ranks rather than local ranks
-						ranks[ii] = PNUM( pr, ranks[ii], grid );		
+						ranks[ii] = PNUM( pr, ranks[ii], grid );
 
 					// rseed=rand();
 					// rseed=1.0;
@@ -1540,7 +1540,7 @@ if ( !iam) printf(".. Construct Bcast tr
 
 					// if(ib==0){
 
-					URtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'d');  	
+					URtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'d');
 					RdTree_SetTag(URtree_ptr[lib], RD_U,'d');
 					// }
 
@@ -1554,10 +1554,10 @@ if ( !iam) printf(".. Construct Bcast tr
 					// // for(j=0;j<rank_cnt;++j)printf("%4d",ranks[j]);
 					// printf("\n");
 					}
-					// #endif		
+					// #endif
 				}
 			}
-		}						
+		}
 	}
 	SUPERLU_FREE(mod_bit);
 	SUPERLU_FREE(brecv);
@@ -1565,24 +1565,24 @@ if ( !iam) printf(".. Construct Bcast tr
 
 	SUPERLU_FREE(ActiveFlag);
 	SUPERLU_FREE(ActiveFlagAll);
-	SUPERLU_FREE(ranks);	
-	// SUPERLU_FREE(idxs);	
-	SUPERLU_FREE(SeedSTD_RD);	
+	SUPERLU_FREE(ranks);
+	// SUPERLU_FREE(idxs);
+	SUPERLU_FREE(SeedSTD_RD);
 	// for(i=0;i<nsupers;++i){
 		// if(nzrows[i])SUPERLU_FREE(nzrows[i]);
 	// }
-	// SUPERLU_FREE(nzrows);				
-		
+	// SUPERLU_FREE(nzrows);
+
 #if ( PROFlevel>=1 )
 t = SuperLU_timer_() - t;
 if ( !iam) printf(".. Construct Reduce tree for U: %.2f\t\n", t);
-#endif						
-		
+#endif
+
 	////////////////////////////////////////////////////////
-	
-	
+
+
 	Llu->Lrowind_bc_ptr = Lrowind_bc_ptr;
-	Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr;  
+	Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr;
 	Llu->Lnzval_bc_ptr = Lnzval_bc_ptr;
 	Llu->Ufstnz_br_ptr = Ufstnz_br_ptr;
 	Llu->Unzval_br_ptr = Unzval_br_ptr;
@@ -1605,11 +1605,11 @@ if ( !iam) printf(".. Construct Reduce t
 	Llu->URtree_ptr = URtree_ptr;
 	Llu->UBtree_ptr = UBtree_ptr;
 	Llu->Linv_bc_ptr = Linv_bc_ptr;
-	Llu->Uinv_bc_ptr = Uinv_bc_ptr;	
-	Llu->Urbs = Urbs; 
-	Llu->Ucb_indptr = Ucb_indptr; 
-	Llu->Ucb_valptr = Ucb_valptr; 	
-	
+	Llu->Uinv_bc_ptr = Uinv_bc_ptr;
+	Llu->Urbs = Urbs;
+	Llu->Ucb_indptr = Ucb_indptr;
+	Llu->Ucb_valptr = Ucb_valptr;
+
 #if ( PRNTlevel>=1 )
 	if ( !iam ) printf(".. # L blocks " IFMT "\t# U blocks " IFMT "\n",
 			   nLblocks, nUblocks);
@@ -1630,8 +1630,7 @@ if ( !iam) printf(".. Construct Reduce t
 	    ABORT("Malloc fails for mod_bit[].");
 
 	/* Find the maximum buffer size. */
-	MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, 
-		      MPI_MAX, grid->comm);
+	MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, MPI_MAX, grid->comm);
 
 #if ( PROFlevel>=1 )
 	if ( !iam ) printf(".. 1st distribute time:\n "
diff -pruN 6.1.0+dfsg1-1/SRC/dldperm_dist.c 6.1.1+dfsg1-1/SRC/dldperm_dist.c
--- 6.1.0+dfsg1-1/SRC/dldperm_dist.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/dldperm_dist.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,16 +1,16 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
 
-/*! @file 
+/*! @file
  * \brief Finds a row permutation so that the matrix has large entries on the diagonal
  *
  * <pre>
@@ -44,7 +44,7 @@ extern void mc64ad_dist(int_t*, int_t*,
  *              permuted matrix has as many entries on its diagonal as
  *              possible. The values on the diagonal are of arbitrary size.
  *              HSL subroutine MC21A/AD is used for this.
- *        = 2 : Compute a row permutation of the matrix so that the smallest 
+ *        = 2 : Compute a row permutation of the matrix so that the smallest
  *              value on the diagonal of the permuted matrix is maximized.
  *        = 3 : Compute a row permutation of the matrix so that the smallest
  *              value on the diagonal of the permuted matrix is maximized.
@@ -54,9 +54,9 @@ extern void mc64ad_dist(int_t*, int_t*,
  *              of the diagonal entries of the permuted matrix is maximized.
  *        = 5 : Compute a row permutation of the matrix so that the product
  *              of the diagonal entries of the permuted matrix is maximized
- *              and vectors to scale the matrix so that the nonzero diagonal 
- *              entries of the permuted matrix are one in absolute value and 
- *              all the off-diagonal entries are less than or equal to one in 
+ *              and vectors to scale the matrix so that the nonzero diagonal
+ *              entries of the permuted matrix are one in absolute value and
+ *              all the off-diagonal entries are less than or equal to one in
  *              absolute value.
  *        Restriction: 1 <= JOB <= 5.
  *
@@ -83,10 +83,10 @@ extern void mc64ad_dist(int_t*, int_t*,
  *        original matrix is in row j of the permuted matrix.
  *
  * u      (output) double*, of size n
- *        If job = 5, the natural logarithms of the row scaling factors. 
+ *        If job = 5, the natural logarithms of the row scaling factors.
  *
  * v      (output) double*, of size n
- *        If job = 5, the natural logarithms of the column scaling factors. 
+ *        If job = 5, the natural logarithms of the column scaling factors.
  *        The scaled matrix B has entries b_ij = a_ij * exp(u_i + v_j).
  * </pre>
  */
@@ -94,7 +94,7 @@ extern void mc64ad_dist(int_t*, int_t*,
 int
 dldperm_dist(int_t job, int_t n, int_t nnz, int_t colptr[], int_t adjncy[],
 	double nzval[], int_t *perm, double u[], double v[])
-{ 
+{
     int_t i, liw, ldw, num;
     int_t *iw, icntl[10], info[10];
     double *dw;
@@ -107,7 +107,7 @@ dldperm_dist(int_t job, int_t n, int_t n
     if ( !(iw = intMalloc_dist(liw)) ) ABORT("Malloc fails for iw[]");
     ldw = 3*n + nnz;
     if ( !(dw = doubleMalloc_dist(ldw)) ) ABORT("Malloc fails for dw[]");
-	    
+
     /* Increment one to get 1-based indexing. */
     for (i = 0; i <= n; ++i) ++colptr[i];
     for (i = 0; i < nnz; ++i) ++adjncy[i];
@@ -116,8 +116,8 @@ dldperm_dist(int_t job, int_t n, int_t n
     PrintInt10("colptr", n+1, colptr);
     PrintInt10("adjncy", nnz, adjncy);
 #endif
-	
-    /* 
+
+    /*
      * NOTE:
      * =====
      *
diff -pruN 6.1.0+dfsg1-1/SRC/dlook_ahead_update.c 6.1.1+dfsg1-1/SRC/dlook_ahead_update.c
--- 6.1.0+dfsg1-1/SRC/dlook_ahead_update.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/dlook_ahead_update.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,9 +1,9 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
@@ -11,7 +11,7 @@ at the top-level directory.
 
 
 /************************************************************************/
-/*! @file 
+/*! @file
  * \brief Look-ahead update of the Schur complement.
  *
  * <pre>
@@ -22,7 +22,7 @@ at the top-level directory.
  * Modified:
  *  September 18, 2017
  *  June 1, 2018  add parallel AWPM pivoting; add back arrive_at_ublock()
- *   
+ *
  */
 
 #include <assert.h>  /* assertion doesn't work if NDEBUG is defined */
@@ -140,7 +140,7 @@ while (j < nub && perm_u[2 * j] <= k0 +
             luptr += temp_nbrow;  /* move to next block */
         }
 
-#ifdef _OPENMP        
+#ifdef _OPENMP
         int_t thread_id = omp_get_thread_num ();
 #else
         int_t thread_id = 0;
@@ -148,7 +148,7 @@ while (j < nub && perm_u[2 * j] <= k0 +
         double * tempv = bigV + ldt*ldt*thread_id;
 
         int *indirect_thread  = indirect + ldt * thread_id;
-        int *indirect2_thread = indirect2 + ldt * thread_id;        
+        int *indirect2_thread = indirect2 + ldt * thread_id;
         ib = lsub[lptr];        /* block number of L(i,k) */
         temp_nbrow = lsub[lptr + 1];    /* Number of full rows. */
 	/* assert (temp_nbrow <= nbrow); */
@@ -174,7 +174,7 @@ while (j < nub && perm_u[2 * j] <= k0 +
 	    tt_end = SuperLU_timer_();
 	    LookAheadGEMMTimer += tt_end - tt_start;
 	    tt_start = tt_end;
-	} 
+	}
 #endif
         /* Now scattering the output. */
         if (ib < jb) {    /* A(i,j) is in U. */
@@ -186,7 +186,7 @@ while (j < nub && perm_u[2 * j] <= k0 +
         } else {          /* A(i,j) is in L. */
             dscatter_l (ib, ljb, nsupc, iukp, xsup, klst, temp_nbrow, lptr,
                        temp_nbrow, usub, lsub, tempv,
-                       indirect_thread, indirect2_thread, 
+                       indirect_thread, indirect2_thread,
                        Lrowind_bc_ptr, Lnzval_bc_ptr, grid);
         }
 
@@ -229,7 +229,7 @@ while (j < nub && perm_u[2 * j] <= k0 +
         PDGSTRF2(options, kk0, kk, thresh, Glu_persist, grid, Llu,
                   U_diag_blk_send_req, tag_ub, stat, info);
 
-        pdgstrf2_timer += SuperLU_timer_() - tt1; 
+        pdgstrf2_timer += SuperLU_timer_() - tt1;
 
         /* stat->time7 += SuperLU_timer_() - ttt1; */
 
diff -pruN 6.1.0+dfsg1-1/SRC/dmemory_dist.c 6.1.1+dfsg1-1/SRC/dmemory_dist.c
--- 6.1.0+dfsg1-1/SRC/dmemory_dist.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/dmemory_dist.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,9 +1,9 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
@@ -30,7 +30,7 @@ extern SuperLU_LU_stack_t stack;
 void *duser_malloc_dist(int_t bytes, int_t which_end)
 {
     void *buf;
-    
+
     if ( SuperLU_StackFull(bytes) ) return (NULL);
 
     if ( which_end == HEAD ) {
@@ -40,7 +40,7 @@ void *duser_malloc_dist(int_t bytes, int
 	stack.top2 -= bytes;
 	buf = (char*) stack.array + stack.top2;
     }
-    
+
     stack.used += bytes;
     return buf;
 }
@@ -155,7 +155,7 @@ dallocateA_dist(int_t n, int_t nnz, doub
 double *doubleMalloc_dist(int_t n)
 {
     double *buf;
-    buf = (double *) SUPERLU_MALLOC( SUPERLU_MAX(1, n) * sizeof(double) ); 
+    buf = (double *) SUPERLU_MALLOC( SUPERLU_MAX(1, n) * sizeof(double) );
     return (buf);
 }
 
diff -pruN 6.1.0+dfsg1-1/SRC/dmyblas2_dist.c 6.1.1+dfsg1-1/SRC/dmyblas2_dist.c
--- 6.1.0+dfsg1-1/SRC/dmyblas2_dist.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/dmyblas2_dist.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,16 +1,16 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
 
-/*! @file 
+/*! @file
  * \brief Level 2 BLAS operations: solves and matvec, written in C
  *
  * <pre>
@@ -31,8 +31,8 @@ at the top-level directory.
 /*! \brief
  *
  * <pre>
- * Solves a dense UNIT lower triangular system. The unit lower 
- * triangular matrix is stored in a 2D array M(1:nrow,1:ncol). 
+ * Solves a dense UNIT lower triangular system. The unit lower
+ * triangular matrix is stored in a 2D array M(1:nrow,1:ncol).
  * The solution will be returned in the rhs vector.
  * </pre>
  */
@@ -78,13 +78,13 @@ void dlsolve ( int ldm, int ncol, double
       rhs[++firstcol] = x6;
       rhs[++firstcol] = x7;
       ++firstcol;
-    
+
       for (k = firstcol; k < ncol; k++)
 	rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++
 	                - x2 * *Mki2++ - x3 * *Mki3++
                         - x4 * *Mki4++ - x5 * *Mki5++
 			- x6 * *Mki6++ - x7 * *Mki7++;
- 
+
       M0 += 8 * ldm + 8;
     }
 
@@ -103,11 +103,11 @@ void dlsolve ( int ldm, int ncol, double
       rhs[++firstcol] = x2;
       rhs[++firstcol] = x3;
       ++firstcol;
-    
+
       for (k = firstcol; k < ncol; k++)
 	rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++
 	                - x2 * *Mki2++ - x3 * *Mki3++;
- 
+
       M0 += 4 * ldm + 4;
     }
 
@@ -120,12 +120,12 @@ void dlsolve ( int ldm, int ncol, double
 
       rhs[++firstcol] = x1;
       ++firstcol;
-    
+
       for (k = firstcol; k < ncol; k++)
 	rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++;
- 
+
     }
-    return;    
+    return;
 }
 
 /*! \brief
@@ -153,7 +153,7 @@ dusolve (
 
 	xj = rhs[jcol] / M[jcol + jcol*ldm]; 		/* M(jcol, jcol) */
 	rhs[jcol] = xj;
-	
+
 	for (irow = 0; irow < jcol; irow++)
 	    rhs[irow] -= xj * M[irow + jcol*ldm];	/* M(irow, jcol) */
 
@@ -173,7 +173,7 @@ dusolve (
  */
 void dmatvec (
 	int ldm,	/* in -- leading dimension of M */
-	int nrow,	/* in */ 
+	int nrow,	/* in */
 	int ncol,	/* in */
 	double *M,	/* in */
 	double *vec,	/* in */
@@ -201,15 +201,15 @@ void dmatvec (
 	vi0 = vec[firstcol++];
 	vi1 = vec[firstcol++];
 	vi2 = vec[firstcol++];
-	vi3 = vec[firstcol++];	
+	vi3 = vec[firstcol++];
 	vi4 = vec[firstcol++];
 	vi5 = vec[firstcol++];
 	vi6 = vec[firstcol++];
-	vi7 = vec[firstcol++];	
+	vi7 = vec[firstcol++];
 
-	for (k = 0; k < nrow; k++) 
+	for (k = 0; k < nrow; k++)
 	    Mxvec[k] += vi0 * *Mki0++ + vi1 * *Mki1++
-		      + vi2 * *Mki2++ + vi3 * *Mki3++ 
+		      + vi2 * *Mki2++ + vi3 * *Mki3++
 		      + vi4 * *Mki4++ + vi5 * *Mki5++
 		      + vi6 * *Mki6++ + vi7 * *Mki7++;
 
@@ -226,8 +226,8 @@ void dmatvec (
 	vi0 = vec[firstcol++];
 	vi1 = vec[firstcol++];
 	vi2 = vec[firstcol++];
-	vi3 = vec[firstcol++];	
-	for (k = 0; k < nrow; k++) 
+	vi3 = vec[firstcol++];
+	for (k = 0; k < nrow; k++)
 	    Mxvec[k] += vi0 * *Mki0++ + vi1 * *Mki1++
 		      + vi2 * *Mki2++ + vi3 * *Mki3++ ;
 
@@ -243,6 +243,6 @@ void dmatvec (
 
 	M0 += ldm;
     }
-    return;	
+    return;
 }
 
diff -pruN 6.1.0+dfsg1-1/SRC/dreadhb.c 6.1.1+dfsg1-1/SRC/dreadhb.c
--- 6.1.0+dfsg1-1/SRC/dreadhb.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/dreadhb.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,16 +1,16 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
 
-/*! @file 
+/*! @file
  * \brief Read a DOUBLE PRECISION matrix stored in Harwell-Boeing format
  *
  * <pre>
@@ -39,67 +39,67 @@ static int ParseFloatFormat(char *, int_
  * <pre>
  * Purpose
  * =======
- * 
- * Read a DOUBLE PRECISION matrix stored in Harwell-Boeing format 
+ *
+ * Read a DOUBLE PRECISION matrix stored in Harwell-Boeing format
  * as described below.
- * 
- * Line 1 (A72,A8) 
- *  	Col. 1 - 72   Title (TITLE) 
- *	Col. 73 - 80  Key (KEY) 
- * 
- * Line 2 (5I14) 
- * 	Col. 1 - 14   Total number of lines excluding header (TOTCRD) 
- * 	Col. 15 - 28  Number of lines for pointers (PTRCRD) 
- * 	Col. 29 - 42  Number of lines for row (or variable) indices (INDCRD) 
- * 	Col. 43 - 56  Number of lines for numerical values (VALCRD) 
- *	Col. 57 - 70  Number of lines for right-hand sides (RHSCRD) 
- *                    (including starting guesses and solution vectors 
- *		       if present) 
- *           	      (zero indicates no right-hand side data is present) 
- *
- * Line 3 (A3, 11X, 4I14) 
- *   	Col. 1 - 3    Matrix type (see below) (MXTYPE) 
- * 	Col. 15 - 28  Number of rows (or variables) (NROW) 
- * 	Col. 29 - 42  Number of columns (or elements) (NCOL) 
- *	Col. 43 - 56  Number of row (or variable) indices (NNZERO) 
- *	              (equal to number of entries for assembled matrices) 
- * 	Col. 57 - 70  Number of elemental matrix entries (NELTVL) 
- *	              (zero in the case of assembled matrices) 
- * Line 4 (2A16, 2A20) 
- * 	Col. 1 - 16   Format for pointers (PTRFMT) 
- *	Col. 17 - 32  Format for row (or variable) indices (INDFMT) 
- *	Col. 33 - 52  Format for numerical values of coefficient matrix (VALFMT) 
- * 	Col. 53 - 72 Format for numerical values of right-hand sides (RHSFMT) 
- *
- * Line 5 (A3, 11X, 2I14) Only present if there are right-hand sides present 
- *    	Col. 1 	      Right-hand side type: 
- *	         	  F for full storage or M for same format as matrix 
- *    	Col. 2        G if a starting vector(s) (Guess) is supplied. (RHSTYP) 
- *    	Col. 3        X if an exact solution vector(s) is supplied. 
- *	Col. 15 - 28  Number of right-hand sides (NRHS) 
- *	Col. 29 - 42  Number of row indices (NRHSIX) 
- *          	      (ignored in case of unassembled matrices) 
- *
- * The three character type field on line 3 describes the matrix type. 
- * The following table lists the permitted values for each of the three 
- * characters. As an example of the type field, RSA denotes that the matrix 
- * is real, symmetric, and assembled. 
- *
- * First Character: 
- *	R Real matrix 
- *	C Complex matrix 
- *	P Pattern only (no numerical values supplied) 
- *
- * Second Character: 
- *	S Symmetric 
- *	U Unsymmetric 
- *	H Hermitian 
- *	Z Skew symmetric 
- *	R Rectangular 
- *
- * Third Character: 
- *	A Assembled 
- *	E Elemental matrices (unassembled) 
+ *
+ * Line 1 (A72,A8)
+ *  	Col. 1 - 72   Title (TITLE)
+ *	Col. 73 - 80  Key (KEY)
+ *
+ * Line 2 (5I14)
+ * 	Col. 1 - 14   Total number of lines excluding header (TOTCRD)
+ * 	Col. 15 - 28  Number of lines for pointers (PTRCRD)
+ * 	Col. 29 - 42  Number of lines for row (or variable) indices (INDCRD)
+ * 	Col. 43 - 56  Number of lines for numerical values (VALCRD)
+ *	Col. 57 - 70  Number of lines for right-hand sides (RHSCRD)
+ *                    (including starting guesses and solution vectors
+ *		       if present)
+ *           	      (zero indicates no right-hand side data is present)
+ *
+ * Line 3 (A3, 11X, 4I14)
+ *   	Col. 1 - 3    Matrix type (see below) (MXTYPE)
+ * 	Col. 15 - 28  Number of rows (or variables) (NROW)
+ * 	Col. 29 - 42  Number of columns (or elements) (NCOL)
+ *	Col. 43 - 56  Number of row (or variable) indices (NNZERO)
+ *	              (equal to number of entries for assembled matrices)
+ * 	Col. 57 - 70  Number of elemental matrix entries (NELTVL)
+ *	              (zero in the case of assembled matrices)
+ * Line 4 (2A16, 2A20)
+ * 	Col. 1 - 16   Format for pointers (PTRFMT)
+ *	Col. 17 - 32  Format for row (or variable) indices (INDFMT)
+ *	Col. 33 - 52  Format for numerical values of coefficient matrix (VALFMT)
+ * 	Col. 53 - 72 Format for numerical values of right-hand sides (RHSFMT)
+ *
+ * Line 5 (A3, 11X, 2I14) Only present if there are right-hand sides present
+ *    	Col. 1 	      Right-hand side type:
+ *	         	  F for full storage or M for same format as matrix
+ *    	Col. 2        G if a starting vector(s) (Guess) is supplied. (RHSTYP)
+ *    	Col. 3        X if an exact solution vector(s) is supplied.
+ *	Col. 15 - 28  Number of right-hand sides (NRHS)
+ *	Col. 29 - 42  Number of row indices (NRHSIX)
+ *          	      (ignored in case of unassembled matrices)
+ *
+ * The three character type field on line 3 describes the matrix type.
+ * The following table lists the permitted values for each of the three
+ * characters. As an example of the type field, RSA denotes that the matrix
+ * is real, symmetric, and assembled.
+ *
+ * First Character:
+ *	R Real matrix
+ *	C Complex matrix
+ *	P Pattern only (no numerical values supplied)
+ *
+ * Second Character:
+ *	S Symmetric
+ *	U Unsymmetric
+ *	H Hermitian
+ *	Z Skew symmetric
+ *	R Rectangular
+ *
+ * Third Character:
+ *	A Assembled
+ *	E Elemental matrices (unassembled)
  * </pre>
  */
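As a concrete illustration of the layout above (values are hypothetical): for a 4 x 4 real symmetric assembled matrix with 9 stored entries and no right-hand sides, line 3 carries MXTYPE = "RSA" in columns 1-3 and, right-justified in successive 14-column fields, NROW = 4, NCOL = 4, NNZERO = 9 and NELTVL = 0. If each of the pointer, index and value sections then fits on a single card, line 2 reads TOTCRD = 3, PTRCRD = 1, INDCRD = 1, VALCRD = 1, RHSCRD = 0. This fixed 14-column layout is what the "%14c" conversions in dreadhb_dist() below rely on.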
 
@@ -136,12 +136,12 @@ dreadhb_dist(int iam, FILE *fp, int_t *n
 #if ( DEBUGlevel>=1 )
     if ( !iam ) printf("Matrix type %s\n", type);
 #endif
-    
-    fscanf(fp, "%14c", buf); *nrow = atoi(buf); 
-    fscanf(fp, "%14c", buf); *ncol = atoi(buf); 
-    fscanf(fp, "%14c", buf); *nonz = atoi(buf); 
-    fscanf(fp, "%14c", buf); tmp = atoi(buf);   
-    
+
+    fscanf(fp, "%14c", buf); *nrow = atoi(buf);
+    fscanf(fp, "%14c", buf); *ncol = atoi(buf);
+    fscanf(fp, "%14c", buf); *nonz = atoi(buf);
+    fscanf(fp, "%14c", buf); tmp = atoi(buf);
+
     if (tmp != 0)
 	if ( !iam ) printf("This is not an assembled matrix!\n");
     if (*nrow != *ncol)
@@ -161,7 +161,7 @@ dreadhb_dist(int iam, FILE *fp, int_t *n
     fscanf(fp, "%20c", buf);
     DumpLine(fp);
 
-    /* Line 5: right-hand side */    
+    /* Line 5: right-hand side */
     if ( rhscrd ) DumpLine(fp); /* skip RHSFMT */
 
 #if ( DEBUGlevel>=1 )
@@ -172,7 +172,7 @@ dreadhb_dist(int iam, FILE *fp, int_t *n
 	printf("valnum " IFMT ", valsize " IFMT "\n", valnum, valsize);
     }
 #endif
-    
+
     ReadVector(fp, *ncol+1, *colptr, colnum, colsize);
 #if ( DEBUGlevel>=1 )
     if ( !iam )	printf("read colptr[" IFMT "] = " IFMT "\n", *ncol, (*colptr)[*ncol]);
@@ -212,20 +212,20 @@ static int ParseIntFormat(char *buf, int
 
     tmp = buf;
     while (*tmp++ != '(') ;
-    *num = atoi(tmp); 
+    *num = atoi(tmp);
     while (*tmp != 'I' && *tmp != 'i') ++tmp;
     ++tmp;
-    *size = atoi(tmp); 
+    *size = atoi(tmp);
     return 0;
 }
 
 static int ParseFloatFormat(char *buf, int_t *num, int_t *size)
 {
     char *tmp, *period;
-    
+
     tmp = buf;
     while (*tmp++ != '(') ;
-    *num = atoi(tmp); 
+    *num = atoi(tmp);
     while (*tmp != 'E' && *tmp != 'e' && *tmp != 'D' && *tmp != 'd'
 	   && *tmp != 'F' && *tmp != 'f') {
        /* May find kP before nE/nD/nF, like (1P6F13.6). In this case the
@@ -241,7 +241,7 @@ static int ParseFloatFormat(char *buf, i
     period = tmp;
     while (*period != '.' && *period != ')') ++period ;
     *period = '\0';
-    *size = atoi(tmp); 
+    *size = atoi(tmp);
 
     return 0;
 }
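A brief worked illustration of the two parsers above (the format strings are hypothetical, and the handling of a leading scale factor sits outside this hunk): for PTRFMT = "(13I6)", ParseIntFormat() takes num = 13 from the digits after '(', skips to the 'I' and takes size = 6. For a value format such as VALFMT = "(1P5E16.8)", ParseFloatFormat() presumably skips the "1P" scale factor and ends up with num = 5 and size = 16; the ".8" is cut off before the final atoi() by the '\0' written over the period.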
@@ -251,14 +251,14 @@ ReadVector(FILE *fp, int_t n, int_t *whe
 {
     register int_t i, j, item;
     char tmp, buf[100];
-    
+
     i = 0;
     while (i < n) {
 	fgets(buf, 100, fp);    /* read a line at a time */
 	for (j=0; j<perline && i<n; j++) {
 	    tmp = buf[(j+1)*persize];     /* save the char at that place */
 	    buf[(j+1)*persize] = 0;       /* null terminate */
-	    item = atoi(&buf[j*persize]); 
+	    item = atoi(&buf[j*persize]);
 	    buf[(j+1)*persize] = tmp;     /* recover the char at that place */
 	    where[i++] = item - 1;
 	}
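The fixed-width parse in the loop above can be exercised in isolation. A minimal stand-alone sketch (the field widths are hypothetical and the snippet is not part of the library) showing the same NUL-terminate / atoi / restore idiom and the shift from 1-based to 0-based indices:

    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
        char buf[] = "    1    3    7";     /* one line holding three 5-char fields */
        int persize = 5, perline = 3, where[3];

        for (int j = 0; j < perline; j++) {
            char save = buf[(j + 1) * persize];      /* char just past this field   */
            buf[(j + 1) * persize] = '\0';           /* temporarily terminate field */
            where[j] = atoi(&buf[j * persize]) - 1;  /* 1-based -> 0-based index    */
            buf[(j + 1) * persize] = save;           /* restore the saved char      */
        }
        printf("%d %d %d\n", where[0], where[1], where[2]);   /* prints: 0 2 6 */
        return 0;
    }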
@@ -266,12 +266,12 @@ ReadVector(FILE *fp, int_t n, int_t *whe
 }
 
 void
-dReadValues(FILE *fp, int_t n, double *destination, 
+dReadValues(FILE *fp, int_t n, double *destination,
              int_t perline, int_t persize)
 {
     register int_t i, j, k, s;
     char tmp, buf[100];
-    
+
     i = 0;
     while (i < n) {
 	fgets(buf, 100, fp);    /* read a line at a time */
@@ -343,7 +343,7 @@ FormFullA(int_t n, int_t *nonz, double *
 	ABORT("SUPERLU_MALLOC fails for a_rowind[]");
     if ( !(a_val = (double*) SUPERLU_MALLOC( new_nnz * sizeof(double)) ) )
 	ABORT("SUPERLU_MALLOC fails for a_val[]");
-    
+
     a_colptr[0] = 0;
     k = 0;
     for (j = 0; j < n; ++j) {
@@ -368,7 +368,7 @@ FormFullA(int_t n, int_t *nonz, double *
 #endif
 	++k;
       }
-      
+
       a_colptr[j+1] = k;
     }
 
diff -pruN 6.1.0+dfsg1-1/SRC/dreadMM.c 6.1.1+dfsg1-1/SRC/dreadMM.c
--- 6.1.0+dfsg1-1/SRC/dreadMM.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/dreadMM.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,9 +1,9 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
@@ -11,8 +11,8 @@ at the top-level directory.
 
 
 
-/*! @file 
- * \brief 
+/*! @file
+ * \brief
  * Contributed by Francois-Henry Rouet.
  *
  */
@@ -55,7 +55,7 @@ dreadMM_dist(FILE *fp, int_t *m, int_t *
      *    Triplet in the rest of lines: row    col    value
      */
 
-     /* 1/ read header */ 
+     /* 1/ read header */
      cs = fgets(line,512,fp);
      for (p=line; *p!='\0'; *p=tolower(*p),p++);
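For reference, a minimal Matrix Market coordinate file of the kind dreadMM_dist() expects (the entries are hypothetical); the banner line is folded to lower case by the loop above before being compared against "%%matrixmarket", and the size line gives rows, columns and stored entries, followed by one "row col value" triplet per line:

    %%MatrixMarket matrix coordinate real general
    3 3 4
    1 1 2.0
    2 2 3.0
    3 1 -1.0
    3 3 5.0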
 
@@ -63,7 +63,7 @@ dreadMM_dist(FILE *fp, int_t *m, int_t *
        printf("Invalid header (first line does not contain 5 tokens)\n");
        exit;
      }
- 
+
      if(strcmp(banner,"%%matrixmarket")) {
        printf("Invalid header (first token is not \"%%%%MatrixMarket\")\n");
        exit(-1);
@@ -165,7 +165,7 @@ dreadMM_dist(FILE *fp, int_t *m, int_t *
 
 	if (row[nz] < 0 || row[nz] >= *m || col[nz] < 0 || col[nz] >= *n
 	    /*|| val[nz] == 0.*/) {
-	    fprintf(stderr, "nz " IFMT ", (" IFMT ", " IFMT ") = %e out of bound, removed\n", 
+	    fprintf(stderr, "nz " IFMT ", (" IFMT ", " IFMT ") = %e out of bound, removed\n",
 		    nz, row[nz], col[nz], val[nz]);
 	    exit(-1);
 	} else {
@@ -178,7 +178,7 @@ dreadMM_dist(FILE *fp, int_t *m, int_t *
 	          val[nz] = val[nz-1];
 	          ++xa[col[nz]];
 	        }
-            }	
+            }
 	    ++nz;
 	}
     }
@@ -188,7 +188,7 @@ dreadMM_dist(FILE *fp, int_t *m, int_t *
       printf("new_nonz after symmetric expansion:\t" IFMT "\n", *nonz);
       fflush(stdout);
     }
-    
+
 
     /* Initialize the array of column pointers */
     k = 0;
@@ -199,7 +199,7 @@ dreadMM_dist(FILE *fp, int_t *m, int_t *
 	jsize = xa[j];
 	xa[j] = k;
     }
-    
+
     /* Copy the triplets into the column oriented storage */
     for (nz = 0; nz < *nonz; ++nz) {
 	j = col[nz];
diff -pruN 6.1.0+dfsg1-1/SRC/dreadrb.c 6.1.1+dfsg1-1/SRC/dreadrb.c
--- 6.1.0+dfsg1-1/SRC/dreadrb.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/dreadrb.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,9 +1,9 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
@@ -23,7 +23,7 @@ at the top-level directory.
  * Purpose
  * =======
  *
- * Read a DOUBLE PRECISION matrix stored in Rutherford-Boeing format 
+ * Read a DOUBLE PRECISION matrix stored in Rutherford-Boeing format
  * as described below.
  *
  * Line 1 (A72, A8)
@@ -144,7 +144,7 @@ static int ReadVector(FILE *fp, int_t n,
         for (j=0; j<perline && i<n; j++) {
             tmp = buf[(j+1)*persize];     /* save the char at that place */
             buf[(j+1)*persize] = 0;       /* null terminate */
-            item = atoi(&buf[j*persize]); 
+            item = atoi(&buf[j*persize]);
             buf[(j+1)*persize] = tmp;     /* recover the char at that place */
             where[i++] = item - 1;
         }
@@ -234,7 +234,7 @@ FormFullA(int_t n, int_t *nonz, double *
 	ABORT("SUPERLU_MALLOC fails for a_rowind[]");
     if ( !(a_val = (double*) SUPERLU_MALLOC( new_nnz * sizeof(double)) ) )
 	ABORT("SUPERLU_MALLOC fails for a_val[]");
-    
+
     a_colptr[0] = 0;
     k = 0;
     for (j = 0; j < n; ++j) {
@@ -251,7 +251,7 @@ FormFullA(int_t n, int_t *nonz, double *
 	a_val[k] = al_val[i];
 	++k;
       }
-      
+
       a_colptr[j+1] = k;
     }
 
diff -pruN 6.1.0+dfsg1-1/SRC/dreadtriple.c 6.1.1+dfsg1-1/SRC/dreadtriple.c
--- 6.1.0+dfsg1-1/SRC/dreadtriple.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/dreadtriple.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,17 +1,17 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
 
-/*! @file 
- * \brief 
+/*! @file
+ * \brief
  *
  */
 #include <stdio.h>
@@ -39,7 +39,7 @@ dreadtriple_dist(FILE *fp, int_t *m, int
     double *a, *val;
     int_t    *asub, *xa, *row, *col;
     int_t    zero_base = 0;
-    
+
     /* 	File format:
      *    First line:  #rows    #non-zero
      *    Triplet in the rest of lines:
@@ -96,7 +96,7 @@ dreadtriple_dist(FILE *fp, int_t *m, int
 
 	if (row[nz] < 0 || row[nz] >= *m || col[nz] < 0 || col[nz] >= *n
 	    /*|| val[nz] == 0.*/) {
-	    fprintf(stderr, "nz " IFMT ", (" IFMT ", " IFMT ") = %e out of bound, removed\n", 
+	    fprintf(stderr, "nz " IFMT ", (" IFMT ", " IFMT ") = %e out of bound, removed\n",
 		    nz, row[nz], col[nz], val[nz]);
 	    exit(-1);
 	} else {
@@ -109,7 +109,7 @@ dreadtriple_dist(FILE *fp, int_t *m, int
 	      val[nz] = val[nz-1];
 	      ++xa[col[nz]];
 	    }
-#endif	
+#endif
 	    ++nz;
 	}
     }
@@ -118,7 +118,7 @@ dreadtriple_dist(FILE *fp, int_t *m, int
 #ifdef EXPAND_SYM
     printf("new_nonz after symmetric expansion:\t%d\n", *nonz);
 #endif
-    
+
 
     /* Initialize the array of column pointers */
     k = 0;
@@ -129,7 +129,7 @@ dreadtriple_dist(FILE *fp, int_t *m, int
 	jsize = xa[j];
 	xa[j] = k;
     }
-    
+
     /* Copy the triplets into the column oriented storage */
     for (nz = 0; nz < *nonz; ++nz) {
 	j = col[nz];
diff -pruN 6.1.0+dfsg1-1/SRC/dreadtriple_noheader.c 6.1.1+dfsg1-1/SRC/dreadtriple_noheader.c
--- 6.1.0+dfsg1-1/SRC/dreadtriple_noheader.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/dreadtriple_noheader.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,17 +1,17 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
 
-/*! @file 
- * \brief 
+/*! @file
+ * \brief
  *
  */
 #include <stdio.h>
@@ -67,7 +67,7 @@ dreadtriple_noheader(FILE *fp, int_t *m,
         ret_val = fscanf(fp, "%d%d%lf\n", &i, &j, &vali);
 #endif
     }
-    
+
     if ( minn == 0 ) { /* zero-based indexing */
 	zero_base = 1;
 	++(*n);
@@ -118,7 +118,7 @@ dreadtriple_noheader(FILE *fp, int_t *m,
 
 	if (row[nz] < 0 || row[nz] >= *m || col[nz] < 0 || col[nz] >= *n
 	    /*|| val[nz] == 0.*/) {
-	    fprintf(stderr, "nz %d, (%d, %d) = %e out of bound, removed\n", 
+	    fprintf(stderr, "nz %d, (%d, %d) = %e out of bound, removed\n",
 		    nz, row[nz], col[nz], val[nz]);
 	    exit(-1);
 	} else {
@@ -131,7 +131,7 @@ dreadtriple_noheader(FILE *fp, int_t *m,
 	      val[nz] = val[nz-1];
 	      ++xa[col[nz]];
 	    }
-#endif	
+#endif
 	    ++nz;
 	}
     }
@@ -140,7 +140,7 @@ dreadtriple_noheader(FILE *fp, int_t *m,
 #ifdef EXPAND_SYM
     printf("new_nonz after symmetric expansion:\t%d\n", *nonz);
 #endif
-    
+
 
     /* Initialize the array of column pointers */
     k = 0;
@@ -151,7 +151,7 @@ dreadtriple_noheader(FILE *fp, int_t *m,
 	jsize = xa[j];
 	xa[j] = k;
     }
-    
+
     /* Copy the triplets into the column oriented storage */
     for (nz = 0; nz < *nonz; ++nz) {
 	j = col[nz];
diff -pruN 6.1.0+dfsg1-1/SRC/dscatter.c 6.1.1+dfsg1-1/SRC/dscatter.c
--- 6.1.0+dfsg1-1/SRC/dscatter.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/dscatter.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,26 +1,26 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
 
-/*! @file 
+/*! @file
  * \brief Scatter the computed blocks into LU destination.
  *
  * <pre>
- * -- Distributed SuperLU routine (version 5.2) --
+ * -- Distributed SuperLU routine (version 6.1.1) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * October 1, 2014
  *
- * Modified: 
+ * Modified:
  *   September 18, 2017, enable SIMD vectorized scatter operation.
- *   
+ *
  */
 #include <math.h>
 #include "superlu_ddefs.h"
@@ -125,7 +125,7 @@ dscatter_l (
            int_t ** Lrowind_bc_ptr, double **Lnzval_bc_ptr,
            gridinfo_t * grid)
 {
-    
+
     int_t rel, i, segsize, jj;
     double *nzval;
     int_t *index = Lrowind_bc_ptr[ljb];
@@ -133,23 +133,23 @@ dscatter_l (
     int_t lptrj = BC_HEADER;
     int_t luptrj = 0;
     int_t ijb = index[lptrj];
-    
+
     while (ijb != ib)  /* Search for destination block L(i,j) */
     {
         luptrj += index[lptrj + 1];
         lptrj += LB_DESCRIPTOR + index[lptrj + 1];
         ijb = index[lptrj];
     }
-    
+
     /*
      * Build indirect table. This is needed because the indices are not sorted
      * in the L blocks.
      */
     int_t fnz = FstBlockC (ib);
-    int_t dest_nbrow; 
+    int_t dest_nbrow;
     lptrj += LB_DESCRIPTOR;
     dest_nbrow=index[lptrj - 1];
-    
+
 #if (_OPENMP>=201307)
 #pragma omp simd
 #endif
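The SIMD loop opened by the pragma above (its body lies outside this hunk) presumably records, for every row held by the destination block, that row's local position, keyed by its offset from the block's first row fnz. A hypothetical illustration: if destination block L(i,j) stores global rows fnz+0, fnz+2 and fnz+5, the table becomes indirect_thread[0] = 0, indirect_thread[2] = 1, indirect_thread[5] = 2, so the pass over the source rows in the next hunk can turn each lsub[] entry into its destination row via indirect2[i] = indirect_thread[rel].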
@@ -165,7 +165,7 @@ dscatter_l (
     /* can be precalculated? */
     for (i = 0; i < temp_nbrow; ++i) { /* Source index is a subset of dest. */
         rel = lsub[lptr + i] - fnz;
-        indirect2[i] =indirect_thread[rel]; 
+        indirect2[i] =indirect_thread[rel];
     }
 
     nzval = Lnzval_bc_ptr[ljb] + luptrj; /* Destination block L(i,j) */
@@ -185,7 +185,7 @@ dscatter_l (
         }
         nzval += ldv;
     }
-    
+
 } /* dscatter_l */
 
 
@@ -299,12 +299,12 @@ gemm_division_cpu_gpu(
     /*input */
     int nbrow,              /*number of row in A matrix */
     int ldu,                /*number of k in dgemm */
-    int nstreams, 
+    int nstreams,
     int* full_u_cols,       /*array containing prefix sum of work load */
     int num_blks            /*Number of work load */
 )
 {
-    int Ngem = sp_ienv(7);  /*get_mnk_dgemm ();*/
+    int Ngem = sp_ienv_dist(7);  /*get_mnk_dgemm ();*/
     int min_gpu_col = get_cublas_nb ();
 
     // Ngem = 1000000000;
@@ -312,7 +312,7 @@ gemm_division_cpu_gpu(
        cpu is to gpu dgemm should be ideally 0:1 ratios to hide the total cost
        However since there is gpu latency of around 20,000 ns implying about
        200000 floating point calculation be done in that time so ~200,000/(2*nbrow*ldu)
-       should be done in cpu to hide the latency; we Ngem =200,000/2 
+       should be done on the CPU to hide the latency; hence we set Ngem = 200,000/2
      */
     int i, j;
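A worked instance of the heuristic in the comment above (the numbers are illustrative only): taking roughly 20,000 ns of GPU launch latency and the assumed ~200,000 floating-point operations that fit in that window, and noting that one column of the update costs about 2*nbrow*ldu flops, nbrow = 500 and ldu = 50 give 200,000 / (2*500*50) = 4; at that size only a handful of columns would be kept on the CPU while the GPU streams handle the rest.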
 
@@ -431,7 +431,7 @@ gemm_division_new (int * num_streams_use
                    int num_blks  /*Number of work load */
     )
 {
-    int Ngem = sp_ienv(7); /*get_mnk_dgemm ();*/
+    int Ngem = sp_ienv_dist(7); /*get_mnk_dgemm ();*/
     int min_gpu_col = get_cublas_nb ();
 
     // Ngem = 1000000000;
@@ -439,7 +439,7 @@ gemm_division_new (int * num_streams_use
        cpu is to gpu dgemm should be ideally 0:1 ratios to hide the total cost
        However since there is gpu latency of around 20,000 ns implying about
        200000 floating point calculation be done in that time so ~200,000/(2*nbrow*ldu)
-       should be done in cpu to hide the latency; we Ngem =200,000/2 
+       should be done in cpu to hide the latency; we Ngem =200,000/2
      */
     int_t i, j;
 
diff -pruN 6.1.0+dfsg1-1/SRC/dSchCompUdt-2Ddynamic.c 6.1.1+dfsg1-1/SRC/dSchCompUdt-2Ddynamic.c
--- 6.1.0+dfsg1-1/SRC/dSchCompUdt-2Ddynamic.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/dSchCompUdt-2Ddynamic.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,16 +1,16 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
 
-/*! @file 
+/*! @file
  * \brief This file contains the main loop of pdgstrf which involves rank k
  *        update of the Schur complement.
  *        Uses 2D partitioning for the scatter phase.
@@ -22,7 +22,7 @@ at the top-level directory.
  *
  * Modified:
  *   September 14, 2017
- *   - First gather U-panel, then depending on "ldu" (excluding leading zeros), 
+ *   - First gather U-panel, then depending on "ldu" (excluding leading zeros),
  *     gather only trailing columns of the L-panel corresponding to the nonzero
  *     of U-rows.
  *   - Padding zeros for nice dimensions of GEMM.
@@ -30,9 +30,9 @@ at the top-level directory.
  *  June 1, 2018  add parallel AWPM pivoting; add back arrive_at_ublock()
  */
 
-#define SCHEDULE_STRATEGY guided 
+#define SCHEDULE_STRATEGY guided
 
-/* 
+/*
  * Buffers:
  *     [ lookAhead_L_buff | Remain_L_buff ] : stores the gathered L-panel
  *                                            (A matrix in C := A*B )
@@ -58,17 +58,17 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
      tt_start = SuperLU_timer_();
 
      /* Sherry -- can this loop be threaded?? */
-     /* Loop through all blocks in L(:,k) to set up pointers to the start 
+     /* Loop through all blocks in L(:,k) to set up pointers to the start
       * of each block in the data arrays.
       *   - lookAheadFullRow[i] := number of nonzero rows from block 0 to i
       *   - lookAheadStRow[i] := number of nonzero rows before block i
-      *   - lookAhead_lptr[i] := point to the start of block i in L's index[] 
+      *   - lookAhead_lptr[i] := point to the start of block i in L's index[]
       *   - (ditto Remain_Info[i])
       */
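     /* Hypothetical illustration (not from the source): if L(:,k) holds three
      * blocks of 4, 2 and 5 nonzero rows and all of them fall inside the
      * look-up window, the loop below yields
      *     lookAheadFullRow = {4, 6, 11}   (running totals of rows)
      *     lookAheadStRow   = {0, 4, 6}    (rows preceding each block)
      * so block i starts at source row lookAheadStRow[i] of L(:,k) and is
      * copied into rows lookAheadFullRow[i-1] .. lookAheadFullRow[i]-1
      * (0 .. 3 for the first block) of the gathered look-ahead panel. */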
      for (int i = 0; i < nlb; ++i) {
 	 ib = lsub[lptr];            /* Block number of L(i,k). */
 	 temp_nbrow = lsub[lptr+1];  /* Number of full rows. */
-        
+
 	 int look_up_flag = 1; /* assume ib is outside look-up window */
 	 for (int j = k0+1; j < SUPERLU_MIN (k0 + num_look_aheads+2, nsupers );
 	      ++j) {
@@ -77,35 +77,35 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
                      break;            /* Sherry -- can exit the loop?? */
                  }
 	 }
-	 
+
 	 if ( look_up_flag == 0 ) { /* ib is within look-up window */
 	     if (lookAheadBlk==0) {
 		 lookAheadFullRow[lookAheadBlk] = temp_nbrow;
 	     } else {
-		 lookAheadFullRow[lookAheadBlk] = 
-		     temp_nbrow + lookAheadFullRow[lookAheadBlk-1];   
+		 lookAheadFullRow[lookAheadBlk] =
+		     temp_nbrow + lookAheadFullRow[lookAheadBlk-1];
 	     }
 	     lookAheadStRow[lookAheadBlk] = cum_nrow;
 	     lookAhead_lptr[lookAheadBlk] = lptr;
-	     lookAhead_ib[lookAheadBlk] = ib; 
+	     lookAhead_ib[lookAheadBlk] = ib;
 	     lookAheadBlk++;
 	 } else { /* ib is not in look-up window */
 	     if ( RemainBlk==0 ) {
 		 Remain_info[RemainBlk].FullRow = temp_nbrow;
 	     } else {
-		 Remain_info[RemainBlk].FullRow = 
-		     temp_nbrow + Remain_info[RemainBlk-1].FullRow;   
+		 Remain_info[RemainBlk].FullRow =
+		     temp_nbrow + Remain_info[RemainBlk-1].FullRow;
 	     }
              RemainStRow[RemainBlk] = cum_nrow;
              // Remain_lptr[RemainBlk] = lptr;
 	     Remain_info[RemainBlk].lptr = lptr;
-	     // Remain_ib[RemainBlk] = ib; 
-	     Remain_info[RemainBlk].ib = ib; 
+	     // Remain_ib[RemainBlk] = ib;
+	     Remain_info[RemainBlk].ib = ib;
 	     RemainBlk++;
 	 }
-	 
+
          cum_nrow += temp_nbrow;
-	 
+
 	 lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
 	 lptr += temp_nbrow;     /* Move to next block */
 	 luptr += temp_nbrow;
@@ -140,7 +140,7 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
 	 ncols = 0; /* Total number of nonzero columns in U(k,:) */
 	 int temp_ncols = 0;
 
-	 /* jj0 contains the look-ahead window that was updated in 
+	 /* jj0 contains the look-ahead window that was updated in
 	    dlook_ahead_update.c. Now the search can continue from that point,
 	    not to start from block 0. */
 #if 0 // Sherry comment out 5/21/208
@@ -151,8 +151,8 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
 #endif
 
 	 /* if ( iam==0 ) printf("--- k0 %d, k %d, jj0 %d, nub %d\n", k0, k, jj0, nub);*/
-	     
-         /* 
+
+         /*
 	  * Loop through all blocks in U(k,:) to set up pointers to the start
           * of each block in the data arrays, store them in Ublock_info[j]
           * for block U(k,j).
@@ -177,7 +177,7 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
 
 	     /* if ( iam==0 )
 		 printf("j %d: Ublock_info[j].iukp %d, Ublock_info[j].rukp %d,"
-			"Ublock_info[j].jb %d, nsupc %d\n", 
+			"Ublock_info[j].jb %d, nsupc %d\n",
 			j, Ublock_info[j].iukp, Ublock_info[j].rukp,
 			Ublock_info[j].jb, nsupc); */
 
@@ -208,7 +208,7 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
 	 for ( j = jj0+1; j < nub; ++j) {
 	     Ublock_info[j].full_u_cols += Ublock_info[j-1].full_u_cols;
 	 }
-            
+
 	 /* Padding zeros to make {m,n,k} multiple of vector length. */
 	 jj = 8; //n;
 	 if (gemm_padding > 0 && Rnbrow > jj && ncols > jj && ldu > jj) {
@@ -217,11 +217,11 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
 	     //gemm_n_pad = ncols;
 	     //gemm_k_pad = ldu + (ldu % GEMM_PADLEN);
 	     gemm_k_pad = ldu;
-	     
+
 	     for (i = Rnbrow; i < gemm_m_pad; ++i)  // padding A matrix
 		 for (j = 0; j < gemm_k_pad; ++j)
 		     Remain_L_buff[i + j*gemm_m_pad] = zero;
-	     for (i = 0; i < Rnbrow; ++i)         
+	     for (i = 0; i < Rnbrow; ++i)
 		 for (j = ldu; j < gemm_k_pad; ++j)
 		     Remain_L_buff[i + j*gemm_m_pad] = zero;
 	     for (i = ldu; i < gemm_k_pad; ++i)     // padding B matrix
@@ -235,7 +235,7 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
 	     gemm_n_pad = ncols;
 	     gemm_k_pad = ldu;
 	 }
-     
+
 	 tempu = bigU; /* buffer the entire row block U(k,:) */
 
          /* Gather U(k,:) into buffer bigU[] to prepare for GEMM */
@@ -261,7 +261,7 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
 	    jb = Ublock_info[j].jb;
 	    nsupc = SuperSize (jb );
 #endif
-            /* Copy from U(k,j) to tempu[], padding zeros.  */            
+            /* Copy from U(k,j) to tempu[], padding zeros.  */
             for (jj = iukp; jj < iukp+nsupc; ++jj) {
                 segsize = klst - usub[jj];
                 if ( segsize ) {
@@ -271,7 +271,7 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
 #if (_OPENMP>=201307)
 #pragma omp simd
 #endif
-		    for (i = 0; i < segsize; ++i) 
+		    for (i = 0; i < segsize; ++i)
                     	tempu[i+lead_zero] = uval[rukp+i];
                     rukp += segsize;
                     tempu += gemm_k_pad;
@@ -310,12 +310,12 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
 	     StRowDest   = lookAheadFullRow[i-1];
 	     temp_nbrow  = lookAheadFullRow[i]-lookAheadFullRow[i-1];
 	 }
-	 
+
 	 int StRowSource = lookAheadStRow[i];
-	 
+
 	 /* Now copying one block into L lookahead buffer */
 	 /* #pragma omp parallel for (gives slow down) */
-	 // for (int j = 0; j < knsupc; ++j) { 
+	 // for (int j = 0; j < knsupc; ++j) {
 	 for (j = knsupc-ldu; j < knsupc; ++j) { /* skip leading columns
 						    corresponding to zero U rows */
 #if 1
@@ -386,7 +386,7 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
       * Perform GEMM (look-ahead L part, and remain L part) followed by Scatter
       *************************************************************************/
      tempu = bigU;  /* setting to the start of padded U(k,:) */
-    
+
      if ( Lnbrow>0 && ldu>0 && ncols>0 ) { /* Both L(:,k) and U(k,:) nonempty */
 	 /***************************************************************
 	  * Updating blocks in look-ahead window of the LU(look-ahead-rows,:)
@@ -404,7 +404,7 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
 #pragma omp parallel default (shared) private(thread_id)
 	 {
 	   thread_id = omp_get_thread_num();
- 
+
 	   /* Ideally, should organize the loop as:
 	      for (j = 0; j < nub; ++j) {
 	          for (lb = 0; lb < lookAheadBlk; ++lb) {
@@ -428,7 +428,7 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
 	   int* indirect_thread    = indirect;
 	   int* indirect2_thread   = indirect2;
 #endif
-	   /* Each thread is assigned one loop index ij, responsible for 
+	   /* Each thread is assigned one loop index ij, responsible for
 	      block update L(lb,k) * U(k,j) -> tempv[]. */
 	   for (int ij = 0; ij < lookAheadBlk*(nub-jj0); ++ij) {
 	       /* jj0 starts after look-ahead window. */
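	       /* Sketch (the decomposition itself lies outside this hunk): the flat
	        * index ij presumably splits as lb = ij % lookAheadBlk and
	        * j = ij / lookAheadBlk + jj0, so each ij names one block pair
	        * L(lb,k) * U(k,j) and OpenMP can spread the pairs over threads. */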
@@ -449,7 +449,7 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
                 st_col = Ublock_info[j-1].full_u_cols;
             } else {
                 ncols  = Ublock_info[j].full_u_cols;
-                st_col = 0;   
+                st_col = 0;
             }
 
             /* Getting L block L(i,k) information */
@@ -474,7 +474,7 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
 	    gemm_max_k = SUPERLU_MAX(gemm_max_k, ldu);
 #endif
 
-#if defined (USE_VENDOR_BLAS)            
+#if defined (USE_VENDOR_BLAS)
             dgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
 		   //&lookAhead_L_buff[(knsupc-ldu)*Lnbrow+cum_nrow], &Lnbrow,
 		   &lookAhead_L_buff[cum_nrow], &Lnbrow,
@@ -510,7 +510,7 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
 	    __itt_resume(); // start VTune, again use 2 underscores
 #endif
                 dscatter_l (
-				 ib, ljb, 
+				 ib, ljb,
 				 nsupc, iukp, xsup,
  				 klst, temp_nbrow,
 				 lptr, temp_nbrow,
@@ -527,7 +527,7 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
             }
 
 #if ( PRNTlevel>=1 )
-	    if (thread_id == 0) 
+	    if (thread_id == 0)
 		LookAheadScatterTimer += SuperLU_timer_() - tt_start;
 #endif
 	   } /* end omp for ij = ... */
@@ -597,7 +597,7 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
 #pragma omp parallel default(shared) private(thread_id)
 	{
 	    thread_id = omp_get_thread_num();
- 
+
 	    /* Ideally, should organize the loop as:
                for (j = 0; j < jj_cpu; ++j) {
 	           for (lb = 0; lb < RemainBlk; ++lb) {
@@ -621,7 +621,7 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
 	    int* indirect_thread = indirect;
 	    int* indirect2_thread = indirect2;
 #endif
-	    /* Each thread is assigned one loop index ij, responsible for 
+	    /* Each thread is assigned one loop index ij, responsible for
 	       block update L(lb,k) * U(k,j) -> tempv[]. */
 	    for (int ij = 0; ij < RemainBlk*(jj_cpu-jj0); ++ij) {
 		/* jj_cpu := nub, jj0 starts after look-ahead window. */
@@ -642,7 +642,7 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
 		    st_col = Ublock_info[j-1].full_u_cols;
 		} else {
 		    ncols = Ublock_info[j].full_u_cols;
-		    st_col = 0;   
+		    st_col = 0;
 		}
 
 		/* Getting L block L(i,k) information */
@@ -651,9 +651,9 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
 		int temp_nbrow = lsub[lptr+1];
 		lptr += LB_DESCRIPTOR;
 		int cum_nrow = (lb==0 ? 0 : Remain_info[lb-1].FullRow);
-		
+
 		/* tempv1 points to block(i,j) in bigV : LDA == Rnbrow */
-		//double* tempv1 = bigV + (st_col * Rnbrow + cum_nrow); Sherry 
+		//double* tempv1 = bigV + (st_col * Rnbrow + cum_nrow); Sherry
 		double* tempv1 = bigV + (st_col * gemm_m_pad + cum_nrow); /* Sherry */
 
 		// printf("[%d] .. before scatter: ib %d, jb %d, temp_nbrow %d, Rnbrow %d\n", iam, ib, jb, temp_nbrow, Rnbrow); fflush(stdout);
@@ -684,13 +684,13 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
 			       grid
 			       );
 		}
-		
+
 	    } /* end omp for (int ij =...) */
-	    
+
 #ifdef _OPENMP
 	} /* end omp parallel region */
 #endif
-	
+
 #if ( PRNTlevel>=1 )
 	RemainScatterTimer += SuperLU_timer_() - tt_start;
 #endif
diff -pruN 6.1.0+dfsg1-1/SRC/dSchCompUdt-cuda.c 6.1.1+dfsg1-1/SRC/dSchCompUdt-cuda.c
--- 6.1.0+dfsg1-1/SRC/dSchCompUdt-cuda.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/dSchCompUdt-cuda.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,16 +1,16 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
 
-/*! @file 
+/*! @file
  * \brief This file contains the main loop of pdgstrf which involves
  *        rank k update of the Schur complement.
  *        Uses CUDA GPU.
@@ -48,24 +48,24 @@ if ( msg0 && msg2 ) {  /* L(:,k) and U(k
 
     lptr = lptr0;
     luptr = luptr0;
-    
+
     nbrow= lsub[1];
     if (myrow==krow) nbrow = lsub[1]-lsub[3];
 
     if (nbrow>0) {
-        
+
         int ncol_max = SUPERLU_MIN(buffer_size/nbrow,bigu_size/ldt);
         int num_streams_used,        /*number of streams that will be used*/
         ncpu_blks;                     /*Number of CPU dgemm blks*/
 
-        int jjj, jjj_st,jjj_global;        
+        int jjj, jjj_st,jjj_global;
         for (j = jj0; j < nub; ++j) {
             arrive_at_ublock( j,&iukp,&rukp,&jb,&ljb,&nsupc,
 	    		      iukp0,rukp0,usub,perm_u,xsup,grid );
 
-            ncols =0 ;  //initialize at 0 
+            ncols =0 ;  //initialize at 0
             jj = iukp;
-            int temp_ldu=0; 
+            int temp_ldu=0;
             for (; jj < iukp+nsupc; ++jj) {
                 segsize = klst - usub[jj];
                 if ( segsize ) {
@@ -79,8 +79,8 @@ if ( msg0 && msg2 ) {  /* L(:,k) and U(k
         } /* end for j = jj0..nub */
 
         jjj = jj0; /* initialization */
-            
-        // #pragma omp barrier 
+
+        // #pragma omp barrier
         while ( jjj < nub ) {
             jjj_st=jjj;
 #ifdef _OPENMP
@@ -89,21 +89,21 @@ if ( msg0 && msg2 ) {  /* L(:,k) and U(k
             {
                 ldu = blk_ldu[jjj_st];
                 for (j = jjj_st; j < nub ; ++j) {
-                    
+
                     /* prefix sum */
                     if (j != jjj_st) full_u_cols[j] += full_u_cols[j-1];
 
-                    ldu = SUPERLU_MAX(ldu, blk_ldu[j]);   
+                    ldu = SUPERLU_MAX(ldu, blk_ldu[j]);
 
                     /* break condition */
                     /* the number of columns that can be processed is limited by buffer size*/
                     if (full_u_cols[j]+((j+1==nub)?0:full_u_cols[j+1]) > ncol_max) {
                         break;
                     }
-                } /* end for j=jjj_st to nub */  
+                } /* end for j=jjj_st to nub */
 
                 jjj_global = SUPERLU_MIN(nub, j+1); /* Maximum value of jjj will be nub */
-                
+
                 // TAU_STATIC_TIMER_START("work_divison");
                 /* Divide CPU-GPU gemm here */
                 gemm_division_cpu_gpu(
@@ -128,8 +128,8 @@ if ( msg0 && msg2 ) {  /* L(:,k) and U(k
                 if(nbrow * full_u_cols[jjj_st] > buffer_size)
                     printf("%d buffer_size %d\n",nbrow*full_u_cols[jjj_st],buffer_size );
             }
-            
-            // #pragma omp barrier 
+
+            // #pragma omp barrier
             /* gathering circuit */
             assert(jjj_st<nub);
             assert(jjj-1<nub);
@@ -161,25 +161,25 @@ if ( msg0 && msg2 ) {  /* L(:,k) and U(k
 
                 rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
 
-            } /* end for j=jjj_st to jjj */  
+            } /* end for j=jjj_st to jjj */
 
 	    if ( num_streams_used > 0 ) {
 #ifdef PI_DEBUG
 		printf("nbrow %d *ldu %d  =%d < ldt %d * max_row_size %d =%d \n",nbrow,ldu,nbrow*ldu,ldt,max_row_size,ldt*max_row_size );
 		assert(nbrow*ldu<=ldt*max_row_size);
-#endif 
+#endif
 		cudaMemcpy2DAsync(dA, nbrow*sizeof(double),
 				  &lusup[luptr+(knsupc-ldu)*nsupr],
 				  nsupr*sizeof(double), nbrow*sizeof(double),
 				  ldu, cudaMemcpyHostToDevice, streams[0]);
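		  /* Reading of the 2-D copy above: the host panel lusup has leading
		   * dimension nsupr (spitch = nsupr*sizeof(double)) while the device
		   * buffer dA is packed with leading dimension nbrow (dpitch =
		   * nbrow*sizeof(double)); each of the ldu copied rows of the transfer
		   * is one of the trailing ldu columns of L(:,k), restricted to the
		   * nbrow rows being updated. */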
 	    }
-                
+
 	    for (int i = 0; i < num_streams_used; ++i) {
-		int st = (i==0) ? ncpu_blks+jjj_st : jjj_st+stream_end_col[i-1]; 
+		int st = (i==0) ? ncpu_blks+jjj_st : jjj_st+stream_end_col[i-1];
 		int st_col = full_u_cols[st-1];
 		int num_col_stream = full_u_cols[jjj_st+stream_end_col[i]-1]-full_u_cols[st-1];
 		tempu = bigU;
-                    
+
 		double *tempv1 = bigV + full_u_cols[st-1]*nbrow;
 
 		/* Following is for testing purpose */
@@ -189,51 +189,51 @@ if ( msg0 && msg2 ) {  /* L(:,k) and U(k
 		int c_offset  = st_col * nbrow;
 		size_t B_stream_size = ldu * num_col_stream * sizeof(double);
 		size_t C_stream_size = nbrow * num_col_stream * sizeof(double);
-		
+
 		assert(ldu*(st_col+num_col_stream) < bigu_size);
 		assert(nbrow*(st_col+num_col_stream) < buffer_size);
-		
+
 		cudaMemcpyAsync(dB+b_offset, tempu+b_offset, B_stream_size,
 				cudaMemcpyHostToDevice, streams[stream_id]);
-		
+
 		cublasCheckErrors(
 				  cublasSetStream(handle[stream_id],
 						  streams[stream_id])
 				  );
-		
+
 		cublasCheckErrors(
 				  cublasDgemm(handle[stream_id],
 					      CUBLAS_OP_N, CUBLAS_OP_N,
 					      nbrow, num_col_stream, ldu,
                                               &alpha, dA, nbrow,
-					      &dB[b_offset], ldu, 
+					      &dB[b_offset], ldu,
 					      &beta, &dC[c_offset],
                                               nbrow)
 				  );
-		
+
 		checkCuda( cudaMemcpyAsync(tempv1, dC+c_offset,
 					   C_stream_size,
 					   cudaMemcpyDeviceToHost,
 					   streams[stream_id]) );
-#else 
-		if ( num_col_stream > 0 ) {   
+#else
+		if ( num_col_stream > 0 ) {
 		    my_dgemm_("N", "N", &nbrow, &num_col_stream, &ldu,
 			      &alpha, &lusup[luptr+(knsupc-ldu)*nsupr],
 			      &nsupr, tempu+ldu*st_col, &ldu, &beta,
 			      tempv1, &nbrow, 1, 1);
 		}
-		
-#endif 
-		
+
+#endif
+
 	    } /* end for i = 1 to num_streams used */
-	    
+
 	    int num_col = full_u_cols[jjj_st+ncpu_blks-1];
 	    int st_col = 0;        /*special case for cpu */
 	    tempv = bigV + nbrow * st_col;
 	    tempu = bigU;
-	    
+
 	    double tstart = SuperLU_timer_();
-#if defined (USE_VENDOR_BLAS)            
+#if defined (USE_VENDOR_BLAS)
 	    dgemm_("N", "N", &nbrow, &num_col, &ldu, &alpha,
 		  &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr,
 		  tempu+ldu*st_col, &ldu, &beta, tempv, &nbrow, 1, 1);
@@ -244,12 +244,12 @@ if ( msg0 && msg2 ) {  /* L(:,k) and U(k
 #endif
 	    gemm_timer += SuperLU_timer_() -tstart;
 	    stat->ops[FACT] += 2 * nbrow * ldu * full_u_cols[jjj-1];
-	    
+
 	    // printf("after dgemm \n");
-	    
+
             /* Now scattering blocks handled by cpu */
             int temp_ncol;
-	    
+
             /* scatter first blocks which cpu has computated*/
             tstart = SuperLU_timer_();
 
@@ -264,14 +264,14 @@ if ( msg0 && msg2 ) {  /* L(:,k) and U(k
 #endif
             {
                 int thread_id = omp_get_thread_num();
-        
+
                 int* indirect_thread = indirect + ldt*thread_id;
                 int* indirect2_thread = indirect2 + ldt*thread_id;
                 double* tempv1;
-                
+
                 if (ncpu_blks< omp_get_num_threads()) {
                     // TAU_STATIC_TIMER_START("SPECIAL_CPU_SCATTER");
-                    
+
                     for (j = jjj_st; j < jjj_st+ncpu_blks; ++j) {
                         /* code */
                         #ifdef PI_DEBUG
@@ -338,7 +338,7 @@ if ( msg0 && msg2 ) {  /* L(:,k) and U(k
                                 printf("cpu scatter \n");
                                 printf("A(%d,%d) goes to L block %d \n", ib,jb,ljb);
 #endif
-                                
+
                                 tempv = tempv1+cum_nrow;
 
                                 dscatter_l (
@@ -367,7 +367,7 @@ if ( msg0 && msg2 ) {  /* L(:,k) and U(k
                         /* code */
                         #ifdef PI_DEBUG
                             printf("scattering %d  block column\n",j);
-                        #endif 
+                        #endif
 
                         /* == processing each of the remaining columns == */
                         if(j==jjj_st) tempv1 = bigV;
@@ -391,7 +391,7 @@ if ( msg0 && msg2 ) {  /* L(:,k) and U(k
 			    if(j==jjj_st) {
 				temp_ncol = full_u_cols[j];
 			    } else {
-				temp_ncol = full_u_cols[j]- full_u_cols[j-1];  
+				temp_ncol = full_u_cols[j]- full_u_cols[j-1];
 			    }
 			    printf("%d %d %d \n",temp_nbrow, temp_ncol,ldu);
 #endif
@@ -432,7 +432,7 @@ if ( msg0 && msg2 ) {  /* L(:,k) and U(k
 			    lptr += temp_nbrow;
 			    luptr += temp_nbrow;
 			    cum_nrow += temp_nbrow;
-			
+
 			} /* for lb ... */
 
 			luptr=luptr0;
@@ -440,7 +440,7 @@ if ( msg0 && msg2 ) {  /* L(:,k) and U(k
 		}     /* else if (ncpu_blks >= omp_get_num_threads()) */
 	    }         /* parallel region */
 
-	    scatter_timer += SuperLU_timer_() - tstart; 
+	    scatter_timer += SuperLU_timer_() - tstart;
 #ifdef _OPENMP
 #pragma omp parallel							\
     private(j,iukp,rukp, tempu, tempv, cum_nrow, jb, nsupc,ljb,		\
@@ -452,7 +452,7 @@ if ( msg0 && msg2 ) {  /* L(:,k) and U(k
 #endif
             {
                 int thread_id = omp_get_thread_num();
-        
+
                 int* indirect_thread = indirect + ldt*thread_id;
                 int* indirect2_thread = indirect2 + ldt*thread_id;
                 double* tempv1;
@@ -464,12 +464,12 @@ if ( msg0 && msg2 ) {  /* L(:,k) and U(k
                     assert(jjj_st1>jjj_st) ;
 
                     /* now scatter it */
-#pragma omp for schedule( SCHEDULE_STRATEGY ) nowait 
+#pragma omp for schedule( SCHEDULE_STRATEGY ) nowait
                     for (j = jjj_st1; j < jjj_end; ++j) {
                         /* code */
 #ifdef PI_DEBUG
 			printf("scattering %d  block column\n",j);
-#endif 
+#endif
                         /* == processing each of the remaining columns == */
 
                         if(j==jjj_st) tempv1 = bigV;
@@ -492,7 +492,7 @@ if ( msg0 && msg2 ) {  /* L(:,k) and U(k
 			    if(j==jjj_st) {
 				temp_ncol = full_u_cols[j];
 			    } else {
-				temp_ncol = full_u_cols[j]- full_u_cols[j-1];  
+				temp_ncol = full_u_cols[j]- full_u_cols[j-1];
 			    }
 			    printf("%d %d %d \n",temp_nbrow, temp_ncol,ldu);
 #endif
@@ -532,19 +532,19 @@ if ( msg0 && msg2 ) {  /* L(:,k) and U(k
                             lptr += temp_nbrow;
                             luptr += temp_nbrow;
                             cum_nrow += temp_nbrow;
-			    
+
                         } /* for lb ... */
 
                         luptr=luptr0;
                     } /* for j = jjj_st ... */
-                    
+
                 } /* end for i = 0 to nstreams */
                 // TAU_STATIC_TIMER_STOP("GPU_SCATTER");
                 // TAU_STATIC_TIMER_STOP("INSIDE_OMP");
             } /* end pragma omp parallel */
             // TAU_STATIC_TIMER_STOP("OUTSIDE_OMP");
         }  /* end while(jjj<nub) */
- 
+
     } /* if nbrow>0 */
 
  }   /* if msg1 and msg 2 */
diff -pruN 6.1.0+dfsg1-1/SRC/dutil_dist.c 6.1.1+dfsg1-1/SRC/dutil_dist.c
--- 6.1.0+dfsg1-1/SRC/dutil_dist.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/dutil_dist.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,20 +1,20 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
 
-/*! @file 
+/*! @file
  * \brief Several matrix utilities
  *
  * <pre>
- * -- Distributed SuperLU routine (version 2.0) --
+ * -- Distributed SuperLU routine (version 6.1.1) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * March 15, 2003
  *
@@ -24,7 +24,7 @@ at the top-level directory.
 #include "superlu_ddefs.h"
 
 void
-dCreate_CompCol_Matrix_dist(SuperMatrix *A, int_t m, int_t n, int_t nnz, 
+dCreate_CompCol_Matrix_dist(SuperMatrix *A, int_t m, int_t n, int_t nnz,
 			    double *nzval, int_t *rowind, int_t *colptr,
 			    Stype_t stype, Dtype_t dtype, Mtype_t mtype)
 {
@@ -71,7 +71,7 @@ dCreate_CompRowLoc_Matrix_dist(SuperMatr
 /*! \brief Convert a row compressed storage into a column compressed storage.
  */
 void
-dCompRow_to_CompCol_dist(int_t m, int_t n, int_t nnz, 
+dCompRow_to_CompCol_dist(int_t m, int_t n, int_t nnz,
                          double *a, int_t *colind, int_t *rowptr,
                          double **at, int_t **rowind, int_t **colptr)
 {
@@ -83,7 +83,7 @@ dCompRow_to_CompCol_dist(int_t m, int_t
     *rowind = intMalloc_dist(nnz);
     *colptr = intMalloc_dist(n+1);
     marker = intCalloc_dist(n);
-    
+
     /* Get counts of each column of A, and set up column pointers */
     for (i = 0; i < m; ++i)
 	for (j = rowptr[i]; j < rowptr[i+1]; ++j) ++marker[colind[j]];
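The counting pass above is the first step of the classic count / prefix-sum / scatter conversion. A self-contained sketch of the whole idea on a tiny matrix (illustrative only; it uses plain int and double instead of int_t and the SUPERLU_MALLOC allocators):

    #include <stdio.h>

    int main(void)
    {
        /* 2 x 3 matrix in CSR form:  [ 1 0 2 ]
                                      [ 0 3 0 ]                              */
        double a[]      = {1.0, 2.0, 3.0};
        int    colind[] = {0, 2, 1};
        int    rowptr[] = {0, 2, 3};
        int    m = 2, n = 3;

        double at[3];
        int    rowind[3], colptr[4], marker[3] = {0, 0, 0};

        /* 1. count the entries that fall in each column (as in the loop above) */
        for (int i = 0; i < m; i++)
            for (int j = rowptr[i]; j < rowptr[i+1]; j++)
                marker[colind[j]]++;

        /* 2. prefix sums turn the counts into column pointers */
        colptr[0] = 0;
        for (int j = 0; j < n; j++) {
            colptr[j+1] = colptr[j] + marker[j];
            marker[j]   = colptr[j];          /* reuse marker as insertion cursor */
        }

        /* 3. scatter every CSR entry into its column */
        for (int i = 0; i < m; i++)
            for (int j = rowptr[i]; j < rowptr[i+1]; j++) {
                int c = colind[j];
                rowind[marker[c]] = i;
                at[marker[c]]     = a[j];
                marker[c]++;
            }

        for (int j = 0; j < n; j++)
            for (int p = colptr[j]; p < colptr[j+1]; p++)
                printf("a(%d,%d) = %.1f\n", rowind[p], j, at[p]);
        return 0;
    }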
@@ -134,7 +134,7 @@ void dPrint_CompCol_Matrix_dist(SuperMat
     NCformat     *Astore;
     register int i;
     double       *dp;
-    
+
     printf("\nCompCol matrix: ");
     printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
     Astore = (NCformat *) A->Store;
@@ -145,10 +145,10 @@ void dPrint_CompCol_Matrix_dist(SuperMat
         for (i = 0; i < Astore->nnz; ++i) printf("%f  ", dp[i]);
     }
     printf("\nrowind:\n");
-    for (i = 0; i < Astore->nnz; ++i) 
+    for (i = 0; i < Astore->nnz; ++i)
         printf("%lld  ", (long long) Astore->rowind[i]);
     printf("\ncolptr:\n");
-    for (i = 0; i <= A->ncol; ++i) 
+    for (i = 0; i <= A->ncol; ++i)
         printf("%lld  ", (long long) Astore->colptr[i]);
     printf("\nend CompCol matrix.\n");
 }
@@ -158,12 +158,12 @@ void dPrint_Dense_Matrix_dist(SuperMatri
     DNformat     *Astore;
     register int i;
     double       *dp;
-    
+
     printf("\nDense matrix: ");
     printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
     Astore = (DNformat *) A->Store;
     dp = (double *) Astore->nzval;
-    printf("nrow %lld, ncol %lld, lda %lld\n", 
+    printf("nrow %lld, ncol %lld, lda %lld\n",
         (long long) A->nrow, (long long) A->ncol, (long long) Astore->lda);
     printf("\nnzval: ");
     for (i = 0; i < A->nrow; ++i) printf("%f  ", dp[i]);
@@ -175,14 +175,14 @@ int dPrint_CompRowLoc_Matrix_dist(SuperM
     NRformat_loc  *Astore;
     int_t  nnz_loc, m_loc;
     double  *dp;
-    
+
     printf("\n==== CompRowLoc matrix: ");
     printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
     Astore = (NRformat_loc *) A->Store;
-    printf("nrow %ld, ncol %ld\n", 
+    printf("nrow %ld, ncol %ld\n",
             (long int) A->nrow, (long int) A->ncol);
     nnz_loc = Astore->nnz_loc; m_loc = Astore->m_loc;
-    printf("nnz_loc %ld, m_loc %ld, fst_row %ld\n", (long int) nnz_loc, 
+    printf("nnz_loc %ld, m_loc %ld, fst_row %ld\n", (long int) nnz_loc,
             (long int) m_loc, (long int) Astore->fst_row);
     PrintInt10("rowptr", m_loc+1, Astore->rowptr);
     PrintInt10("colind", nnz_loc, Astore->colind);
@@ -197,7 +197,7 @@ int file_dPrint_CompRowLoc_Matrix_dist(F
     NRformat_loc     *Astore;
     int_t  nnz_loc, m_loc;
     double       *dp;
-    
+
     fprintf(fp, "\n==== CompRowLoc matrix: ");
     fprintf(fp, "Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
     Astore = (NRformat_loc *) A->Store;
@@ -219,7 +219,7 @@ dCreate_Dense_Matrix_dist(SuperMatrix *X
 			  Mtype_t mtype)
 {
     DNformat    *Xstore;
-    
+
     X->Stype = stype;
     X->Dtype = dtype;
     X->Mtype = mtype;
@@ -246,14 +246,14 @@ dCopy_Dense_Matrix_dist(int_t M, int_t N
  * </pre>
  */
     int    i, j;
-    
+
     for (j = 0; j < N; ++j)
         for (i = 0; i < M; ++i)
             Y[i + j*ldy] = X[i + j*ldx];
 }
 
 void
-dCreate_SuperNode_Matrix_dist(SuperMatrix *L, int_t m, int_t n, int_t nnz, 
+dCreate_SuperNode_Matrix_dist(SuperMatrix *L, int_t m, int_t n, int_t nnz,
 			      double *nzval, int_t *nzval_colptr,
 			      int_t *rowind, int_t *rowind_colptr,
 			      int_t *col_to_sup, int_t *sup_to_col,
@@ -286,7 +286,7 @@ dCreate_SuperNode_Matrix_dist(SuperMatri
  *  and shape as A.
  *  The clone operation would copy all the non-pointer structure members like
  *  nrow, ncol, Stype, Dtype, Mtype from A and allocate a new nested Store
- *  structure. It would also copy nnz_loc, m_loc, fst_row from A->Store 
+ *  structure. It would also copy nnz_loc, m_loc, fst_row from A->Store
  *  into B->Store. It does not copy the matrix entries, row pointers,
  *  or column indices.
  */
@@ -317,16 +317,14 @@ void dClone_CompRowLoc_Matrix_dist(Super
     return;
 }
 
-/* \brief Copy: Call the clone operation and then copies all entries,
- *  row pointers, and column indices of a matrix into another matrix of
- *  the same type, B_{i,j}=A_{i,j}, for i,j=1,...,n
+/* \brief Copy: copies all entries, row pointers, and column indices of
+ *  a matrix into another matrix of the same type,
+ *  B_{i,j}=A_{i,j}, for i,j=1,...,n
  */
 void dCopy_CompRowLoc_Matrix_dist(SuperMatrix *A, SuperMatrix *B)
 {
     NRformat_loc  *Astore, *Bstore;
 
-    dClone_CompRowLoc_Matrix_dist(A, B);
-
     Astore = (NRformat_loc *) A->Store;
     Bstore = (NRformat_loc *) B->Store;
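With the clone call removed here, dCopy_CompRowLoc_Matrix_dist() no longer allocates B's storage itself, so a deep copy is presumably obtained by pairing the two routines explicitly. A hypothetical caller sketch (not taken from the sources):

    SuperMatrix Acopy;
    dClone_CompRowLoc_Matrix_dist(&A, &Acopy); /* allocate Store; copy nrow, ncol, nnz_loc, m_loc, fst_row */
    dCopy_CompRowLoc_Matrix_dist(&A, &Acopy);  /* copy entries, row pointers and column indices            */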
 
@@ -423,7 +421,7 @@ dFillRHS_dist(char *trans, int_t nrhs, d
 
 /*! \brief Fills a double precision array with a given value.
  */
-void 
+void
 dfill_dist(double *a, int_t alen, double dval)
 {
     register int_t i;
@@ -432,7 +430,7 @@ dfill_dist(double *a, int_t alen, double
 
 
 
-/*! \brief Check the inf-norm of the error vector 
+/*! \brief Check the inf-norm of the error vector
  */
 void dinf_norm_error_dist(int_t n, int_t nrhs, double *x, int_t ldx,
 			  double *xtrue, int_t ldxtrue,
@@ -458,7 +456,7 @@ void dinf_norm_error_dist(int_t n, int_t
 void PrintDouble5(char *name, int_t len, double *x)
 {
     register int_t i;
-    
+
     printf("%10s:", name);
     for (i = 0; i < len; ++i) {
 	if ( i % 5 == 0 ) printf("\n[%ld-%ld] ", (long int) i, (long int) i+4);
@@ -470,7 +468,7 @@ void PrintDouble5(char *name, int_t len,
 int file_PrintDouble5(FILE *fp, char *name, int_t len, double *x)
 {
     register int_t i;
-    
+
     fprintf(fp, "%10s:", name);
     for (i = 0; i < len; ++i) {
 	if ( i % 5 == 0 ) fprintf(fp, "\n[%ld-%ld] ", (long int) i, (long int) i+4);
@@ -508,7 +506,7 @@ void dPrintLblocks(int iam, int_t nsuper
 		   iam, gb, lb, nsupc, nb);
 	    for (c = 0, k = BC_HEADER, r = 0; c < nb; ++c) {
 		len = index[k+1];
-		printf("[%d] row-block %d: block # " IFMT "\tlength %d\n", 
+		printf("[%d] row-block %d: block # " IFMT "\tlength %d\n",
 		       iam, c, index[k], len);
 		PrintInt10("lsub", len, &index[k+LB_DESCRIPTOR]);
 		for (j = 0; j < nsupc; ++j) {
@@ -525,7 +523,7 @@ void dPrintLblocks(int iam, int_t nsuper
     printf("nfrecvx " IFMT "\n", Llu->nfrecvx);
     k = CEILING( nsupers, grid->nprow );
     PrintInt10("fmod", k, Llu->fmod);
-    
+
 } /* DPRINTLBLOCKS */
 
 
@@ -576,8 +574,8 @@ void dDumpLblocks(int iam, int_t nsupers
     int_t *index;
     double *nzval;
 	char filename[256];
-	FILE *fp, *fopen();	
- 
+	FILE *fp, *fopen();
+
 	// assert(grid->npcol*grid->nprow==1);
 
 	// count nonzeros in the first pass
@@ -597,27 +595,27 @@ void dDumpLblocks(int iam, int_t nsupers
 	    nsupc = SuperSize( gb );
 	    for (c = 0, k = BC_HEADER, r = 0; c < nb; ++c) {
 		len = index[k+1];
-		
+
 		for (j = 0; j < nsupc; ++j) {
 		for (i=0; i<len; ++i){
-		
+
 		if(index[k+LB_DESCRIPTOR+i]+1>=xsup[gb]+j+1){
-			nnzL ++; 
-			nmax = SUPERLU_MAX(n,index[k+LB_DESCRIPTOR+i]+1);  
+			nnzL ++;
+			nmax = SUPERLU_MAX(n,index[k+LB_DESCRIPTOR+i]+1);
 			n = nmax;
 		}
-		
+
 		}
 		}
 		k += LB_DESCRIPTOR + len;
 		r += len;
 	    }
-	}	
-    }	
+	}
+    }
 	MPI_Allreduce(MPI_IN_PLACE,&nnzL,1,mpi_int_t,MPI_SUM,grid->comm);
-	MPI_Allreduce(MPI_IN_PLACE,&n,1,mpi_int_t,MPI_MAX,grid->comm);	
-	
-	snprintf(filename, sizeof(filename), "%s-%d", "L", iam);    
+	MPI_Allreduce(MPI_IN_PLACE,&n,1,mpi_int_t,MPI_MAX,grid->comm);
+
+	snprintf(filename, sizeof(filename), "%s-%d", "L", iam);
     printf("Dumping L factor to --> %s\n", filename);
  	if ( !(fp = fopen(filename, "w")) ) {
 			ABORT("File open failed");
@@ -626,7 +624,7 @@ void dDumpLblocks(int iam, int_t nsupers
 	if(grid->iam==0){
 		fprintf(fp, "%d %d %d\n", n,n,nnzL);
 	}
-	
+
      ncb = nsupers / grid->npcol;
     extra = nsupers % grid->npcol;
     mycol = MYCOL( iam, grid );
@@ -641,29 +639,29 @@ void dDumpLblocks(int iam, int_t nsupers
 	    nsupc = SuperSize( gb );
 	    for (c = 0, k = BC_HEADER, r = 0; c < nb; ++c) {
 		len = index[k+1];
-		
+
 		for (j = 0; j < nsupc; ++j) {
 		for (i=0; i<len; ++i){
 			fprintf(fp, IFMT IFMT " %e\n", index[k+LB_DESCRIPTOR+i]+1, xsup[gb]+j+1, (double)iam);
-#if 0		
+#if 0
 			fprintf(fp, IFMT IFMT " %e\n", index[k+LB_DESCRIPTOR+i]+1, xsup[gb]+j+1, nzval[r +i+ j*nsupr]);
-#endif		
+#endif
 		}
 		}
 		k += LB_DESCRIPTOR + len;
 		r += len;
 	    }
-	}	
+	}
     }
  	fclose(fp);
- 	
+
 } /* dDumpLblocks */
 
 
 
 /*! \brief Print the blocks in the factored matrix U.
  */
-void dPrintUblocks(int iam, int_t nsupers, gridinfo_t *grid, 
+void dPrintUblocks(int iam, int_t nsupers, gridinfo_t *grid,
 		  Glu_persist_t *Glu_persist, LocalLU_t *Llu)
 {
     register int c, extra, jb, k, lb, len, nb, nrb, nsupc;
@@ -688,7 +686,7 @@ void dPrintUblocks(int iam, int_t nsuper
 	    for (c = 0, k = BR_HEADER; c < nb; ++c) {
 		jb = index[k];
 		len = index[k+1];
-		printf("[%d] col-block %d: block # %d\tlength " IFMT "\n", 
+		printf("[%d] col-block %d: block # %d\tlength " IFMT "\n",
 		       iam, c, jb, index[k+1]);
 		nsupc = SuperSize( jb );
 		PrintInt10("fstnz", nsupc, &index[k+UB_DESCRIPTOR]);
diff -pruN 6.1.0+dfsg1-1/SRC/pddistribute.c 6.1.1+dfsg1-1/SRC/pddistribute.c
--- 6.1.0+dfsg1-1/SRC/pddistribute.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/pddistribute.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,16 +1,16 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
 
-/*! @file 
+/*! @file
  * \brief Re-distribute A on the 2D process mesh.
  * <pre>
  * -- Distributed SuperLU routine (version 2.3) --
@@ -20,7 +20,7 @@ at the top-level directory.
  */
 
 #include "superlu_ddefs.h"
-	  
+
 
 /*! \brief
  *
@@ -28,10 +28,10 @@ at the top-level directory.
  * Purpose
  * =======
  *   Re-distribute A on the 2D process mesh.
- * 
+ *
  * Arguments
  * =========
- * 
+ *
  * A      (input) SuperMatrix*
  *	  The distributed input matrix A of dimension (A->nrow, A->ncol).
  *        A may be overwritten by diag(R)*A*diag(C)*Pc^T.
@@ -43,7 +43,7 @@ at the top-level directory.
  *
  * Glu_freeable (input) *Glu_freeable_t
  *        The global structure describing the graph of L and U.
- * 
+ *
  * grid   (input) gridinfo_t*
  *        The 2D process mesh.
  *
@@ -81,7 +81,7 @@ dReDistribute_A(SuperMatrix *A, ScalePer
     int    iam, it, p, procs, iam_g;
     MPI_Request *send_req;
     MPI_Status  status;
-    
+
 
     /* ------------------------------------------------------------
        INITIALIZATION.
@@ -98,8 +98,8 @@ dReDistribute_A(SuperMatrix *A, ScalePer
     m_loc = Astore->m_loc;
     fst_row = Astore->fst_row;
     nnzToRecv = intCalloc_dist(2*procs);
-    nnzToSend = nnzToRecv + procs;	
-	
+    nnzToSend = nnzToRecv + procs;
+
     /* ------------------------------------------------------------
        COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS,
        THEN ALLOCATE SPACE.
@@ -112,7 +112,7 @@ dReDistribute_A(SuperMatrix *A, ScalePer
 	    gbi = BlockNum( irow );
 	    gbj = BlockNum( jcol );
 	    p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid );
-	    ++nnzToSend[p]; 
+	    ++nnzToSend[p];
 	}
     }
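A worked instance of the owner computation in the loop above (the grid shape is hypothetical, and PROW/PCOL/PNUM are assumed to implement the usual block-cyclic mapping block % nprow, block % npcol and p = prow*npcol + pcol): on a 2 x 3 process grid, a nonzero whose row lies in supernodal block 5 and whose column lies in block 4 is charged to process row 5 mod 2 = 1 and process column 4 mod 3 = 1, i.e. nnzToSend[1*3 + 1] = nnzToSend[4] is incremented.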
 
@@ -177,7 +177,7 @@ dReDistribute_A(SuperMatrix *A, ScalePer
 	  }
       }
     } /* if procs > 1 */
-      
+
     if ( !(*colptr = intCalloc_dist(n+1)) )
         ABORT("Malloc fails for *colptr[].");
 
@@ -200,7 +200,7 @@ dReDistribute_A(SuperMatrix *A, ScalePer
 	        ia_send[p][k] = irow;
 	        ia_send[p][k + nnzToSend[p]] = jcol;
 		aij_send[p][k] = nzval_a[j];
-		++ptr_to_send[p]; 
+		++ptr_to_send[p];
 	    } else {          /* local */
 	        ia[nnz_loc] = irow;
 	        ja[nnz_loc] = jcol;
@@ -222,14 +222,14 @@ dReDistribute_A(SuperMatrix *A, ScalePer
 		       p, iam, grid->comm, &send_req[p] );
 	    it = nnzToSend[p];
 	    MPI_Isend( aij_send[p], it, MPI_DOUBLE,
-	               p, iam+procs, grid->comm, &send_req[procs+p] ); 
+	               p, iam+procs, grid->comm, &send_req[procs+p] );
 	}
     }
 
     for (p = 0; p < procs; ++p) {
         if ( p != iam ) {
 	    it = 2*nnzToRecv[p];
-	    MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status ); 
+	    MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status );
 	    it = nnzToRecv[p];
             MPI_Recv( dtemp, it, MPI_DOUBLE, p, p+procs,
 		      grid->comm, &status );
@@ -240,7 +240,7 @@ dReDistribute_A(SuperMatrix *A, ScalePer
 	        ja[nnz_loc] = jcol;
 		aij[nnz_loc] = dtemp[i];
 		++nnz_loc;
-		++(*colptr)[jcol]; /* Count nonzeros in each column */ 
+		++(*colptr)[jcol]; /* Count nonzeros in each column */
 	    }
 	}
     }
@@ -292,7 +292,7 @@ dReDistribute_A(SuperMatrix *A, ScalePer
 	jsize = (*colptr)[j];
 	(*colptr)[j] = k;
     }
-    
+
     /* Copy the triplets into the column oriented storage */
     for (i = 0; i < nnz_loc; ++i) {
 	j = ja[i];
@@ -314,7 +314,7 @@ dReDistribute_A(SuperMatrix *A, ScalePer
 #if ( DEBUGlevel>=1 )
     CHECK_MALLOC(iam, "Exit dReDistribute_A()");
 #endif
- 
+
     return 0;
 } /* dReDistribute_A */
 
@@ -332,10 +332,10 @@ pddistribute(fact_t fact, int_t n, Super
  * Purpose
  * =======
  *   Distribute the matrix onto the 2D process mesh.
- * 
+ *
  * Arguments
  * =========
- * 
+ *
  * fact (input) fact_t
  *        Specifies whether or not the L and U structures will be re-used.
  *        = SamePattern_SameRowPerm: L and U structures are input, and
@@ -356,7 +356,7 @@ pddistribute(fact_t fact, int_t n, Super
  *
  * Glu_freeable (input) *Glu_freeable_t
  *        The global structure describing the graph of L and U.
- * 
+ *
  * LUstruct (input) LUstruct_t*
  *        Data structures for L and U factors.
  *
@@ -371,7 +371,7 @@ pddistribute(fact_t fact, int_t n, Super
 {
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
     LocalLU_t *Llu = LUstruct->Llu;
-    int_t bnnz, fsupc, fsupc1, i, ii, irow, istart, j, ib, jb, jj, k, k1, 
+    int_t bnnz, fsupc, fsupc1, i, ii, irow, istart, j, ib, jb, jj, k, k1,
           len, len1, nsupc;
 	int_t lib;  /* local block row number */
 	int_t nlb;  /* local block rows*/
@@ -380,39 +380,39 @@ pddistribute(fact_t fact, int_t n, Super
     int_t nrbu; /* number of U blocks in current block column */
     int_t gb;   /* global block number; 0 < gb <= nsuper */
     int_t lb;   /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */
-	int_t ub,gik,iklrow,fnz;    
+	int_t ub,gik,iklrow,fnz;
 	int iam, jbrow, kcol, krow, mycol, myrow, pc, pr;
     int_t mybufmax[NBUFFERS];
     NRformat_loc *Astore;
     double *a;
     int_t *asub, *xa;
-    int_t *xa_begin, *xa_end;							 
+    int_t *xa_begin, *xa_end;
     int_t *xsup = Glu_persist->xsup;    /* supernode and column mapping */
-    int_t *supno = Glu_persist->supno;   
+    int_t *supno = Glu_persist->supno;
     int_t *lsub, *xlsub, *usub, *usub1, *xusub;
     int_t nsupers;
     int_t next_lind;      /* next available position in index[*] */
     int_t next_lval;      /* next available position in nzval[*] */
     int_t *index;         /* indices consist of headers and row subscripts */
-	int_t *index_srt;         /* indices consist of headers and row subscripts */    
+	int_t *index_srt;         /* indices consist of headers and row subscripts */
 	int   *index1;        /* temporary pointer to array of int */
     double *lusup, *lusup_srt, *uval; /* nonzero values in L and U */
     double **Lnzval_bc_ptr;  /* size ceil(NSUPERS/Pc) */
     int_t  **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */
-	int_t   **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc)                 */		    
-	int_t   *Unnz; /* size ceil(NSUPERS/Pc)                 */	
+	int_t   **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc)                 */
+	int_t   *Unnz; /* size ceil(NSUPERS/Pc)                 */
 	double **Unzval_br_ptr;  /* size ceil(NSUPERS/Pr) */
     int_t  **Ufstnz_br_ptr;  /* size ceil(NSUPERS/Pr) */
 
 	BcTree  *LBtree_ptr;       /* size ceil(NSUPERS/Pc)                */
 	RdTree  *LRtree_ptr;		  /* size ceil(NSUPERS/Pr)                */
 	BcTree  *UBtree_ptr;       /* size ceil(NSUPERS/Pc)                */
-	RdTree  *URtree_ptr;		  /* size ceil(NSUPERS/Pr)                */	
+	RdTree  *URtree_ptr;		  /* size ceil(NSUPERS/Pr)                */
 	int msgsize;
 
     int_t  *Urbs,*Urbs1; /* Number of row blocks in each block column of U. */
     Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
-    int_t  **Ucb_valptr;      /* Vertical linked list pointing to Unzval[] */  		
+    int_t  **Ucb_valptr;      /* Vertical linked list pointing to Unzval[] */
     /*-- Counts to be used in factorization. --*/
     int  *ToRecv, *ToSendD, **ToSendR;
 
@@ -428,7 +428,7 @@ pddistribute(fact_t fact, int_t n, Super
     int_t  **bsendx_plist; /* Column process list to send down Xk.   */
     int_t  nbrecvx = 0;    /* Number of Xk I will receive.           */
     int_t  nbsendx = 0;    /* Number of Xk I will send               */
-    int_t  *ilsum;         /* starting position of each supernode in 
+    int_t  *ilsum;         /* starting position of each supernode in
 			      the full array (local)                 */
 
     /*-- Auxiliary arrays; freed on return --*/
@@ -448,30 +448,30 @@ pddistribute(fact_t fact, int_t n, Super
 	int_t *idxs;
 	int_t **nzrows;
 	double rseed;
-	int rank_cnt,rank_cnt_ref,Root;    
+	int rank_cnt,rank_cnt_ref,Root;
 	double *dense, *dense_col; /* SPA */
     double zero = 0.0;
     int_t ldaspa;     /* LDA of SPA */
     int_t iword, dword;
     float mem_use = 0.0;
     float memTRS = 0.; /* memory allocated for storing the meta-data for triangular solve (positive number)*/
-	
+
     int_t *mod_bit;
     int_t *frecv, *brecv, *lloc;
     double **Linv_bc_ptr;  /* size ceil(NSUPERS/Pc) */
     double **Uinv_bc_ptr;  /* size ceil(NSUPERS/Pc) */
-    double *SeedSTD_BC,*SeedSTD_RD;				 
+    double *SeedSTD_BC,*SeedSTD_RD;
     int_t idx_indx,idx_lusup;
     int_t nbrow;
     int_t  ik, il, lk, rel, knsupc, idx_r;
     int_t  lptr1_tmp, idx_i, idx_v,m, uu;
     int_t nub;
-    int tag;	
-	
+    int tag;
+
 #if ( PRNTlevel>=1 )
     int_t nLblocks = 0, nUblocks = 0;
 #endif
-#if ( PROFlevel>=1 ) 
+#if ( PROFlevel>=1 )
     double t, t_u, t_l;
     int_t u_blks;
 #endif
@@ -486,7 +486,7 @@ pddistribute(fact_t fact, int_t n, Super
 
 //#if ( PRNTlevel>=1 )
     iword = sizeof(int_t);
-    dword = sizeof(double);					
+    dword = sizeof(double);
 //#endif
 
 #if ( DEBUGlevel>=1 )
@@ -522,11 +522,11 @@ pddistribute(fact_t fact, int_t n, Super
 	if ( !(Urb_indptr = intMalloc_dist(nrbu)) )
 	    ABORT("Malloc fails for Urb_indptr[].");
 	Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
-	Lindval_loc_bc_ptr = Llu->Lindval_loc_bc_ptr;											  
+	Lindval_loc_bc_ptr = Llu->Lindval_loc_bc_ptr;
 	Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
 	Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
 	Unzval_br_ptr = Llu->Unzval_br_ptr;
-	Unnz = Llu->Unnz;	
+	Unnz = Llu->Unnz;
 
 	mem_use += 2.0*nrbu*iword + ldaspa*sp_ienv_dist(3)*dword;
 
@@ -648,7 +648,7 @@ pddistribute(fact_t fact, int_t n, Super
 	xlsub = Glu_freeable->xlsub;
 	usub = Glu_freeable->usub;    /* compressed U subscripts */
 	xusub = Glu_freeable->xusub;
-    
+
 	if ( !(ToRecv = (int *) SUPERLU_MALLOC(nsupers * sizeof(int))) )
 	    ABORT("Malloc fails for ToRecv[].");
 	for (i = 0; i < nsupers; ++i) ToRecv[i] = 0;
@@ -667,12 +667,12 @@ pddistribute(fact_t fact, int_t n, Super
 	k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
 
 	/* Pointers to the beginning of each block row of U. */
-	if ( !(Unzval_br_ptr = 
+	if ( !(Unzval_br_ptr =
               (double**)SUPERLU_MALLOC(k * sizeof(double*))) )
 	    ABORT("Malloc fails for Unzval_br_ptr[].");
 	if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
 	    ABORT("Malloc fails for Ufstnz_br_ptr[].");
-	
+
 	if ( !(ToSendD = SUPERLU_MALLOC(k * sizeof(int))) )
 	    ABORT("Malloc fails for ToSendD[].");
 	for (i = 0; i < k; ++i) ToSendD[i] = NO;
@@ -705,7 +705,7 @@ pddistribute(fact_t fact, int_t n, Super
 		ilsum[lb + 1] = ilsum[lb] + i;
 	    }
 	}
-	
+
 #if ( PROFlevel>=1 )
 	t = SuperLU_timer_();
 #endif
@@ -713,7 +713,7 @@ pddistribute(fact_t fact, int_t n, Super
 	   COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U.
 	   THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U).
 	   ------------------------------------------------------------*/
-	
+
 	/* Loop through each supernode column. */
 	for (jb = 0; jb < nsupers; ++jb) {
 	    pc = PCOL( jb, grid );
@@ -750,7 +750,7 @@ pddistribute(fact_t fact, int_t n, Super
 		} /* for i ... */
 	    } /* for j ... */
 	} /* for jb ... */
-	
+
 	/* Set up the initial pointers for each block row in U. */
 	nrbu = CEILING( nsupers, grid->nprow );/* Number of local block rows */
 	for (lb = 0; lb < nrbu; ++lb) {
@@ -814,34 +814,34 @@ pddistribute(fact_t fact, int_t n, Super
 	k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
 
 	/* Pointers to the beginning of each block column of L. */
-	if ( !(Lnzval_bc_ptr = 
+	if ( !(Lnzval_bc_ptr =
               (double**)SUPERLU_MALLOC(k * sizeof(double*))) )
 	    ABORT("Malloc fails for Lnzval_bc_ptr[].");
 	if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
 	    ABORT("Malloc fails for Lrowind_bc_ptr[].");
 	Lrowind_bc_ptr[k-1] = NULL;
 
-	if ( !(Lindval_loc_bc_ptr = 
+	if ( !(Lindval_loc_bc_ptr =
 				(int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
 		ABORT("Malloc fails for Lindval_loc_bc_ptr[].");
 	Lindval_loc_bc_ptr[k-1] = NULL;
 
-	if ( !(Linv_bc_ptr = 
+	if ( !(Linv_bc_ptr =
 				(double**)SUPERLU_MALLOC(k * sizeof(double*))) ) {
 		fprintf(stderr, "Malloc fails for Linv_bc_ptr[].");
-	}  
-	if ( !(Uinv_bc_ptr = 
+	}
+	if ( !(Uinv_bc_ptr =
 				(double**)SUPERLU_MALLOC(k * sizeof(double*))) ) {
 		fprintf(stderr, "Malloc fails for Uinv_bc_ptr[].");
-	}  
+	}
 	Linv_bc_ptr[k-1] = NULL;
-	Uinv_bc_ptr[k-1] = NULL;	
-	
-	if ( !(Unnz = 
+	Uinv_bc_ptr[k-1] = NULL;
+
+	if ( !(Unnz =
 			(int_t*)SUPERLU_MALLOC(k * sizeof(int_t))) )
 	ABORT("Malloc fails for Unnz[].");
-		
-	
+
+
 	/* These lists of processes will be used for triangular solves. */
 	if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
 	    ABORT("Malloc fails for fsendx_plist[].");
@@ -861,7 +861,7 @@ pddistribute(fact_t fact, int_t n, Super
 	/* -------------------------------------------------------------- */
 	mem_use += 4.0*k*sizeof(int_t*) + 2.0*len*iword;
 	memTRS += k*sizeof(int_t*) + 2.0*k*sizeof(double*) + k*iword;  //acount for Lindval_loc_bc_ptr, Unnz, Linv_bc_ptr,Uinv_bc_ptr
-	
+
 	/*------------------------------------------------------------
 	  PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS.
 	  THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U.
@@ -873,7 +873,7 @@ pddistribute(fact_t fact, int_t n, Super
 		fsupc = FstBlockC( jb );
 		nsupc = SuperSize( jb );
 		ljb = LBj( jb, grid ); /* Local block number */
-		
+
 		/* Scatter A into SPA. */
 		for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) {
 		    for (i = xa[j]; i < xa[j+1]; ++i) {
@@ -918,7 +918,7 @@ pddistribute(fact_t fact, int_t n, Super
 			    index = Ufstnz_br_ptr[lb];
 			    uval = Unzval_br_ptr[lb];
 			    fsupc1 = FstBlockC( gb+1 );
-			    if (rb_marker[lb] <= jb) { /* First time see 
+			    if (rb_marker[lb] <= jb) { /* First time see
 							  the block       */
 				rb_marker[lb] = jb + 1;
 				Urb_indptr[lb] = Urb_fstnz[lb];
@@ -959,7 +959,7 @@ pddistribute(fact_t fact, int_t n, Super
 #if ( PROFlevel>=1 )
 		t_u += SuperLU_timer_() - t;
 		t = SuperLU_timer_();
-#endif		
+#endif
 		/*------------------------------------------------
 		 * SET UP L BLOCKS.
 		 *------------------------------------------------*/
@@ -1002,15 +1002,15 @@ pddistribute(fact_t fact, int_t n, Super
 		} /* for i ... */
 
 		if ( nrbl ) { /* Do not ensure the blocks are sorted! */
-		    /* Set up the initial pointers for each block in 
+		    /* Set up the initial pointers for each block in
 		       index[] and nzval[]. */
 		    /* Add room for descriptors */
 		    len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
-		    if ( !(index = intMalloc_dist(len1)) ) 
+		    if ( !(index = intMalloc_dist(len1)) )
 			ABORT("Malloc fails for index[]");
 		    if (!(lusup = (double*)SUPERLU_MALLOC(len*nsupc * sizeof(double))))
 			ABORT("Malloc fails for lusup[]");
-		    if ( !(Lindval_loc_bc_ptr[ljb] = intCalloc_dist(nrbl*3)) ) 
+		    if ( !(Lindval_loc_bc_ptr[ljb] = intCalloc_dist(nrbl*3)) )
 			ABORT("Malloc fails for Lindval_loc_bc_ptr[ljb][]");
   		    if (!(Linv_bc_ptr[ljb] = (double*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(double))))
 			ABORT("Malloc fails for Linv_bc_ptr[ljb][]");
@@ -1019,7 +1019,7 @@ pddistribute(fact_t fact, int_t n, Super
 		    mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 );
 		    mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc );
 		    mybufmax[4] = SUPERLU_MAX( mybufmax[4], len );
-	  	    memTRS += nrbl*3.0*iword + 2.0*nsupc*nsupc*dword;  //acount for Lindval_loc_bc_ptr[ljb],Linv_bc_ptr[ljb],Uinv_bc_ptr[ljb]			
+	  	    memTRS += nrbl*3.0*iword + 2.0*nsupc*nsupc*dword;  //acount for Lindval_loc_bc_ptr[ljb],Linv_bc_ptr[ljb],Uinv_bc_ptr[ljb]
 		    index[0] = nrbl;  /* Number of row blocks */
 		    index[1] = len;   /* LDA of the nzval[] */
 		    next_lind = BC_HEADER;
@@ -1030,10 +1030,10 @@ pddistribute(fact_t fact, int_t n, Super
 			len = Lrb_length[lb];
 			Lindval_loc_bc_ptr[ljb][k] = lb;
 			Lindval_loc_bc_ptr[ljb][k+nrbl] = next_lind;
-			Lindval_loc_bc_ptr[ljb][k+nrbl*2] = next_lval;																	 
+			Lindval_loc_bc_ptr[ljb][k+nrbl*2] = next_lval;
 			Lrb_length[lb] = 0;  /* Reset vector of block length */
 			index[next_lind++] = gb; /* Descriptor */
-			index[next_lind++] = len; 
+			index[next_lind++] = len;
 			Lrb_indptr[lb] = next_lind;
 			Lrb_valptr[lb] = next_lval;
 			next_lind += len;
@@ -1059,9 +1059,9 @@ pddistribute(fact_t fact, int_t n, Super
 			    }
 			}
 		    } /* for i ... */
-			
+
 		    Lrowind_bc_ptr[ljb] = index;
-		    Lnzval_bc_ptr[ljb] = lusup; 
+		    Lnzval_bc_ptr[ljb] = lusup;
 
 			/* sort Lindval_loc_bc_ptr[ljb], Lrowind_bc_ptr[ljb]
                            and Lnzval_bc_ptr[ljb] here.  */
@@ -1071,15 +1071,15 @@ pddistribute(fact_t fact, int_t n, Super
 					uu=nrbl-2;
 					lloc = &Lindval_loc_bc_ptr[ljb][1];
 				}else{
-					uu=nrbl-1;	
+					uu=nrbl-1;
 					lloc = Lindval_loc_bc_ptr[ljb];
-				}	
-				quickSortM(lloc,0,uu,nrbl,0,3);	
+				}
+				quickSortM(lloc,0,uu,nrbl,0,3);
 			}
 
 
-			if ( !(index_srt = intMalloc_dist(len1)) ) 
-				ABORT("Malloc fails for index_srt[]");				
+			if ( !(index_srt = intMalloc_dist(len1)) )
+				ABORT("Malloc fails for index_srt[]");
 			if (!(lusup_srt = (double*)SUPERLU_MALLOC(len*nsupc * sizeof(double))))
 				ABORT("Malloc fails for lusup_srt[]");
 
@@ -1094,26 +1094,26 @@ pddistribute(fact_t fact, int_t n, Super
 					index_srt[idx_indx++] = index[Lindval_loc_bc_ptr[ljb][i+nrbl]+jj];
 				}
 
-				Lindval_loc_bc_ptr[ljb][i+nrbl] = idx_indx - LB_DESCRIPTOR - nbrow; 
+				Lindval_loc_bc_ptr[ljb][i+nrbl] = idx_indx - LB_DESCRIPTOR - nbrow;
 
 				for (jj=0;jj<nbrow;jj++){
 					k=idx_lusup;
 					k1=Lindval_loc_bc_ptr[ljb][i+nrbl*2]+jj;
-					for (j = 0; j < nsupc; ++j) {				
+					for (j = 0; j < nsupc; ++j) {
 						lusup_srt[k] = lusup[k1];
 						k += len;
 						k1 += len;
-					}	
+					}
 					idx_lusup++;
-				}				
-				Lindval_loc_bc_ptr[ljb][i+nrbl*2] = idx_lusup - nbrow;	
+				}
+				Lindval_loc_bc_ptr[ljb][i+nrbl*2] = idx_lusup - nbrow;
 			}
 
 			SUPERLU_FREE(lusup);
 			SUPERLU_FREE(index);
 
 			Lrowind_bc_ptr[ljb] = index_srt;
-			Lnzval_bc_ptr[ljb] = lusup_srt; 			
+			Lnzval_bc_ptr[ljb] = lusup_srt;
 
 			// if(ljb==0)
 			// for (jj=0;jj<nrbl*3;jj++){
@@ -1122,15 +1122,15 @@ pddistribute(fact_t fact, int_t n, Super
 			// }
 			// for (jj=0;jj<nrbl;jj++){
 			// printf("iam %5d Lindval %5d\n",iam, index[Lindval_loc_bc_ptr[ljb][jj+nrbl]]);
-			// fflush(stdout);			
+			// fflush(stdout);
 
-			// }	
+			// }
 		} else {
 		    Lrowind_bc_ptr[ljb] = NULL;
 		    Lnzval_bc_ptr[ljb] = NULL;
 			Linv_bc_ptr[ljb] = NULL;
 			Uinv_bc_ptr[ljb] = NULL;
-			Lindval_loc_bc_ptr[ljb] = NULL;			
+			Lindval_loc_bc_ptr[ljb] = NULL;
 		} /* if nrbl ... */
 #if ( PROFlevel>=1 )
 		t_l += SuperLU_timer_() - t;
@@ -1140,7 +1140,7 @@ pddistribute(fact_t fact, int_t n, Super
 	} /* for jb ... */
 
 	/////////////////////////////////////////////////////////////////
-	
+
 	/* Set up additional pointers for the index and value arrays of U.
 	   nub is the number of local block columns. */
 	nub = CEILING( nsupers, grid->npcol); /* Number of local block columns. */
@@ -1154,7 +1154,7 @@ pddistribute(fact_t fact, int_t n, Super
 		ABORT("Malloc fails for Ucb_valptr[]");
 	nlb = CEILING( nsupers, grid->nprow ); /* Number of local block rows. */
 
-	/* Count number of row blocks in a block column. 
+	/* Count number of row blocks in a block column.
 	   One pass of the skeleton graph of U. */
 	for (lk = 0; lk < nlb; ++lk) {
 		usub1 = Ufstnz_br_ptr[lk];
@@ -1193,20 +1193,20 @@ pddistribute(fact_t fact, int_t n, Super
 
 				Ucb_indptr[ljb][Urbs1[ljb]].indpos = i;
 				Ucb_valptr[ljb][Urbs1[ljb]] = j;
-				
+
 				++Urbs1[ljb];
 				j += usub1[i+1];
 				i += UB_DESCRIPTOR + SuperSize( k );
 			}
 		}
-	}				
-	
+	}
+
 
-/* Count the nnzs per block column */	
+/* Count the nnzs per block column */
 	for (lb = 0; lb < nub; ++lb) {
 		Unnz[lb] = 0;
 		k = lb * grid->npcol + mycol;/* Global block number, column-wise. */
-		knsupc = SuperSize( k );	
+		knsupc = SuperSize( k );
 		for (ub = 0; ub < Urbs[lb]; ++ub) {
 			ik = Ucb_indptr[lb][ub].lbnum; /* Local block number, row-wise. */
 			i = Ucb_indptr[lb][ub].indpos; /* Start of the block in usub[]. */
@@ -1220,41 +1220,41 @@ pddistribute(fact_t fact, int_t n, Super
 				}
 			} /* for jj ... */
 		}
-	}			
-	
+	}
+
 	/////////////////////////////////////////////////////////////////
 
 #if ( PROFlevel>=1 )
 		t = SuperLU_timer_();
-#endif				
+#endif
 	/* construct the Bcast tree for L ... */
 
 	k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
 	if ( !(LBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) )
 		ABORT("Malloc fails for LBtree_ptr[].");
 	if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) )
-		ABORT("Calloc fails for ActiveFlag[].");	
+		ABORT("Calloc fails for ActiveFlag[].");
 	if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) )
-		ABORT("Malloc fails for ranks[].");	
+		ABORT("Malloc fails for ranks[].");
 	if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) )
-		ABORT("Malloc fails for SeedSTD_BC[].");	
+		ABORT("Malloc fails for SeedSTD_BC[].");
+
 
-		
 	for (i=0;i<k;i++){
-		SeedSTD_BC[i]=rand();		
+		SeedSTD_BC[i]=rand();
 	}
 
-	MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_BC[0],k,MPI_DOUBLE,MPI_MAX,grid->cscp.comm);					  
+	MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_BC[0],k,MPI_DOUBLE,MPI_MAX,grid->cscp.comm);
 
 	for (ljb = 0; ljb <k ; ++ljb) {
 		LBtree_ptr[ljb]=NULL;
-	}			
-	
+	}
+
 
 	if ( !(ActiveFlagAll = intMalloc_dist(grid->nprow*k)) )
-		ABORT("Calloc fails for ActiveFlag[].");	
-	memTRS += k*sizeof(BcTree) + k*dword + grid->nprow*k*iword;  //acount for LBtree_ptr, SeedSTD_BC, ActiveFlagAll		
-	for (j=0;j<grid->nprow*k;++j)ActiveFlagAll[j]=3*nsupers;	
+		ABORT("Calloc fails for ActiveFlag[].");
+	memTRS += k*sizeof(BcTree) + k*dword + grid->nprow*k*iword;  //acount for LBtree_ptr, SeedSTD_BC, ActiveFlagAll
+	for (j=0;j<grid->nprow*k;++j)ActiveFlagAll[j]=3*nsupers;
 	for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */
 		jb = mycol+ljb*grid->npcol;  /* not sure */
 		if(jb<nsupers){
@@ -1270,10 +1270,10 @@ pddistribute(fact_t fact, int_t n, Super
 			ActiveFlagAll[pr+ljb*grid->nprow]=SUPERLU_MIN(ActiveFlagAll[pr+ljb*grid->nprow],gb);
 		} /* for j ... */
 		}
-	}			
-	
+	}
+
 	for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */
-		
+
 		jb = mycol+ljb*grid->npcol;  /* not sure */
 		if(jb<nsupers){
 		pc = PCOL( jb, grid );
@@ -1282,19 +1282,19 @@ pddistribute(fact_t fact, int_t n, Super
 		for (j=0;j<grid->nprow;++j)ActiveFlag[j+grid->nprow]=j;
 		for (j=0;j<grid->nprow;++j)ranks[j]=-1;
 
-		Root=-1; 
-		Iactive = 0;				
+		Root=-1;
+		Iactive = 0;
 		for (j=0;j<grid->nprow;++j){
 			if(ActiveFlag[j]!=3*nsupers){
 			gb = ActiveFlag[j];
 			pr = PROW( gb, grid );
 			if(gb==jb)Root=pr;
-			if(myrow==pr)Iactive=1;		
-			}					
+			if(myrow==pr)Iactive=1;
+			}
 		}
-		
 
-		quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,0,2);	
+
+		quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,0,2);
 
 		if(Iactive==1){
 			// printf("jb %5d damn\n",jb);
@@ -1307,7 +1307,7 @@ pddistribute(fact_t fact, int_t n, Super
 					ranks[rank_cnt]=ActiveFlag[j+grid->nprow];
 					++rank_cnt;
 				}
-			}		
+			}
 
 			if(rank_cnt>1){
 
@@ -1317,7 +1317,7 @@ pddistribute(fact_t fact, int_t n, Super
 				// rseed=rand();
 				// rseed=1.0;
 				msgsize = SuperSize( jb );
-				LBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'d');  	
+				LBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'d');
 				BcTree_SetTag(LBtree_ptr[ljb],BC_L,'d');
 
 				// printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt);
@@ -1328,15 +1328,15 @@ pddistribute(fact_t fact, int_t n, Super
 				// fflush(stdout);
 				// }
 
-				// #if ( PRNTlevel>=1 )		
+				// #if ( PRNTlevel>=1 )
 				if(Root==myrow){
 					rank_cnt_ref=1;
 					for (j = 0; j < grid->nprow; ++j) {
-						if ( fsendx_plist[ljb][j] != EMPTY ) {	
-							++rank_cnt_ref;		
+						if ( fsendx_plist[ljb][j] != EMPTY ) {
+							++rank_cnt_ref;
 						}
 					}
-					assert(rank_cnt==rank_cnt_ref);		
+					assert(rank_cnt==rank_cnt_ref);
 
 					// printf("Partial Bcast Procs: col%7d np%4d\n",jb,rank_cnt);
 
@@ -1345,27 +1345,27 @@ pddistribute(fact_t fact, int_t n, Super
 					// // printf("\n");
 				}
 				// #endif
-			}	
+			}
 		}
 		}
 	}
 
-	
+
 	SUPERLU_FREE(ActiveFlag);
 	SUPERLU_FREE(ActiveFlagAll);
 	SUPERLU_FREE(ranks);
 	SUPERLU_FREE(SeedSTD_BC);
-	memTRS -= k*dword + grid->nprow*k*iword;  //acount for SeedSTD_BC, ActiveFlagAll	
-	
+	memTRS -= k*dword + grid->nprow*k*iword;  //acount for SeedSTD_BC, ActiveFlagAll
+
 #if ( PROFlevel>=1 )
 t = SuperLU_timer_() - t;
 if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t);
-#endif			
+#endif
 
 
 #if ( PROFlevel>=1 )
 		t = SuperLU_timer_();
-#endif			
+#endif
 	/* construct the Reduce tree for L ... */
 	/* the following is used as reference */
 	nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */
@@ -1394,24 +1394,24 @@ if ( !iam) printf(".. Construct Bcast tr
 	if ( !(LRtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) )
 		ABORT("Malloc fails for LRtree_ptr[].");
 	if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) )
-		ABORT("Calloc fails for ActiveFlag[].");	
+		ABORT("Calloc fails for ActiveFlag[].");
 	if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) )
-		ABORT("Malloc fails for ranks[].");	
+		ABORT("Malloc fails for ranks[].");
 
 	// if ( !(idxs = intCalloc_dist(nsupers)) )
-		// ABORT("Calloc fails for idxs[].");	
+		// ABORT("Calloc fails for idxs[].");
 
 	// if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) )
 		// ABORT("Malloc fails for nzrows[].");
 
 	if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) )
-		ABORT("Malloc fails for SeedSTD_RD[].");	
+		ABORT("Malloc fails for SeedSTD_RD[].");
 
 	for (i=0;i<k;i++){
-		SeedSTD_RD[i]=rand();		
+		SeedSTD_RD[i]=rand();
 	}
 
-	MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_RD[0],k,MPI_DOUBLE,MPI_MAX,grid->rscp.comm);					  
+	MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_RD[0],k,MPI_DOUBLE,MPI_MAX,grid->rscp.comm);
 
 
 	// for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */
@@ -1437,11 +1437,11 @@ if ( !iam) printf(".. Construct Bcast tr
 		LRtree_ptr[lib]=NULL;
 	}
 
-	
+
 	if ( !(ActiveFlagAll = intMalloc_dist(grid->npcol*k)) )
-		ABORT("Calloc fails for ActiveFlagAll[].");				
-	for (j=0;j<grid->npcol*k;++j)ActiveFlagAll[j]=-3*nsupers;	
-	memTRS += k*sizeof(RdTree) + k*dword + grid->npcol*k*iword;  //acount for LRtree_ptr, SeedSTD_RD, ActiveFlagAll						
+		ABORT("Calloc fails for ActiveFlagAll[].");
+	for (j=0;j<grid->npcol*k;++j)ActiveFlagAll[j]=-3*nsupers;
+	memTRS += k*sizeof(RdTree) + k*dword + grid->npcol*k*iword;  //acount for LRtree_ptr, SeedSTD_RD, ActiveFlagAll
 	for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */
 		fsupc = FstBlockC( jb );
 		pc = PCOL( jb, grid );
@@ -1456,7 +1456,7 @@ if ( !iam) printf(".. Construct Bcast tr
 		}
 	}
 
-	
+
 	for (lib=0;lib<k;++lib){
 		ib = myrow+lib*grid->nprow;  /* not sure */
 		if(ib<nsupers){
@@ -1464,19 +1464,19 @@ if ( !iam) printf(".. Construct Bcast tr
 			for (j=0;j<grid->npcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];
 			for (j=0;j<grid->npcol;++j)ActiveFlag[j+grid->npcol]=j;
 			for (j=0;j<grid->npcol;++j)ranks[j]=-1;
-			Root=-1; 
-			Iactive = 0;				
+			Root=-1;
+			Iactive = 0;
 
 			for (j=0;j<grid->npcol;++j){
 				if(ActiveFlag[j]!=-3*nsupers){
 				jb = ActiveFlag[j];
 				pc = PCOL( jb, grid );
 				if(jb==ib)Root=pc;
-				if(mycol==pc)Iactive=1;		
-				}					
+				if(mycol==pc)Iactive=1;
+				}
 			}
-		
-		
+
+
 			quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,1,2);
 
 			if(Iactive==1){
@@ -1492,7 +1492,7 @@ if ( !iam) printf(".. Construct Bcast tr
 				if(rank_cnt>1){
 
 					for (ii=0;ii<rank_cnt;ii++)   // use global ranks rather than local ranks
-						ranks[ii] = PNUM( pr, ranks[ii], grid );		
+						ranks[ii] = PNUM( pr, ranks[ii], grid );
 
 					// rseed=rand();
 					// rseed=1.0;
@@ -1500,7 +1500,7 @@ if ( !iam) printf(".. Construct Bcast tr
 
 					// if(ib==0){
 
-					LRtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'d');  	
+					LRtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'d');
 					RdTree_SetTag(LRtree_ptr[lib], RD_L,'d');
 					// }
 
@@ -1512,7 +1512,7 @@ if ( !iam) printf(".. Construct Bcast tr
 					// if(iam==15 || iam==3){
 					// printf("iam %5d rtree lk %5d tag %5d root %5d\n",iam,lib,ib,RdTree_IsRoot(LRtree_ptr[lib],'d'));
 					// fflush(stdout);
-					// }		
+					// }
 
 
 					// #if ( PRNTlevel>=1 )
@@ -1523,10 +1523,10 @@ if ( !iam) printf(".. Construct Bcast tr
 					// // // for(j=0;j<rank_cnt;++j)printf("%4d",ranks[j]);
 					// // printf("\n");
 					// }
-					// #endif		
+					// #endif
 				}
-			}				
-		}	
+			}
+		}
 	}
 
 	SUPERLU_FREE(mod_bit);
@@ -1535,24 +1535,24 @@ if ( !iam) printf(".. Construct Bcast tr
 
 	SUPERLU_FREE(ActiveFlag);
 	SUPERLU_FREE(ActiveFlagAll);
-	SUPERLU_FREE(ranks);	
-	// SUPERLU_FREE(idxs);	 
-	SUPERLU_FREE(SeedSTD_RD);	
+	SUPERLU_FREE(ranks);
+	// SUPERLU_FREE(idxs);
+	SUPERLU_FREE(SeedSTD_RD);
 	// for(i=0;i<nsupers;++i){
 		// if(nzrows[i])SUPERLU_FREE(nzrows[i]);
 	// }
 	// SUPERLU_FREE(nzrows);
-	memTRS -= k*dword + grid->nprow*k*iword;  //acount for SeedSTD_RD, ActiveFlagAll	
+	memTRS -= k*dword + grid->nprow*k*iword;  //acount for SeedSTD_RD, ActiveFlagAll
 		////////////////////////////////////////////////////////
 
 #if ( PROFlevel>=1 )
 t = SuperLU_timer_() - t;
 if ( !iam) printf(".. Construct Reduce tree for L: %.2f\t\n", t);
-#endif					
+#endif
 
 #if ( PROFlevel>=1 )
 	t = SuperLU_timer_();
-#endif	
+#endif
 
 	/* construct the Bcast tree for U ... */
 
@@ -1560,28 +1560,28 @@ if ( !iam) printf(".. Construct Reduce t
 	if ( !(UBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) )
 		ABORT("Malloc fails for UBtree_ptr[].");
 	if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) )
-		ABORT("Calloc fails for ActiveFlag[].");	
+		ABORT("Calloc fails for ActiveFlag[].");
 	if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) )
-		ABORT("Malloc fails for ranks[].");	
+		ABORT("Malloc fails for ranks[].");
 	if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) )
-		ABORT("Malloc fails for SeedSTD_BC[].");	
+		ABORT("Malloc fails for SeedSTD_BC[].");
 
 	for (i=0;i<k;i++){
-		SeedSTD_BC[i]=rand();		
+		SeedSTD_BC[i]=rand();
 	}
 
-	MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_BC[0],k,MPI_DOUBLE,MPI_MAX,grid->cscp.comm);					  
+	MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_BC[0],k,MPI_DOUBLE,MPI_MAX,grid->cscp.comm);
 
 
 	for (ljb = 0; ljb <k ; ++ljb) {
 		UBtree_ptr[ljb]=NULL;
-	}	
+	}
 
 	if ( !(ActiveFlagAll = intMalloc_dist(grid->nprow*k)) )
-		ABORT("Calloc fails for ActiveFlagAll[].");				
-	for (j=0;j<grid->nprow*k;++j)ActiveFlagAll[j]=-3*nsupers;	
-	memTRS += k*sizeof(BcTree) + k*dword + grid->nprow*k*iword;  //acount for UBtree_ptr, SeedSTD_BC, ActiveFlagAll	
-	
+		ABORT("Calloc fails for ActiveFlagAll[].");
+	for (j=0;j<grid->nprow*k;++j)ActiveFlagAll[j]=-3*nsupers;
+	memTRS += k*sizeof(BcTree) + k*dword + grid->nprow*k*iword;  //acount for UBtree_ptr, SeedSTD_BC, ActiveFlagAll
+
 	for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */
 		jb = mycol+ljb*grid->npcol;  /* not sure */
 		if(jb<nsupers){
@@ -1598,21 +1598,21 @@ if ( !iam) printf(".. Construct Reduce t
 				pr = PROW( gb, grid );
 				ActiveFlagAll[pr+ljb*grid->nprow]=SUPERLU_MAX(ActiveFlagAll[pr+ljb*grid->nprow],gb);
 			// printf("gb:%5d jb: %5d nsupers: %5d\n",gb,jb,nsupers);
-			// fflush(stdout);								
+			// fflush(stdout);
 				//if(gb==jb)Root=pr;
 			}
-			
-			
+
+
 		}
 		pr = PROW( jb, grid ); // take care of diagonal node stored as L
 		// printf("jb %5d current: %5d",jb,ActiveFlagAll[pr+ljb*grid->nprow]);
 		// fflush(stdout);
-		ActiveFlagAll[pr+ljb*grid->nprow]=SUPERLU_MAX(ActiveFlagAll[pr+ljb*grid->nprow],jb);	
+		ActiveFlagAll[pr+ljb*grid->nprow]=SUPERLU_MAX(ActiveFlagAll[pr+ljb*grid->nprow],jb);
 		}
-	}	
-		
-		
-		
+	}
+
+
+
 	for (ljb = 0; ljb < k; ++ljb) { /* for each block column ... */
 		jb = mycol+ljb*grid->npcol;  /* not sure */
 		if(jb<nsupers){
@@ -1623,18 +1623,18 @@ if ( !iam) printf(".. Construct Reduce t
 		for (j=0;j<grid->nprow;++j)ActiveFlag[j+grid->nprow]=j;
 		for (j=0;j<grid->nprow;++j)ranks[j]=-1;
 
-		Root=-1; 
-		Iactive = 0;				
+		Root=-1;
+		Iactive = 0;
 		for (j=0;j<grid->nprow;++j){
 			if(ActiveFlag[j]!=-3*nsupers){
 			gb = ActiveFlag[j];
 			pr = PROW( gb, grid );
 			if(gb==jb)Root=pr;
-			if(myrow==pr)Iactive=1;		
+			if(myrow==pr)Iactive=1;
 			}
-		}						
-		
-		quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,1,2);	
+		}
+
+		quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,1,2);
 	// printf("jb: %5d Iactive %5d\n",jb,Iactive);
 	// fflush(stdout);
 		if(Iactive==1){
@@ -1648,7 +1648,7 @@ if ( !iam) printf(".. Construct Reduce t
 					ranks[rank_cnt]=ActiveFlag[j+grid->nprow];
 					++rank_cnt;
 				}
-			}		
+			}
 	// printf("jb: %5d rank_cnt %5d\n",jb,rank_cnt);
 	// fflush(stdout);
 			if(rank_cnt>1){
@@ -1658,43 +1658,43 @@ if ( !iam) printf(".. Construct Reduce t
 				// rseed=rand();
 				// rseed=1.0;
 				msgsize = SuperSize( jb );
-				UBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'d');  	
+				UBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'d');
 				BcTree_SetTag(UBtree_ptr[ljb],BC_U,'d');
 
 				// printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt);
 				// fflush(stdout);
-				
+
 				if(Root==myrow){
 				rank_cnt_ref=1;
 				for (j = 0; j < grid->nprow; ++j) {
 					// printf("ljb %5d j %5d nprow %5d\n",ljb,j,grid->nprow);
 					// fflush(stdout);
-					if ( bsendx_plist[ljb][j] != EMPTY ) {	
-						++rank_cnt_ref;		
+					if ( bsendx_plist[ljb][j] != EMPTY ) {
+						++rank_cnt_ref;
 					}
 				}
 				// printf("ljb %5d rank_cnt %5d rank_cnt_ref %5d\n",ljb,rank_cnt,rank_cnt_ref);
-				// fflush(stdout);								
-				assert(rank_cnt==rank_cnt_ref);		
-				}						
+				// fflush(stdout);
+				assert(rank_cnt==rank_cnt_ref);
+				}
 			}
 		}
 		}
-	}	
+	}
 	SUPERLU_FREE(ActiveFlag);
 	SUPERLU_FREE(ActiveFlagAll);
-	SUPERLU_FREE(ranks);				
-	SUPERLU_FREE(SeedSTD_BC);				
+	SUPERLU_FREE(ranks);
+	SUPERLU_FREE(SeedSTD_BC);
 	memTRS -= k*dword + grid->nprow*k*iword;  //acount for SeedSTD_BC, ActiveFlagAll
-	
+
 #if ( PROFlevel>=1 )
 t = SuperLU_timer_() - t;
 if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t);
-#endif					
+#endif
 
 #if ( PROFlevel>=1 )
 		t = SuperLU_timer_();
-#endif					
+#endif
 	/* construct the Reduce tree for U ... */
 	/* the following is used as reference */
 	nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */
@@ -1723,46 +1723,46 @@ if ( !iam) printf(".. Construct Bcast tr
 	if ( !(URtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) )
 		ABORT("Malloc fails for URtree_ptr[].");
 	if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) )
-		ABORT("Calloc fails for ActiveFlag[].");	
+		ABORT("Calloc fails for ActiveFlag[].");
 	if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) )
-		ABORT("Malloc fails for ranks[].");	
+		ABORT("Malloc fails for ranks[].");
 
 	// if ( !(idxs = intCalloc_dist(nsupers)) )
-		// ABORT("Calloc fails for idxs[].");	
+		// ABORT("Calloc fails for idxs[].");
 
 	// if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) )
 		// ABORT("Malloc fails for nzrows[].");
 
 	if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) )
-		ABORT("Malloc fails for SeedSTD_RD[].");	
+		ABORT("Malloc fails for SeedSTD_RD[].");
 
 	for (i=0;i<k;i++){
-		SeedSTD_RD[i]=rand();		
+		SeedSTD_RD[i]=rand();
 	}
 
-	MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_RD[0],k,MPI_DOUBLE,MPI_MAX,grid->rscp.comm);					  
+	MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_RD[0],k,MPI_DOUBLE,MPI_MAX,grid->rscp.comm);
 
 
 	// for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */
 		// fsupc = FstBlockC( jb );
-		// len=0;  
+		// len=0;
 		// for (j = fsupc; j < FstBlockC( jb+1 ); ++j) {
 			// istart = xusub[j];
 			// /* NOTE: Only the first nonzero index of the segment
 			   // is stored in usub[]. */
-			// len +=  xusub[j+1] - xusub[j];  
-		// }	
-				
+			// len +=  xusub[j+1] - xusub[j];
+		// }
+
 		// idxs[jb] = len-1;
 
 		// if(len>0){
 			// if ( !(nzrows[jb] = intMalloc_dist(len)) )
 				// ABORT("Malloc fails for nzrows[jb]");
-			
+
 			// fsupc = FstBlockC( jb );
-			
-			// len=0; 
-			
+
+			// len=0;
+
 			// for (j = fsupc; j < FstBlockC( jb+1 ); ++j) {
 				// istart = xusub[j];
 				// /* NOTE: Only the first nonzero index of the segment
@@ -1772,29 +1772,29 @@ if ( !iam) printf(".. Construct Bcast tr
 					// nzrows[jb][len]=irow;
 					// len++;
 				// }
-			// }	
+			// }
 			// quickSort(nzrows[jb],0,len-1,0);
 		// }
 		// else{
 			// nzrows[jb] = NULL;
 		// }
 	// }
-	
+
 
 	for (lib = 0; lib <k ; ++lib) {
 		URtree_ptr[lib]=NULL;
 	}
 
-	
+
 	if ( !(ActiveFlagAll = intMalloc_dist(grid->npcol*k)) )
-		ABORT("Calloc fails for ActiveFlagAll[].");				
-	for (j=0;j<grid->npcol*k;++j)ActiveFlagAll[j]=3*nsupers;	
+		ABORT("Calloc fails for ActiveFlagAll[].");
+	for (j=0;j<grid->npcol*k;++j)ActiveFlagAll[j]=3*nsupers;
 	memTRS += k*sizeof(RdTree) + k*dword + grid->npcol*k*iword;  //acount for URtree_ptr, SeedSTD_RD, ActiveFlagAll
-	
+
 	for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */
 		fsupc = FstBlockC( jb );
 		pc = PCOL( jb, grid );
-		
+
 		fsupc = FstBlockC( jb );
 		for (j = fsupc; j < FstBlockC( jb+1 ); ++j) {
 			istart = xusub[j];
@@ -1807,17 +1807,17 @@ if ( !iam) printf(".. Construct Bcast tr
 				if ( myrow == pr ) { /* Block row ib in my process row */
 					lib = LBi( ib, grid ); /* Local block number */
 					ActiveFlagAll[pc+lib*grid->npcol]=SUPERLU_MIN(ActiveFlagAll[pc+lib*grid->npcol],jb);
-				}						
+				}
 			}
 		}
-		
+
 		pr = PROW( jb, grid );
 		if ( myrow == pr ) { /* Block row ib in my process row */
 			lib = LBi( jb, grid ); /* Local block number */
 			ActiveFlagAll[pc+lib*grid->npcol]=SUPERLU_MIN(ActiveFlagAll[pc+lib*grid->npcol],jb);
-		}					
+		}
 	}
-		
+
 
 	for (lib=0;lib<k;++lib){
 		ib = myrow+lib*grid->nprow;  /* not sure */
@@ -1826,18 +1826,18 @@ if ( !iam) printf(".. Construct Bcast tr
 			for (j=0;j<grid->npcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];
 			for (j=0;j<grid->npcol;++j)ActiveFlag[j+grid->npcol]=j;
 			for (j=0;j<grid->npcol;++j)ranks[j]=-1;
-			Root=-1; 
-			Iactive = 0;				
+			Root=-1;
+			Iactive = 0;
 
 			for (j=0;j<grid->npcol;++j){
 				if(ActiveFlag[j]!=3*nsupers){
 				jb = ActiveFlag[j];
 				pc = PCOL( jb, grid );
 				if(jb==ib)Root=pc;
-				if(mycol==pc)Iactive=1;		
-				}					
+				if(mycol==pc)Iactive=1;
+				}
 			}
-			
+
 			quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,0,2);
 
 			if(Iactive==1){
@@ -1853,7 +1853,7 @@ if ( !iam) printf(".. Construct Bcast tr
 				if(rank_cnt>1){
 
 					for (ii=0;ii<rank_cnt;ii++)   // use global ranks rather than local ranks
-						ranks[ii] = PNUM( pr, ranks[ii], grid );		
+						ranks[ii] = PNUM( pr, ranks[ii], grid );
 
 					// rseed=rand();
 					// rseed=1.0;
@@ -1861,7 +1861,7 @@ if ( !iam) printf(".. Construct Bcast tr
 
 					// if(ib==0){
 
-					URtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'d');  	
+					URtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'d');
 					RdTree_SetTag(URtree_ptr[lib], RD_U,'d');
 					// }
 
@@ -1875,10 +1875,10 @@ if ( !iam) printf(".. Construct Bcast tr
 					// // for(j=0;j<rank_cnt;++j)printf("%4d",ranks[j]);
 					// printf("\n");
 					}
-					// #endif		
+					// #endif
 				}
 			}
-		}						
+		}
 	}
 	SUPERLU_FREE(mod_bit);
 	SUPERLU_FREE(brecv);
@@ -1886,26 +1886,26 @@ if ( !iam) printf(".. Construct Bcast tr
 
 	SUPERLU_FREE(ActiveFlag);
 	SUPERLU_FREE(ActiveFlagAll);
-	SUPERLU_FREE(ranks);	
-	// SUPERLU_FREE(idxs);	
-	SUPERLU_FREE(SeedSTD_RD);	
+	SUPERLU_FREE(ranks);
+	// SUPERLU_FREE(idxs);
+	SUPERLU_FREE(SeedSTD_RD);
 	// for(i=0;i<nsupers;++i){
 		// if(nzrows[i])SUPERLU_FREE(nzrows[i]);
 	// }
-	// SUPERLU_FREE(nzrows);				
-		
-	memTRS -= k*dword + grid->nprow*k*iword;  //acount for SeedSTD_RD, ActiveFlagAll			
-		
+	// SUPERLU_FREE(nzrows);
+
+	memTRS -= k*dword + grid->nprow*k*iword;  //acount for SeedSTD_RD, ActiveFlagAll
+
 #if ( PROFlevel>=1 )
 t = SuperLU_timer_() - t;
 if ( !iam) printf(".. Construct Reduce tree for U: %.2f\t\n", t);
-#endif						
-		
+#endif
+
 	////////////////////////////////////////////////////////
 
-	
+
 	Llu->Lrowind_bc_ptr = Lrowind_bc_ptr;
-	Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr;  
+	Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr;
 	Llu->Lnzval_bc_ptr = Lnzval_bc_ptr;
 	Llu->Ufstnz_br_ptr = Ufstnz_br_ptr;
 	Llu->Unzval_br_ptr = Unzval_br_ptr;
@@ -1923,16 +1923,16 @@ if ( !iam) printf(".. Construct Reduce t
 	Llu->nbsendx = nbsendx;
 	Llu->ilsum = ilsum;
 	Llu->ldalsum = ldaspa;
-	
+
 	Llu->LRtree_ptr = LRtree_ptr;
 	Llu->LBtree_ptr = LBtree_ptr;
 	Llu->URtree_ptr = URtree_ptr;
 	Llu->UBtree_ptr = UBtree_ptr;
 	Llu->Linv_bc_ptr = Linv_bc_ptr;
-	Llu->Uinv_bc_ptr = Uinv_bc_ptr;	
-	Llu->Urbs = Urbs; 
-	Llu->Ucb_indptr = Ucb_indptr; 
-	Llu->Ucb_valptr = Ucb_valptr; 
+	Llu->Uinv_bc_ptr = Uinv_bc_ptr;
+	Llu->Urbs = Urbs;
+	Llu->Ucb_indptr = Ucb_indptr;
+	Llu->Ucb_valptr = Ucb_valptr;
 
 
 #if ( PRNTlevel>=1 )
@@ -1951,7 +1951,7 @@ if ( !iam) printf(".. Construct Reduce t
 	SUPERLU_FREE(dense);
 
 	/* Find the maximum buffer size. */
-	MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, 
+	MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t,
 		      MPI_MAX, grid->comm);
 
 	k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
@@ -1978,7 +1978,7 @@ if ( !iam) printf(".. Construct Reduce t
        ilsum, fmod, fsendx_plist, bmod, bsendx_plist  */
     CHECK_MALLOC(iam, "Exit pddistribute()");
 #endif
-    
+
     return (mem_use+memTRS);
 
 } /* PDDISTRIBUTE */
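
The dReDistribute_A and pddistribute hunks above lean on SuperLU_DIST's 2D block-cyclic layout macros (BlockNum, PROW, PCOL, PNUM, LBi/LBj, CEILING). The stand-alone sketch below shows the owner computation they perform; owner() is a hypothetical helper, and the mapping assumes the usual definitions from superlu_defs.h (row-major process numbering, block rows and columns dealt cyclically over an nprow x npcol grid). It is an illustration only, not part of the upstream change.

#include <stdio.h>

/* Owner of global block (gbi, gbj), where gbi/gbj are the supernode
 * numbers that BlockNum() returns for a row/column index.  Assumed
 * conventions: PROW(b) = b % nprow, PCOL(b) = b % npcol,
 * PNUM(pr, pc) = pr * npcol + pc. */
static int owner(int gbi, int gbj, int nprow, int npcol)
{
    int pr = gbi % nprow;      /* process row holding block row gbi       */
    int pc = gbj % npcol;      /* process column holding block column gbj */
    return pr * npcol + pc;    /* linear rank in the 2D grid              */
}

int main(void)
{
    /* hypothetical 2 x 3 grid: block (4, 5) sits in process row 0,
     * process column 2, hence rank 2 */
    printf("block (4,5) -> rank %d\n", owner(4, 5, 2, 3));
    return 0;
}
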
diff -pruN 6.1.0+dfsg1-1/SRC/pdGetDiagU.c 6.1.1+dfsg1-1/SRC/pdGetDiagU.c
--- 6.1.0+dfsg1-1/SRC/pdGetDiagU.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/pdGetDiagU.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,15 +1,15 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 /*! @file p@(pre)GetDiagU.c
- * \brief Extracts the main diagonal of matrix U 
+ * \brief Extracts the main diagonal of matrix U
  *
  * <pre>
  * -- Auxiliary routine in distributed SuperLU (version 5.1.0) --
@@ -31,7 +31,7 @@ at the top-level directory.
  * =======
  *
  * GetDiagU extracts the main diagonal of matrix U of the LU factorization.
- *  
+ *
  * Arguments
  * =========
  *
diff -pruN 6.1.0+dfsg1-1/SRC/pdgsequ.c 6.1.1+dfsg1-1/SRC/pdgsequ.c
--- 6.1.0+dfsg1-1/SRC/pdgsequ.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/pdgsequ.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,16 +1,16 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
 
-/*! @file 
+/*! @file
  * \brief Computes row and column scalings
  *
  * File name:	pdgsequ.c
@@ -21,64 +21,64 @@ at the top-level directory.
 
 /*! \brief
 
- <pre>    
-    Purpose   
-    =======   
+ <pre>
+    Purpose
+    =======
 
-    PDGSEQU computes row and column scalings intended to equilibrate an   
+    PDGSEQU computes row and column scalings intended to equilibrate an
     M-by-N sparse matrix A and reduce its condition number. R returns the row
-    scale factors and C the column scale factors, chosen to try to make   
-    the largest element in each row and column of the matrix B with   
-    elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1.   
-
-    R(i) and C(j) are restricted to be between SMLNUM = smallest safe   
-    number and BIGNUM = largest safe number.  Use of these scaling   
-    factors is not guaranteed to reduce the condition number of A but   
-    works well in practice.   
+    scale factors and C the column scale factors, chosen to try to make
+    the largest element in each row and column of the matrix B with
+    elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1.
+
+    R(i) and C(j) are restricted to be between SMLNUM = smallest safe
+    number and BIGNUM = largest safe number.  Use of these scaling
+    factors is not guaranteed to reduce the condition number of A but
+    works well in practice.
 
     See supermatrix.h for the definition of 'SuperMatrix' structure.
- 
-    Arguments   
-    =========   
+
+    Arguments
+    =========
 
     A       (input) SuperMatrix*
             The matrix of dimension (A->nrow, A->ncol) whose equilibration
             factors are to be computed. The type of A can be:
             Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE.
-	    
+
     R       (output) double*, size A->nrow
-            If INFO = 0 or INFO > M, R contains the row scale factors   
+            If INFO = 0 or INFO > M, R contains the row scale factors
             for A.
-	    
+
     C       (output) double*, size A->ncol
             If INFO = 0,  C contains the column scale factors for A.
-	    
+
     ROWCND  (output) double*
-            If INFO = 0 or INFO > M, ROWCND contains the ratio of the   
-            smallest R(i) to the largest R(i).  If ROWCND >= 0.1 and   
-            AMAX is neither too large nor too small, it is not worth   
+            If INFO = 0 or INFO > M, ROWCND contains the ratio of the
+            smallest R(i) to the largest R(i).  If ROWCND >= 0.1 and
+            AMAX is neither too large nor too small, it is not worth
             scaling by R.
-	    
+
     COLCND  (output) double*
-            If INFO = 0, COLCND contains the ratio of the smallest   
-            C(i) to the largest C(i).  If COLCND >= 0.1, it is not   
+            If INFO = 0, COLCND contains the ratio of the smallest
+            C(i) to the largest C(i).  If COLCND >= 0.1, it is not
             worth scaling by C.
-	    
+
     AMAX    (output) double*
-            Absolute value of largest matrix element.  If AMAX is very   
-            close to overflow or very close to underflow, the matrix   
+            Absolute value of largest matrix element.  If AMAX is very
+            close to overflow or very close to underflow, the matrix
             should be scaled.
-	    
+
     INFO    (output) int*
-            = 0:  successful exit   
-            < 0:  if INFO = -i, the i-th argument had an illegal value   
-            > 0:  if INFO = i,  and i is   
-                  <= M:  the i-th row of A is exactly zero   
-                  >  M:  the (i-M)-th column of A is exactly zero   
+            = 0:  successful exit
+            < 0:  if INFO = -i, the i-th argument had an illegal value
+            > 0:  if INFO = i,  and i is
+                  <= M:  the i-th row of A is exactly zero
+                  >  M:  the (i-M)-th column of A is exactly zero
 
     GRID    (input) gridinfo_t*
             The 2D process mesh.
-    ===================================================================== 
+    =====================================================================
 </pre>
 */
 
@@ -98,7 +98,7 @@ pdgsequ(SuperMatrix *A, double *r, doubl
     int *r_sizes, *displs;
     double *loc_r;
     int_t  procs;
-    
+
     /* Test the input parameters. */
     *info = 0;
     if ( A->nrow < 0 || A->ncol < 0 ||
@@ -121,7 +121,7 @@ pdgsequ(SuperMatrix *A, double *r, doubl
     Astore = A->Store;
     Aval = Astore->nzval;
     m_loc = Astore->m_loc;
-    
+
     /* Get machine constants. */
     smlnum = dmach_dist("S");
     bignum = 1. / smlnum;
@@ -144,13 +144,13 @@ pdgsequ(SuperMatrix *A, double *r, doubl
 	rcmax = SUPERLU_MAX(rcmax, r[i]);
 	rcmin = SUPERLU_MIN(rcmin, r[i]);
     }
-  
+
     /* Get the global MAX and MIN for R */
     tempmax = rcmax;
     tempmin = rcmin;
-    MPI_Allreduce( &tempmax, &rcmax, 
+    MPI_Allreduce( &tempmax, &rcmax,
 		1, MPI_DOUBLE, MPI_MAX, grid->comm);
-    MPI_Allreduce( &tempmin, &rcmin, 
+    MPI_Allreduce( &tempmin, &rcmin,
 		1, MPI_DOUBLE, MPI_MIN, grid->comm);
 
     *amax = rcmax;
@@ -227,7 +227,7 @@ pdgsequ(SuperMatrix *A, double *r, doubl
 
     /* First gather the size of each piece. */
     MPI_Allgather(&m_loc, 1, MPI_INT, r_sizes, 1, MPI_INT, grid->comm);
-      
+
     /* Set up the displacements for allgatherv */
     displs[0] = 0;
     for (i = 1; i < procs; ++i) displs[i] = displs[i-1] + r_sizes[i-1];
@@ -235,7 +235,7 @@ pdgsequ(SuperMatrix *A, double *r, doubl
     /* Now gather the actual data */
     MPI_Allgatherv(loc_r, m_loc, MPI_DOUBLE, r, r_sizes, displs,
                 MPI_DOUBLE, grid->comm);
-      
+
     SUPERLU_FREE(r_sizes);
     SUPERLU_FREE(loc_r);
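
The Purpose block above specifies what pdgsequ computes: row factors R(i) and column factors C(j) such that R(i)*A(i,j)*C(j) has magnitude at most 1 in every row and column. A serial, dense illustration of that recipe follows; gsequ_dense() is an illustrative name, and the SMLNUM/BIGNUM clamping, the zero row/column INFO codes, and the MPI_Allreduce/MPI_Allgatherv steps of the distributed routine are omitted.

#include <math.h>
#include <stdio.h>

/* r[i] = 1 / max_j |a(i,j)|, then c[j] = 1 / max_i r[i]*|a(i,j)|,
 * so that r[i]*a(i,j)*c[j] never exceeds 1 in magnitude. */
static void gsequ_dense(int m, int n, const double *a, /* m x n, row-major */
                        double *r, double *c)
{
    for (int i = 0; i < m; ++i) {
        double rowmax = 0.0;
        for (int j = 0; j < n; ++j) rowmax = fmax(rowmax, fabs(a[i*n + j]));
        r[i] = rowmax > 0.0 ? 1.0 / rowmax : 1.0;   /* zero row: INFO case */
    }
    for (int j = 0; j < n; ++j) {
        double colmax = 0.0;
        for (int i = 0; i < m; ++i) colmax = fmax(colmax, r[i] * fabs(a[i*n + j]));
        c[j] = colmax > 0.0 ? 1.0 / colmax : 1.0;   /* zero column: INFO case */
    }
}

int main(void)
{
    double a[2*2] = { 4.0,  0.5,
                      0.25, 2.0 };
    double r[2], c[2];
    gsequ_dense(2, 2, a, r, c);
    printf("r = [%g %g], c = [%g %g]\n", r[0], r[1], c[0], c[1]);
    return 0;
}

On this toy matrix the row maxima are 4 and 2, so r = [0.25, 0.5]; the scaled columns then already peak at 1, so c = [1, 1].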
 
diff -pruN 6.1.0+dfsg1-1/SRC/pdgsmv_AXglobal.c 6.1.1+dfsg1-1/SRC/pdgsmv_AXglobal.c
--- 6.1.0+dfsg1-1/SRC/pdgsmv_AXglobal.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/pdgsmv_AXglobal.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,16 +1,16 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
 
-/*! @file 
+/*! @file
  * \brief Performs sparse matrix-vector multiplication
  *
  * <pre>
@@ -97,14 +97,14 @@ int pdgsmv_AXglobal_setup
 		    mv_sup_to_proc[i] = p;
 #if ( DEBUGlevel>=3 )
 		    if ( mv_sup_to_proc[i] == p-1 ) {
-			fprintf(stderr, 
+			fprintf(stderr,
 				"mv_sup_to_proc conflicts at supno %d\n", i);
 			exit(-1);
 		    }
 #endif
 		}
 	    }
-	    
+
 	    if ( iam == p ) {
 		N_update = t1;
 		if ( N_update ) {
@@ -163,7 +163,7 @@ int pdgsmv_AXglobal_setup
  *    val[m]        = not used
  *    val[ki]       = A(k, bindx[ki]), where ks <= ki <= ke
  * Both arrays are of length nnz + 1.
- * </pre> 
+ * </pre>
 */
 static void dcreate_msr_matrix
 (
@@ -181,7 +181,7 @@ static void dcreate_msr_matrix
     double *nzval;
     int_t *rowcnt;
     double zero = 0.0;
-    
+
     if ( !N_update ) return;
 
     n = A->ncol;
@@ -274,7 +274,7 @@ pdgsmv_AXglobal(int_t m, int_t update[],
     }
     return 0;
 } /* PDGSMV_AXglobal */
- 
+
 /*
  * Performs sparse matrix-vector multiplication.
  *   - val/bindx stores the distributed MSR matrix A
@@ -297,7 +297,7 @@ pdgsmv_AXglobal_abs(int_t m, int_t updat
 	}
 	ax[i] += fabs(val[i]) * fabs(X[update[i]]); /* diagonal */
     }
-    
+
     return 0;
 } /* PDGSMV_AXglobal_ABS */
 
diff -pruN 6.1.0+dfsg1-1/SRC/pdgsmv.c 6.1.1+dfsg1-1/SRC/pdgsmv.c
--- 6.1.0+dfsg1-1/SRC/pdgsmv.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/pdgsmv.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,16 +1,16 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
 
-/*! @file 
+/*! @file
  * \brief  Parallel sparse matrix-vector multiplication
  *
  * <pre>
@@ -144,7 +144,7 @@ void pdgsmv_init
 	    }
 	}
     }
-    
+
     /* ------------------------------------------------------------
        TRANSFORM THE COLUMN INDICES OF MATRIX A INTO LOCAL INDICES.
        THIS ACCOUNTS FOR THE THIRD PASS OF ACCESSING MATRIX A.
@@ -212,7 +212,7 @@ void pdgsmv_init
     gsmv_comm->val_torecv = val_torecv;
     gsmv_comm->TotalIndSend = TotalIndSend;
     gsmv_comm->TotalValSend = TotalValSend;
-    
+
     SUPERLU_FREE(spa);
     SUPERLU_FREE(send_req);
 
@@ -311,7 +311,7 @@ pdgsmv
                       grid->comm, &recv_req[p]);
 	}
     }
-    
+
     /* ------------------------------------------------------------
        PERFORM THE ACTUAL MULTIPLICATION.
        ------------------------------------------------------------*/
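
pdgsmv_AXglobal.c above documents the MSR (modified sparse row) layout it multiplies with: val[i] holds the diagonal of local row i, val[m] is unused, and val[ki] = A(k, bindx[ki]) for the off-diagonal entries of row k. The serial sketch below (msr_matvec() is an illustrative name) computes y = A*x on that layout, assuming the standard MSR convention that row i's off-diagonals occupy positions bindx[i] through bindx[i+1]-1 and that x is the replicated global vector of the AXglobal variant.

#include <stdio.h>

/* y = A*x in MSR form: local row i corresponds to global row update[i];
 * val[i] is its diagonal entry, and val[k]/bindx[k] for
 * bindx[i] <= k < bindx[i+1] are its off-diagonal values and their
 * global column indices.  val[m] is unused. */
static void msr_matvec(int m, const int *update, const double *val,
                       const int *bindx, const double *x, double *y)
{
    for (int i = 0; i < m; ++i) {
        double s = val[i] * x[update[i]];            /* diagonal term */
        for (int k = bindx[i]; k < bindx[i+1]; ++k)  /* off-diagonals */
            s += val[k] * x[bindx[k]];
        y[i] = s;
    }
}

int main(void)
{
    /* hypothetical 2x2 matrix [[2 1] [3 4]] owned entirely by one process */
    int    update[2] = { 0, 1 };            /* global rows handled locally    */
    int    bindx[5]  = { 3, 4, 5, 1, 0 };   /* row pointers, then col indices */
    double val[5]    = { 2.0, 4.0, 0.0,     /* diagonals, unused val[m]       */
                         1.0, 3.0 };        /* off-diagonal values            */
    double x[2] = { 1.0, 1.0 }, y[2];

    msr_matvec(2, update, val, bindx, x, y);
    printf("y = [%g %g]\n", y[0], y[1]);    /* expect [3 7] */
    return 0;
}
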
diff -pruN 6.1.0+dfsg1-1/SRC/pdgsrfs_ABXglobal.c 6.1.1+dfsg1-1/SRC/pdgsrfs_ABXglobal.c
--- 6.1.0+dfsg1-1/SRC/pdgsrfs_ABXglobal.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/pdgsrfs_ABXglobal.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,16 +1,16 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
 
-/*! @file 
+/*! @file
  * \brief Improves the computed solution and provides error bounds
  *
  * <pre>
@@ -39,9 +39,9 @@ static void redist_all_to_diag(int_t, do
  * Purpose
  * =======
  *
- * pdgsrfs_ABXglobal improves the computed solution to a system of linear   
+ * pdgsrfs_ABXglobal improves the computed solution to a system of linear
  * equations and provides error bounds and backward error estimates
- * for the solution. 
+ * for the solution.
  *
  * Arguments
  * =========
@@ -79,7 +79,7 @@ static void redist_all_to_diag(int_t, do
  * B      (input) double* (global)
  *        The N-by-NRHS right-hand side matrix of the possibly equilibrated
  *        and row permuted system.
- *       
+ *
  *        NOTE: Currently, B must reside on all processes when calling
  *              this routine.
  *
@@ -102,8 +102,8 @@ static void redist_all_to_diag(int_t, do
  *        Number of right-hand sides.
  *
  * berr   (output) double*, dimension (nrhs)
- *         The componentwise relative backward error of each solution   
- *         vector X(j) (i.e., the smallest relative change in   
+ *         The componentwise relative backward error of each solution
+ *         vector X(j) (i.e., the smallest relative change in
  *         any element of A or B that makes X(j) an exact solution).
  *
  * stat   (output) SuperLUStat_t*
@@ -113,11 +113,11 @@ static void redist_all_to_diag(int_t, do
  * info   (output) int*
  *        = 0: successful exit
  *        < 0: if info = -i, the i-th argument had an illegal value
- *        
- * Internal Parameters   
- * ===================   
  *
- * ITMAX is the maximum number of steps of iterative refinement.   
+ * Internal Parameters
+ * ===================
+ *
+ * ITMAX is the maximum number of steps of iterative refinement.
  * </pre>
  */
 
@@ -129,14 +129,14 @@ pdgsrfs_ABXglobal(int_t n, SuperMatrix *
 
 
 #define ITMAX 20
-    
+
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
     LocalLU_t *Llu = LUstruct->Llu;
-    /* 
+    /*
      * Data structures used by matrix-vector multiply routine.
      */
     int_t  N_update; /* Number of variables updated on this process */
-    int_t  *update;  /* vector elements (global index) updated 
+    int_t  *update;  /* vector elements (global index) updated
 			on this processor.                     */
     int_t  *bindx;
     double *val;
@@ -159,7 +159,7 @@ pdgsrfs_ABXglobal(int_t n, SuperMatrix *
     /*-- Function prototypes --*/
     extern void pdgstrs1(int_t, LUstruct_t *, gridinfo_t *,
 			 double *, int, SuperLUStat_t *, int *);
-    
+
     /* Test the input parameters. */
     *info = 0;
     if ( n < 0 ) *info = -1;
@@ -283,19 +283,19 @@ pdgsrfs_ABXglobal(int_t n, SuperMatrix *
 
 	while (1) { /* Loop until stopping criterion is satisfied. */
 
-	    /* Compute residual R = B - op(A) * X,   
+	    /* Compute residual R = B - op(A) * X,
 	       where op(A) = A, A**T, or A**H, depending on TRANS. */
 
 	    /* Matrix-vector multiply. */
 	    pdgsmv_AXglobal(N_update, update, val, bindx, X_col, ax);
-	    
+
 	    /* Compute residual. */
 	    for (i = 0; i < N_update; ++i) R[i] = b[i] - ax[i];
 
 	    /* Compute abs(op(A))*abs(X) + abs(B). */
 	    pdgsmv_AXglobal_abs(N_update, update, val, bindx, X_col, temp);
 	    for (i = 0; i < N_update; ++i) temp[i] += fabs(b[i]);
-	    
+
 	    s = 0.0;
 	    for (i = 0; i < N_update; ++i) {
 		if ( temp[i] > safe2 ) {
@@ -309,7 +309,7 @@ pdgsrfs_ABXglobal(int_t n, SuperMatrix *
                    we know the true residual also must be exactly 0.0. */
 	    }
 	    MPI_Allreduce( &s, &berr[j], 1, MPI_DOUBLE, MPI_MAX, grid->comm );
-		
+
 #if ( PRNTlevel>= 1 )
 	    if ( !iam )
 		printf("(%2d) .. Step " IFMT ": berr[j] = %e\n", iam, count, berr[j]);
@@ -321,7 +321,7 @@ pdgsrfs_ABXglobal(int_t n, SuperMatrix *
 		pdgstrs1(n, LUstruct, grid, dx_trs, 1, stat, info);
 
 		/* Update solution. */
-		for (p = 0; p < num_diag_procs; ++p) 
+		for (p = 0; p < num_diag_procs; ++p)
 		    if ( iam == diag_procs[p] )
 			for (k = p; k < nsupers; k += num_diag_procs) {
 			    lk = LBi( k, grid );
@@ -334,7 +334,7 @@ pdgsrfs_ABXglobal(int_t n, SuperMatrix *
 		++count;
 		/* Transfer x_trs (on diagonal processes) into X
 		   (on all processes). */
-		gather_1rhs_diag_to_all(n, x_trs, Glu_persist, Llu, grid, 
+		gather_1rhs_diag_to_all(n, x_trs, Glu_persist, Llu, grid,
 					num_diag_procs, diag_procs, diag_len,
 					X_col, temp);
 	    } else {
@@ -381,7 +381,7 @@ redist_all_to_diag(int_t n, double r[],
     int_t *ilsum, *xsup;
     int iam, knsupc, psrc, pkk;
     MPI_Status status;
-    
+
     iam = grid->iam;
     nsupers = Glu_persist->supno[n-1] + 1;
     xsup = Glu_persist->xsup;
@@ -430,7 +430,7 @@ gather_1rhs_diag_to_all(int_t n, double
     int_t i, ii, k, lk, lwork, nsupers, p;
     int_t *ilsum, *xsup;
     int iam, knsupc, pkk;
-    
+
     iam = grid->iam;
     nsupers = Glu_persist->supno[n-1] + 1;
     xsup = Glu_persist->xsup;
diff -pruN 6.1.0+dfsg1-1/SRC/pdgsrfs.c 6.1.1+dfsg1-1/SRC/pdgsrfs.c
--- 6.1.0+dfsg1-1/SRC/pdgsrfs.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/pdgsrfs.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,16 +1,16 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
 
-/*! @file 
+/*! @file
  * \brief Improves the computed solution to a system of linear equations and provides error bounds and backward error estimates
  *
  * <pre>
@@ -26,15 +26,15 @@ at the top-level directory.
 #include <math.h>
 #include "superlu_ddefs.h"
 
-/*! \brief 
+/*! \brief
  *
  * <pre>
  * Purpose
  * =======
  *
- * PDGSRFS improves the computed solution to a system of linear   
+ * PDGSRFS improves the computed solution to a system of linear
  * equations and provides error bounds and backward error estimates
- * for the solution. 
+ * for the solution.
  *
  * Arguments
  * =========
@@ -72,7 +72,7 @@ at the top-level directory.
  * B      (input) double* (local)
  *        The m_loc-by-NRHS right-hand side matrix of the possibly
  *        equilibrated system. That is, B may be overwritten by diag(R)*B.
- *       
+ *
  * ldb    (input) int (local)
  *        Leading dimension of matrix B.
  *
@@ -98,8 +98,8 @@ at the top-level directory.
  *        solution phase.
  *
  * berr   (output) double*, dimension (nrhs)
- *         The componentwise relative backward error of each solution   
- *         vector X(j) (i.e., the smallest relative change in   
+ *         The componentwise relative backward error of each solution
+ *         vector X(j) (i.e., the smallest relative change in
  *         any element of A or B that makes X(j) an exact solution).
  *
  * stat   (output) SuperLUStat_t*
@@ -109,22 +109,22 @@ at the top-level directory.
  * info   (output) int*
  *        = 0: successful exit
  *        < 0: if info = -i, the i-th argument had an illegal value
- *        
- * Internal Parameters   
- * ===================   
  *
- * ITMAX is the maximum number of steps of iterative refinement.   
+ * Internal Parameters
+ * ===================
+ *
+ * ITMAX is the maximum number of steps of iterative refinement.
  * </pre>
  */
 void
 pdgsrfs(int_t n, SuperMatrix *A, double anorm, LUstruct_t *LUstruct,
 	ScalePermstruct_t *ScalePermstruct, gridinfo_t *grid,
-	double *B, int_t ldb, double *X, int_t ldx, int nrhs, 
+	double *B, int_t ldb, double *X, int_t ldx, int nrhs,
 	SOLVEstruct_t *SOLVEstruct,
 	double *berr, SuperLUStat_t *stat, int *info)
 {
 #define ITMAX 20
-    
+
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
     LocalLU_t *Llu = LUstruct->Llu;
     double *ax, *R, *dx, *temp, *work, *B_col, *X_col;
@@ -200,19 +200,19 @@ pdgsrfs(int_t n, SuperMatrix *A, double
 
 	while (1) { /* Loop until stopping criterion is satisfied. */
 
-	    /* Compute residual R = B - op(A) * X,   
+	    /* Compute residual R = B - op(A) * X,
 	       where op(A) = A, A**T, or A**H, depending on TRANS. */
 
 	    /* Matrix-vector multiply. */
 	    pdgsmv(0, A, grid, gsmv_comm, X_col, ax);
-	    
+
 	    /* Compute residual, stored in R[]. */
 	    for (i = 0; i < m_loc; ++i) R[i] = B_col[i] - ax[i];
 
 	    /* Compute abs(op(A))*abs(X) + abs(B), stored in temp[]. */
 	    pdgsmv(1, A, grid, gsmv_comm, X_col, temp);
 	    for (i = 0; i < m_loc; ++i) temp[i] += fabs(B_col[i]);
-	    
+
 	    s = 0.0;
 	    for (i = 0; i < m_loc; ++i) {
 		if ( temp[i] > safe2 ) {
@@ -226,7 +226,7 @@ pdgsrfs(int_t n, SuperMatrix *A, double
                    we know the true residual also must be exactly 0.0. */
 	    }
 	    MPI_Allreduce( &s, &berr[j], 1, MPI_DOUBLE, MPI_MAX, grid->comm );
-		
+
 #if ( PRNTlevel>= 1 )
 	    if ( !iam )
 		printf("(%2d) .. Step " IFMT ": berr[j] = %e\n", iam, count, berr[j]);
@@ -234,7 +234,7 @@ pdgsrfs(int_t n, SuperMatrix *A, double
 	    if ( berr[j] > eps && berr[j] * 2 <= lstres && count < ITMAX ) {
 		/* Compute new dx. */
 		pdgstrs(n, LUstruct, ScalePermstruct, grid,
-			dx, m_loc, fst_row, m_loc, 1, 
+			dx, m_loc, fst_row, m_loc, 1,
 			SOLVEstruct, stat, info);
 
 		/* Update solution. */
diff -pruN 6.1.0+dfsg1-1/SRC/pdgssvx_ABglobal.c 6.1.1+dfsg1-1/SRC/pdgssvx_ABglobal.c
--- 6.1.0+dfsg1-1/SRC/pdgssvx_ABglobal.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/pdgssvx_ABglobal.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,16 +1,16 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
 
-/*! @file 
+/*! @file
  * \brief Solves a system of linear equations A*X=B,
  *
  * <pre>
@@ -50,7 +50,7 @@ at the top-level directory.
  *      -  B, the matrix of right hand sides, and its dimensions ldb and nrhs
  *      -  grid, a structure describing the 2D processor mesh
  *      -  options->IterRefine, which determines whether or not to
- *            improve the accuracy of the computed solution using 
+ *            improve the accuracy of the computed solution using
  *            iterative refinement
  *
  *      On output, B is overwritten with the solution X.
@@ -58,8 +58,8 @@ at the top-level directory.
  *   2. Depending on options->Fact, the user has several options
  *      for solving A*X=B. The standard option is for factoring
  *      A "from scratch". (The other options, described below,
- *      are used when A is sufficiently similar to a previously 
- *      solved problem to save time by reusing part or all of 
+ *      are used when A is sufficiently similar to a previously
+ *      solved problem to save time by reusing part or all of
  *      the previous factorization.)
  *
  *      -  options->Fact = DOFACT: A is factored "from scratch"
@@ -68,7 +68,7 @@ at the top-level directory.
  *
  *      -  A, the input matrix
  *
- *      as well as the following options, which are described in more 
+ *      as well as the following options, which are described in more
  *      detail below:
  *
  *      -  options->Equil,   to specify how to scale the rows and columns
@@ -88,7 +88,7 @@ at the top-level directory.
  *                           (to control numerical stability)
  *
  *      The outputs returned include
- *         
+ *
  *      -  ScalePermstruct,  modified to describe how the input matrix A
  *                           was equilibrated and permuted:
  *         -  ScalePermstruct->DiagScale, indicates whether the rows and/or
@@ -99,17 +99,17 @@ at the top-level directory.
  *         -  ScalePermstruct->perm_c, column permutation vector
  *
  *            (part of ScalePermstruct may also need to be supplied on input,
- *             depending on options->RowPerm and options->ColPerm as described 
+ *             depending on options->RowPerm and options->ColPerm as described
  *             later).
  *
  *      -  A, the input matrix A overwritten by the scaled and permuted matrix
  *                Pc*Pr*diag(R)*A*diag(C)
- *             where 
+ *             where
  *                Pr and Pc are row and columns permutation matrices determined
- *                  by ScalePermstruct->perm_r and ScalePermstruct->perm_c, 
- *                  respectively, and 
+ *                  by ScalePermstruct->perm_r and ScalePermstruct->perm_c,
+ *                  respectively, and
  *                diag(R) and diag(C) are diagonal scaling matrices determined
- *                  by ScalePermstruct->DiagScale, ScalePermstruct->R and 
+ *                  by ScalePermstruct->DiagScale, ScalePermstruct->R and
  *                  ScalePermstruct->C
  *
  *      -  LUstruct, which contains the L and U factorization of A1 where
@@ -121,7 +121,7 @@ at the top-level directory.
  *
  *   3. The second value of options->Fact assumes that a matrix with the same
  *      sparsity pattern as A has already been factored:
- *     
+ *
  *      -  options->Fact = SamePattern: A is factored, assuming that it has
  *            the same nonzero pattern as a previously factored matrix. In this
  *            case the algorithm saves time by reusing the previously computed
@@ -137,14 +137,14 @@ at the top-level directory.
  *
  *      but not options->ColPerm, whose value is ignored. This is because the
  *      previous column permutation from ScalePermstruct->perm_c is used as
- *      input. The user must also supply 
+ *      input. The user must also supply
  *
  *      -  A, the input matrix
  *      -  ScalePermstruct->perm_c, the column permutation
  *      -  LUstruct->etree, the elimination tree
  *
  *      The outputs returned include
- *         
+ *
  *      -  A, the input matrix A overwritten by the scaled and permuted matrix
  *            as described above
  *      -  ScalePermstruct,  modified to describe how the input matrix A was
@@ -172,32 +172,32 @@ at the top-level directory.
  *      This is because the permutations from ScalePermstruct->perm_r and
  *      ScalePermstruct->perm_c are used as input.
  *
- *      The user must also supply 
+ *      The user must also supply
  *
  *      -  A, the input matrix
  *      -  ScalePermstruct->DiagScale, how the previous matrix was row and/or
  *                                     column scaled
  *      -  ScalePermstruct->R, the row scalings of the previous matrix, if any
- *      -  ScalePermstruct->C, the columns scalings of the previous matrix, 
+ *      -  ScalePermstruct->C, the columns scalings of the previous matrix,
  *                             if any
  *      -  ScalePermstruct->perm_r, the row permutation of the previous matrix
- *      -  ScalePermstruct->perm_c, the column permutation of the previous 
+ *      -  ScalePermstruct->perm_c, the column permutation of the previous
  *                                  matrix
  *      -  all of LUstruct, the previously computed information about L and U
  *                (the actual numerical values of L and U stored in
  *                 LUstruct->Llu are ignored)
  *
  *      The outputs returned include
- *         
+ *
  *      -  A, the input matrix A overwritten by the scaled and permuted matrix
  *            as described above
  *      -  ScalePermstruct,  modified to describe how the input matrix A was
- *                           equilibrated 
+ *                           equilibrated
  *                  (thus ScalePermstruct->DiagScale, R and C may be modified)
  *      -  LUstruct, modified to contain the new L and U factors
  *
  *   5. The fourth and last value of options->Fact assumes that A is
- *      identical to a matrix that has already been factored on a previous 
+ *      identical to a matrix that has already been factored on a previous
  *      call, and reuses its entire LU factorization
  *
  *      -  options->Fact = Factored: A is identical to a previously
@@ -205,19 +205,19 @@ at the top-level directory.
  *            can be reused.
  *
  *      In this case all the other options mentioned above are ignored
- *      (options->Equil, options->RowPerm, options->ColPerm, 
+ *      (options->Equil, options->RowPerm, options->ColPerm,
  *       options->ReplaceTinyPivot)
  *
- *      The user must also supply 
+ *      The user must also supply
  *
  *      -  A, the unfactored matrix, only in the case that iterative refinement
- *            is to be done (specifically A must be the output A from 
+ *            is to be done (specifically A must be the output A from
  *            the previous call, so that it has been scaled and permuted)
  *      -  all of ScalePermstruct
  *      -  all of LUstruct, including the actual numerical values of L and U
  *
  *      all of which are unmodified on output.
- *         
+ *
  * Arguments
  * =========
  *
@@ -225,7 +225,7 @@ at the top-level directory.
  *         The structure defines the input parameters to control
  *         how the LU decomposition will be performed.
  *         The following fields should be defined for this structure:
- *         
+ *
  *         o Fact (fact_t)
  *           Specifies whether or not the factored form of the matrix
  *           A is supplied on entry, and if not, how the matrix A should
@@ -235,7 +235,7 @@ at the top-level directory.
  *                 Inputs:  A
  *                          options->Equil, RowPerm, ColPerm, ReplaceTinyPivot
  *                 Outputs: modified A
- *                             (possibly row and/or column scaled and/or 
+ *                             (possibly row and/or column scaled and/or
  *                              permuted)
  *                          all of ScalePermstruct
  *                          all of LUstruct
@@ -243,7 +243,7 @@ at the top-level directory.
  *           = SamePattern: the matrix A will be factorized assuming
  *             that a factorization of a matrix with the same sparsity
  *             pattern was performed prior to this one. Therefore, this
- *             factorization will reuse column permutation vector 
+ *             factorization will reuse column permutation vector
  *             ScalePermstruct->perm_c and the elimination tree
  *             LUstruct->etree
  *                 Inputs:  A
@@ -251,7 +251,7 @@ at the top-level directory.
  *                          ScalePermstruct->perm_c
  *                          LUstruct->etree
  *                 Outputs: modified A
- *                             (possibly row and/or column scaled and/or 
+ *                             (possibly row and/or column scaled and/or
  *                              permuted)
  *                          rest of ScalePermstruct (DiagScale, R, C, perm_r)
  *                          rest of LUstruct (GLU_persist, Llu)
@@ -269,7 +269,7 @@ at the top-level directory.
  *                          all of ScalePermstruct
  *                          all of LUstruct
  *                 Outputs: modified A
- *                             (possibly row and/or column scaled and/or 
+ *                             (possibly row and/or column scaled and/or
  *                              permuted)
  *                          modified LUstruct->Llu
  *           = FACTORED: the matrix A is already factored.
@@ -298,17 +298,17 @@ at the top-level directory.
  *                        off-diagonal.
  *           = MY_PERMR:  use the ordering given in ScalePermstruct->perm_r
  *                        input by the user.
- *           
+ *
  *         o ColPerm (colperm_t)
  *           Specifies what type of column permutation to use to reduce fill.
  *           = NATURAL:       natural ordering.
  *           = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A.
  *           = MMD_ATA:       minimum degree ordering on structure of A'*A.
  *           = MY_PERMC:      the ordering given in ScalePermstruct->perm_c.
- *         
+ *
  *         o ReplaceTinyPivot (yes_no_t)
  *           = NO:  do not modify pivots
- *           = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during 
+ *           = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during
  *                  LU factorization.
  *
  *         o IterRefine (IterRefine_t)
@@ -355,7 +355,7 @@ at the top-level directory.
  *                      diag(R).
  *           = COL:     Column equilibration, i.e., A was postmultiplied
  *                      by diag(C).
- *           = BOTH:    both row and column equilibration, i.e., A was 
+ *           = BOTH:    both row and column equilibration, i.e., A was
  *                      replaced by diag(R)*A*diag(C).
  *           If options->Fact = FACTORED or SamePattern_SameRowPerm,
  *           DiagScale is an input argument; otherwise it is an output
@@ -369,8 +369,8 @@ at the top-level directory.
  *           input argument; otherwise it is an output argument.
  *
  *         o perm_c (int*)
- *           Column permutation vector, which defines the 
- *           permutation matrix Pc; perm_c[i] = j means column i of A is 
+ *           Column permutation vector, which defines the
+ *           permutation matrix Pc; perm_c[i] = j means column i of A is
  *           in position j in A*Pc.
  *           If options->ColPerm = MY_PERMC or options->Fact = SamePattern
  *           or options->Fact = SamePattern_SameRowPerm, perm_c is an
@@ -382,7 +382,7 @@ at the top-level directory.
  *
  *         o R (double*) dimension (A->nrow)
  *           The row scale factors for A.
- *           If DiagScale = ROW or BOTH, A is multiplied on the left by 
+ *           If DiagScale = ROW or BOTH, A is multiplied on the left by
  *                          diag(R).
  *           If DiagScale = NOEQUIL or COL, R is not defined.
  *           If options->Fact = FACTORED or SamePattern_SameRowPerm, R is
@@ -390,12 +390,12 @@ at the top-level directory.
  *
  *         o C (double*) dimension (A->ncol)
  *           The column scale factors for A.
- *           If DiagScale = COL or BOTH, A is multiplied on the right by 
+ *           If DiagScale = COL or BOTH, A is multiplied on the right by
  *                          diag(C).
  *           If DiagScale = NOEQUIL or ROW, C is not defined.
  *           If options->Fact = FACTORED or SamePattern_SameRowPerm, C is
  *           an input argument; otherwise, C is an output argument.
- *         
+ *
  * B       (input/output) double*
  *         On entry, the right-hand side matrix of dimension (A->nrow, nrhs).
  *         On exit, the solution matrix if info = 0;
@@ -447,8 +447,8 @@ at the top-level directory.
  *           See superlu_ddefs.h for the definition of 'LocalLU_t'.
  *
  * berr    (output) double*, dimension (nrhs)
- *         The componentwise relative backward error of each solution   
- *         vector X(j) (i.e., the smallest relative change in   
+ *         The componentwise relative backward error of each solution
+ *         vector X(j) (i.e., the smallest relative change in
  *         any element of A or B that makes X(j) an exact solution).
  *
  * stat   (output) SuperLUStat_t*
@@ -469,7 +469,7 @@ at the top-level directory.
  * </pre>
  */
 void
-pdgssvx_ABglobal(superlu_dist_options_t *options, SuperMatrix *A, 
+pdgssvx_ABglobal(superlu_dist_options_t *options, SuperMatrix *A,
 		 ScalePermstruct_t *ScalePermstruct,
 		 double B[], int ldb, int nrhs, gridinfo_t *grid,
 		 LUstruct_t *LUstruct, double *berr,
@@ -486,7 +486,7 @@ pdgssvx_ABglobal(superlu_dist_options_t
 		                 supernodes in L.
           	   (usub, xusub) contains the compressed subscript of
 		                 nonzero segments in U.
-	      If options->Fact != SamePattern_SameRowPerm, they are 
+	      If options->Fact != SamePattern_SameRowPerm, they are
 	      computed by SYMBFACT routine, and then used by DDISTRIBUTE
 	      routine. They will be freed after DDISTRIBUTE routine.
 	      If options->Fact == SamePattern_SameRowPerm, these
@@ -578,12 +578,12 @@ pdgssvx_ABglobal(superlu_dist_options_t
 		ScalePermstruct->R = R;
 		ScalePermstruct->C = C;
 		break;
-	    case ROW: 
+	    case ROW:
 	        if ( !(C = (double *) doubleMalloc_dist(n)) )
 		    ABORT("Malloc fails for C[].");
 		ScalePermstruct->C = C;
 		break;
-	    case COL: 
+	    case COL:
 		if ( !(R = (double *) doubleMalloc_dist(m)) )
 		    ABORT("Malloc fails for R[].");
 		ScalePermstruct->R = R;
@@ -618,7 +618,7 @@ pdgssvx_ABglobal(superlu_dist_options_t
 		    for (i = colptr[j]; i < colptr[j+1]; ++i)
 			a[i] *= C[j];          /* Scale columns. */
 		break;
-	      case BOTH: 
+	      case BOTH:
 		for (j = 0; j < n; ++j) {
 		    for (i = colptr[j]; i < colptr[j+1]; ++i) {
 			irow = rowind[i];
@@ -631,7 +631,7 @@ pdgssvx_ABglobal(superlu_dist_options_t
 	    if ( !iam ) {
 		/* Compute row and column scalings to equilibrate matrix A. */
 		dgsequ_dist(A, R, C, &rowcnd, &colcnd, &amax, &iinfo);
-	    
+
 		MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm );
 		if ( iinfo == 0 ) {
 		    MPI_Bcast( R,       m, MPI_DOUBLE, 0, grid->comm );
@@ -643,12 +643,12 @@ pdgssvx_ABglobal(superlu_dist_options_t
 		    if ( iinfo > 0 ) {
 			if ( iinfo <= m ) {
 #if ( PRNTlevel>=1 )
-			    fprintf(stderr, "The " IFMT "-th row of A is exactly zero\n", 
+			    fprintf(stderr, "The " IFMT "-th row of A is exactly zero\n",
 				    iinfo);
 #endif
 			} else {
 #if ( PRNTlevel>=1 )
-                            fprintf(stderr, "The " IFMT "-th column of A is exactly zero\n", 
+                            fprintf(stderr, "The " IFMT "-th column of A is exactly zero\n",
 				     iinfo-n);
 #endif
                         }
@@ -662,9 +662,9 @@ pdgssvx_ABglobal(superlu_dist_options_t
 		    MPI_Bcast( &rowcnd, 1, MPI_DOUBLE, 0, grid->comm );
 		    MPI_Bcast( &colcnd, 1, MPI_DOUBLE, 0, grid->comm );
 		    MPI_Bcast( &amax,   1, MPI_DOUBLE, 0, grid->comm );
-		} 
+		}
 	    }
-	
+
             if ( iinfo == 0 ) {
 	        /* Equilibrate matrix A. */
 	        dlaqgs_dist(A, R, C, rowcnd, colcnd, amax, equed);
@@ -694,9 +694,9 @@ pdgssvx_ABglobal(superlu_dist_options_t
 	CHECK_MALLOC(iam, "Exit equil");
 #endif
     } /* end if Equil ... */
-    
+
     /* ------------------------------------------------------------
-       Permute rows of A. 
+       Permute rows of A.
        ------------------------------------------------------------*/
     if ( options->RowPerm != NO ) {
 	t = SuperLU_timer_();
@@ -704,13 +704,13 @@ pdgssvx_ABglobal(superlu_dist_options_t
 	if ( Fact == SamePattern_SameRowPerm /* Reuse perm_r. */
 	    || options->RowPerm == MY_PERMR ) { /* Use my perm_r. */
 	    for (i = 0; i < colptr[n]; ++i) {
-		    irow = rowind[i]; 
+		    irow = rowind[i];
 		    rowind[i] = perm_r[irow];
 	    }
 	} else if ( !factored ) {
 	    if ( job == 5 ) {
 		/* Allocate storage for scaling factors. */
-		if ( !(R1 = (double *) SUPERLU_MALLOC(m * sizeof(double))) ) 
+		if ( !(R1 = (double *) SUPERLU_MALLOC(m * sizeof(double))) )
 		    ABORT("SUPERLU_MALLOC fails for R1[]");
 		if ( !(C1 = (double *) SUPERLU_MALLOC(n * sizeof(double))) )
 		    ABORT("SUPERLU_MALLOC fails for C1[]");
@@ -721,7 +721,7 @@ pdgssvx_ABglobal(superlu_dist_options_t
 		iinfo = dldperm_dist(job, m, nnz, colptr, rowind, a,
                                 perm_r, R1, C1);
 
-                MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm );		
+                MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm );
 		if ( iinfo == 0 ) {
 		    MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm );
 		    if ( job == 5 && Equil ) {
@@ -774,7 +774,7 @@ pdgssvx_ABglobal(superlu_dist_options_t
 		    else for (i = 0; i < m; ++i) R[i] = R1[i];
 		    if ( colequ ) for (i = 0; i < n; ++i) C[i] *= C1[i];
 		    else for (i = 0; i < n; ++i) C[i] = C1[i];
-		    
+
 		    ScalePermstruct->DiagScale = BOTH;
 		    rowequ = colequ = 1;
 		} else { /* No equilibration. */
@@ -816,7 +816,7 @@ pdgssvx_ABglobal(superlu_dist_options_t
 		if ( !iam ) printf("\t product of diagonal %e\n", dprod);
 	    }
 #endif
-	    
+
         } /* else !factored */
 
 	t = SuperLU_timer_() - t;
@@ -824,7 +824,7 @@ pdgssvx_ABglobal(superlu_dist_options_t
 #if ( PRNTlevel>=1 )
 	if ( !iam ) printf(".. LDPERM job " IFMT "\t time: %.2f\n", job, t);
 #endif
-    
+
     } else { /* options->RowPerm == NOROWPERM */
         for (i = 0; i < m; ++i) perm_r[i] = i;
     }
@@ -846,7 +846,7 @@ pdgssvx_ABglobal(superlu_dist_options_t
 	t = SuperLU_timer_();
 	/*
 	 * Get column permutation vector perm_c[], according to permc_spec:
-	 *   permc_spec = NATURAL:  natural ordering 
+	 *   permc_spec = NATURAL:  natural ordering
 	 *   permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A
 	 *   permc_spec = MMD_ATA:  minimum degree on structure of A'*A
 	 *   permc_spec = MY_PERMC: the ordering already supplied in perm_c[]
@@ -864,7 +864,7 @@ pdgssvx_ABglobal(superlu_dist_options_t
 
 	/* Form Pc*A*Pc' to preserve the diagonal of the matrix Pr*A. */
 	ACstore = AC.Store;
-	for (j = 0; j < n; ++j) 
+	for (j = 0; j < n; ++j)
 	    for (i = ACstore->colbeg[j]; i < ACstore->colend[j]; ++i) {
 		irow = ACstore->rowind[i];
 		ACstore->rowind[i] = perm_c[irow];
@@ -874,8 +874,8 @@ pdgssvx_ABglobal(superlu_dist_options_t
 	/* Perform a symbolic factorization on matrix A and set up the
 	   nonzero data structures which are suitable for supernodal GENP. */
 	if ( Fact != SamePattern_SameRowPerm ) {
-#if ( PRNTlevel>=1 ) 
-	    if ( !iam ) 
+#if ( PRNTlevel>=1 )
+	    if ( !iam )
 		printf(".. symbfact(): relax " IFMT ", maxsuper " IFMT ", fill " IFMT "\n",
 		       sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6));
 #endif
@@ -884,23 +884,23 @@ pdgssvx_ABglobal(superlu_dist_options_t
 		   SUPERLU_MALLOC(sizeof(Glu_freeable_t))) )
 		ABORT("Malloc fails for Glu_freeable.");
 
-	    iinfo = symbfact(options, iam, &AC, perm_c, etree, 
+	    iinfo = symbfact(options, iam, &AC, perm_c, etree,
 			     Glu_persist, Glu_freeable);
 
 	    stat->utime[SYMBFAC] = SuperLU_timer_() - t;
 
 	    if ( iinfo <= 0 ) {
 		QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage);
-#if ( PRNTlevel>=1 ) 
+#if ( PRNTlevel>=1 )
 		if ( !iam ) {
 		    printf("\tNo of supers " IFMT "\n", Glu_persist->supno[n-1]+1);
 		    printf("\tSize of G(L) " IFMT "\n", Glu_freeable->xlsub[n]);
 		    printf("\tSize of G(U) " IFMT "\n", Glu_freeable->xusub[n]);
-		    printf("\tint %d, short %d, float %d, double %d\n", 
-			   (int) sizeof(int_t), (int) sizeof(short), 
+		    printf("\tint %d, short %d, float %d, double %d\n",
+			   (int) sizeof(int_t), (int) sizeof(short),
  			   (int) sizeof(float), (int) sizeof(double));
 		    printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions " IFMT "\n",
-			   symb_mem_usage.for_lu*1e-6, 
+			   symb_mem_usage.for_lu*1e-6,
 			   symb_mem_usage.total*1e-6,
 			   symb_mem_usage.expansions);
 		}
@@ -910,7 +910,7 @@ pdgssvx_ABglobal(superlu_dist_options_t
 		if ( !iam )
 		    fprintf(stderr, "symbfact() error returns " IFMT "\n", iinfo);
 #endif
-                *info = iinfo;  
+                *info = iinfo;
                 return;
 	    }
 	}
@@ -962,14 +962,14 @@ pdgssvx_ABglobal(superlu_dist_options_t
 	    }
 	}
 #endif
-    
+
     } else if ( options->IterRefine ) { /* options->Fact==FACTORED */
 	/* Permute columns of A to form A*Pc' using the existing perm_c.
 	 * NOTE: rows of A were previously permuted to Pc*A.
 	 */
 	sp_colorder(options, A, perm_c, NULL, &AC);
     } /* if !factored ... */
-	
+
     /* ------------------------------------------------------------
        Compute the solution matrix X.
        ------------------------------------------------------------*/
@@ -979,7 +979,7 @@ pdgssvx_ABglobal(superlu_dist_options_t
 	    ABORT("Malloc fails for b_work[]");
 
 	/* ------------------------------------------------------------
-	   Scale the right-hand side if equilibration was performed. 
+	   Scale the right-hand side if equilibration was performed.
 	   ------------------------------------------------------------*/
 	if ( notran ) {
 	    if ( rowequ ) {
@@ -1057,7 +1057,7 @@ pdgssvx_ABglobal(superlu_dist_options_t
 	    x_col = &X[j*ldx];
 	    for (i = 0; i < n; ++i) b_col[i] = x_col[perm_c[i]];
 	}
-	
+
 	/* Transform the solution matrix X to a solution of the original system
 	   before the equilibration. */
 	if ( notran ) {
@@ -1092,10 +1092,10 @@ pdgssvx_ABglobal(superlu_dist_options_t
 	        SUPERLU_FREE(R);
 		SUPERLU_FREE(C);
 		break;
-	    case ROW: 
+	    case ROW:
 		SUPERLU_FREE(C);
 		break;
-	    case COL: 
+	    case COL:
 		SUPERLU_FREE(R);
 		break;
 	}
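Again a whitespace-only change, but the comment block it touches documents the factorization-reuse contract of pdgssvx_ABglobal. The sketch below illustrates that contract with the signature shown in the hunk above: the grid, the replicated input matrices A1/A2 (same sparsity pattern and same row permutation) and the global right-hand sides are assumed to be set up by the caller, the setup/teardown helper names follow the 6.x headers, and solve_same_rowperm is a hypothetical wrapper, not part of the library.

#include "superlu_ddefs.h"

void solve_same_rowperm(SuperMatrix *A1, SuperMatrix *A2,
                        double *B1, double *B2, int ldb,
                        int m, int n, gridinfo_t *grid)
{
    superlu_dist_options_t options;
    ScalePermstruct_t ScalePermstruct;
    LUstruct_t LUstruct;
    SuperLUStat_t stat;
    double berr[1];                     /* one right-hand side */
    int nrhs = 1, info;

    set_default_options_dist(&options); /* Fact = DOFACT, Equil = YES, ... */
    ScalePermstructInit(m, n, &ScalePermstruct);
    LUstructInit(n, &LUstruct);
    PStatInit(&stat);

    /* First system: factor A1 "from scratch". */
    pdgssvx_ABglobal(&options, A1, &ScalePermstruct, B1, ldb, nrhs,
                     grid, &LUstruct, berr, &stat, &info);

    /* Second system: A2 has the same pattern and the same row permutation,
       so perm_r, perm_c, the scalings and the L/U structure are reused and
       only the numerical values are refactored (the SamePattern_SameRowPerm
       case described in the comments above). */
    options.Fact = SamePattern_SameRowPerm;
    pdgssvx_ABglobal(&options, A2, &ScalePermstruct, B2, ldb, nrhs,
                     grid, &LUstruct, berr, &stat, &info);

    PStatPrint(&options, &stat, grid);
    PStatFree(&stat);
    Destroy_LU(n, grid, &LUstruct);
    ScalePermstructFree(&ScalePermstruct);
    LUstructFree(&LUstruct);
}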
diff -pruN 6.1.0+dfsg1-1/SRC/pdgssvx.c 6.1.1+dfsg1-1/SRC/pdgssvx.c
--- 6.1.0+dfsg1-1/SRC/pdgssvx.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/pdgssvx.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,16 +1,16 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
 
-/*! @file 
+/*! @file
  * \brief Solves a system of linear equations A*X=B
  *
  * <pre>
@@ -65,7 +65,7 @@ at the top-level directory.
  *                   |    .      |        |. |
  *                   |    .      |        |. |
  *                 ---------------       ------
- * 
+ *
  * where, fst_row is the row number of the first row,
  *        m_loc is the number of rows local to this processor
  * These are defined in the 'SuperMatrix' structure, see supermatrix.h.
@@ -80,7 +80,7 @@ at the top-level directory.
  *            and its dimensions ldb (local) and nrhs (global)
  *      -  grid, a structure describing the 2D processor mesh
  *      -  options->IterRefine, which determines whether or not to
- *            improve the accuracy of the computed solution using 
+ *            improve the accuracy of the computed solution using
  *            iterative refinement
  *
  *      On output, B is overwritten with the solution X.
@@ -88,8 +88,8 @@ at the top-level directory.
  *   2. Depending on options->Fact, the user has four options
  *      for solving A*X=B. The standard option is for factoring
  *      A "from scratch". (The other options, described below,
- *      are used when A is sufficiently similar to a previously 
- *      solved problem to save time by reusing part or all of 
+ *      are used when A is sufficiently similar to a previously
+ *      solved problem to save time by reusing part or all of
  *      the previous factorization.)
  *
  *      -  options->Fact = DOFACT: A is factored "from scratch"
@@ -118,7 +118,7 @@ at the top-level directory.
  *                             (to control numerical stability)
  *
  *      The outputs returned include
- *         
+ *
  *        o  ScalePermstruct,  modified to describe how the input matrix A
  *                             was equilibrated and permuted:
  *          .  ScalePermstruct->DiagScale, indicates whether the rows and/or
@@ -129,15 +129,15 @@ at the top-level directory.
  *          .  ScalePermstruct->perm_c, column permutation vector
  *
  *          (part of ScalePermstruct may also need to be supplied on input,
- *           depending on options->RowPerm and options->ColPerm as described 
+ *           depending on options->RowPerm and options->ColPerm as described
  *           later).
  *
  *        o  A, the input matrix A overwritten by the scaled and permuted
- *              matrix diag(R)*A*diag(C)*Pc^T, where 
+ *              matrix diag(R)*A*diag(C)*Pc^T, where
  *              Pc is the row permutation matrix determined by
  *                  ScalePermstruct->perm_c
  *              diag(R) and diag(C) are diagonal scaling matrices determined
- *                  by ScalePermstruct->DiagScale, ScalePermstruct->R and 
+ *                  by ScalePermstruct->DiagScale, ScalePermstruct->R and
  *                  ScalePermstruct->C
  *
  *        o  LUstruct, which contains the L and U factorization of A1 where
@@ -149,7 +149,7 @@ at the top-level directory.
  *
  *   3. The second value of options->Fact assumes that a matrix with the same
  *      sparsity pattern as A has already been factored:
- *     
+ *
  *      -  options->Fact = SamePattern: A is factored, assuming that it has
  *            the same nonzero pattern as a previously factored matrix. In
  *            this case the algorithm saves time by reusing the previously
@@ -166,14 +166,14 @@ at the top-level directory.
  *
  *      but not options->ColPerm, whose value is ignored. This is because the
  *      previous column permutation from ScalePermstruct->perm_c is used as
- *      input. The user must also supply 
+ *      input. The user must also supply
  *
  *        o  A, the input matrix
  *        o  ScalePermstruct->perm_c, the column permutation
  *        o  LUstruct->etree, the elimination tree
  *
  *      The outputs returned include
- *         
+ *
  *        o  A, the input matrix A overwritten by the scaled and permuted
  *              matrix as described above
  *        o  ScalePermstruct, modified to describe how the input matrix A was
@@ -201,25 +201,25 @@ at the top-level directory.
  *      ignored. This is because the permutations from ScalePermstruct->perm_r
  *      and ScalePermstruct->perm_c are used as input.
  *
- *      The user must also supply 
+ *      The user must also supply
  *
  *        o  A, the input matrix
  *        o  ScalePermstruct->DiagScale, how the previous matrix was row
  *                                       and/or column scaled
  *        o  ScalePermstruct->R, the row scalings of the previous matrix,
  *                               if any
- *        o  ScalePermstruct->C, the columns scalings of the previous matrix, 
+ *        o  ScalePermstruct->C, the columns scalings of the previous matrix,
  *                               if any
  *        o  ScalePermstruct->perm_r, the row permutation of the previous
  *                                    matrix
- *        o  ScalePermstruct->perm_c, the column permutation of the previous 
+ *        o  ScalePermstruct->perm_c, the column permutation of the previous
  *                                    matrix
  *        o  all of LUstruct, the previously computed information about
  *                            L and U (the actual numerical values of L and U
  *                            stored in LUstruct->Llu are ignored)
  *
  *      The outputs returned include
- *         
+ *
  *        o  A, the input matrix A overwritten by the scaled and permuted
  *              matrix as described above
  *        o  ScalePermstruct,  modified to describe how the input matrix A was
@@ -228,7 +228,7 @@ at the top-level directory.
  *        o  LUstruct, modified to contain the new L and U factors
  *
  *   5. The fourth and last value of options->Fact assumes that A is
- *      identical to a matrix that has already been factored on a previous 
+ *      identical to a matrix that has already been factored on a previous
  *      call, and reuses its entire LU factorization
  *
  *      -  options->Fact = Factored: A is identical to a previously
@@ -236,10 +236,10 @@ at the top-level directory.
  *            can be reused.
  *
  *      In this case all the other options mentioned above are ignored
- *      (options->Equil, options->RowPerm, options->ColPerm, 
+ *      (options->Equil, options->RowPerm, options->ColPerm,
  *       options->ReplaceTinyPivot)
  *
- *      The user must also supply 
+ *      The user must also supply
  *
  *        o  A, the unfactored matrix, only in the case that iterative
  *              refinement is to be done (specifically A must be the output
@@ -249,7 +249,7 @@ at the top-level directory.
  *           L and U
  *
  *      all of which are unmodified on output.
- *         
+ *
  * Arguments
  * =========
  *
@@ -257,7 +257,7 @@ at the top-level directory.
  *         The structure defines the input parameters to control
  *         how the LU decomposition will be performed.
  *         The following fields should be defined for this structure:
- *         
+ *
  *         o Fact (fact_t)
  *           Specifies whether or not the factored form of the matrix
  *           A is supplied on entry, and if not, how the matrix A should
@@ -267,7 +267,7 @@ at the top-level directory.
  *                 Inputs:  A
  *                          options->Equil, RowPerm, ColPerm, ReplaceTinyPivot
  *                 Outputs: modified A
- *                             (possibly row and/or column scaled and/or 
+ *                             (possibly row and/or column scaled and/or
  *                              permuted)
  *                          all of ScalePermstruct
  *                          all of LUstruct
@@ -275,7 +275,7 @@ at the top-level directory.
  *           = SamePattern: the matrix A will be factorized assuming
  *             that a factorization of a matrix with the same sparsity
  *             pattern was performed prior to this one. Therefore, this
- *             factorization will reuse column permutation vector 
+ *             factorization will reuse column permutation vector
  *             ScalePermstruct->perm_c and the elimination tree
  *             LUstruct->etree
  *                 Inputs:  A
@@ -283,7 +283,7 @@ at the top-level directory.
  *                          ScalePermstruct->perm_c
  *                          LUstruct->etree
  *                 Outputs: modified A
- *                             (possibly row and/or column scaled and/or 
+ *                             (possibly row and/or column scaled and/or
  *                              permuted)
  *                          rest of ScalePermstruct (DiagScale, R, C, perm_r)
  *                          rest of LUstruct (GLU_persist, Llu)
@@ -301,7 +301,7 @@ at the top-level directory.
  *                          all of ScalePermstruct
  *                          all of LUstruct
  *                 Outputs: modified A
- *                             (possibly row and/or column scaled and/or 
+ *                             (possibly row and/or column scaled and/or
  *                              permuted)
  *                          modified LUstruct->Llu
  *           = FACTORED: the matrix A is already factored.
@@ -327,20 +327,20 @@ at the top-level directory.
  *           = LargeDiag_APWM: use the parallel approximate-weight perfect
  *                        matching to permute rows of the original matrix
  *                        to make the diagonal large relative to the
- *                        off-diagonal.								   
+ *                        off-diagonal.
  *           = MY_PERMR:  use the ordering given in ScalePermstruct->perm_r
  *                        input by the user.
- *           
+ *
  *         o ColPerm (colperm_t)
  *           Specifies what type of column permutation to use to reduce fill.
  *           = NATURAL:       natural ordering.
  *           = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A.
  *           = MMD_ATA:       minimum degree ordering on structure of A'*A.
  *           = MY_PERMC:      the ordering given in ScalePermstruct->perm_c.
- *         
+ *
  *         o ReplaceTinyPivot (yes_no_t)
  *           = NO:  do not modify pivots
- *           = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during 
+ *           = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during
  *                  LU factorization.
  *
  *         o IterRefine (IterRefine_t)
@@ -381,7 +381,7 @@ at the top-level directory.
  *                      diag(R).
  *           = COL:     Column equilibration, i.e., A was postmultiplied
  *                      by diag(C).
- *           = BOTH:    both row and column equilibration, i.e., A was 
+ *           = BOTH:    both row and column equilibration, i.e., A was
  *                      replaced by diag(R)*A*diag(C).
  *           If options->Fact = FACTORED or SamePattern_SameRowPerm,
  *           DiagScale is an input argument; otherwise it is an output
@@ -395,8 +395,8 @@ at the top-level directory.
  *           input argument; otherwise it is an output argument.
  *
  *         o perm_c (int*)
- *           Column permutation vector, which defines the 
- *           permutation matrix Pc; perm_c[i] = j means column i of A is 
+ *           Column permutation vector, which defines the
+ *           permutation matrix Pc; perm_c[i] = j means column i of A is
  *           in position j in A*Pc.
  *           If options->ColPerm = MY_PERMC or options->Fact = SamePattern
  *           or options->Fact = SamePattern_SameRowPerm, perm_c is an
@@ -408,7 +408,7 @@ at the top-level directory.
  *
  *         o R (double*) dimension (A->nrow)
  *           The row scale factors for A.
- *           If DiagScale = ROW or BOTH, A is multiplied on the left by 
+ *           If DiagScale = ROW or BOTH, A is multiplied on the left by
  *                          diag(R).
  *           If DiagScale = NOEQUIL or COL, R is not defined.
  *           If options->Fact = FACTORED or SamePattern_SameRowPerm, R is
@@ -416,12 +416,12 @@ at the top-level directory.
  *
  *         o C (double*) dimension (A->ncol)
  *           The column scale factors for A.
- *           If DiagScale = COL or BOTH, A is multiplied on the right by 
+ *           If DiagScale = COL or BOTH, A is multiplied on the right by
  *                          diag(C).
  *           If DiagScale = NOEQUIL or ROW, C is not defined.
  *           If options->Fact = FACTORED or SamePattern_SameRowPerm, C is
  *           an input argument; otherwise, C is an output argument.
- *         
+ *
  * B       (input/output) double* (local)
  *         On entry, the right-hand side matrix of dimension (m_loc, nrhs),
  *           where, m_loc is the number of rows stored locally on my
@@ -480,8 +480,8 @@ at the top-level directory.
  *         argument. See superlu_ddefs.h for the definition of 'SOLVEstruct_t'.
  *
  * berr    (output) double*, dimension (nrhs) (global)
- *         The componentwise relative backward error of each solution   
- *         vector X(j) (i.e., the smallest relative change in   
+ *         The componentwise relative backward error of each solution
+ *         vector X(j) (i.e., the smallest relative change in
  *         any element of A or B that makes X(j) an exact solution).
  *
  * stat   (output) SuperLUStat_t*
@@ -502,7 +502,7 @@ at the top-level directory.
  */
 
 void
-pdgssvx(superlu_dist_options_t *options, SuperMatrix *A, 
+pdgssvx(superlu_dist_options_t *options, SuperMatrix *A,
 	ScalePermstruct_t *ScalePermstruct,
 	double B[], int ldb, int nrhs, gridinfo_t *grid,
 	LUstruct_t *LUstruct, SOLVEstruct_t *SOLVEstruct, double *berr,
@@ -522,7 +522,7 @@ pdgssvx(superlu_dist_options_t *options,
 		                 supernodes in L.
           	   (usub, xusub) contains the compressed subscript of
 		                 nonzero segments in U.
-	      If options->Fact != SamePattern_SameRowPerm, they are 
+	      If options->Fact != SamePattern_SameRowPerm, they are
 	      computed by SYMBFACT routine, and then used by PDDISTRIBUTE
 	      routine. They will be freed after PDDISTRIBUTE routine.
 	      If options->Fact == SamePattern_SameRowPerm, these
@@ -553,7 +553,7 @@ pdgssvx(superlu_dist_options_t *options,
     int_t nsupers,nsupers_j;
     int_t lk,k,knsupc,nsupr;
     int_t  *lsub,*xsup;
-    double *lusup;	
+    double *lusup;
 #if ( PRNTlevel>= 2 )
     double   dmin, dsum, dprod;
 #endif
@@ -567,7 +567,7 @@ pdgssvx(superlu_dist_options_t *options,
     int   col, key; /* parameters for creating a new communicator */
     Pslu_freeable_t Pslu_freeable;
     float  flinfo;
-	
+
     /* Initialization. */
     m       = A->nrow;
     n       = A->ncol;
@@ -583,7 +583,7 @@ pdgssvx(superlu_dist_options_t *options,
     symb_comm = MPI_COMM_NULL;
     num_mem_usage.for_lu = num_mem_usage.total = 0.0;
     symb_mem_usage.for_lu = symb_mem_usage.total = 0.0;
-	
+
     /* Test the input parameters. */
     *info = 0;
     Fact = options->Fact;
@@ -620,7 +620,7 @@ pdgssvx(superlu_dist_options_t *options,
     Equil = (!factored && options->Equil == YES);
     notran = (options->Trans == NOTRANS);
     parSymbFact = options->ParSymbFact;
-	
+
     iam = grid->iam;
     job = 5;
     if ( factored || (Fact == SamePattern_SameRowPerm && Equil) ) {
@@ -643,7 +643,7 @@ pdgssvx(superlu_dist_options_t *options,
 #endif
 
     /* Not factored & ask for equilibration */
-    if ( Equil && Fact != SamePattern_SameRowPerm ) { 
+    if ( Equil && Fact != SamePattern_SameRowPerm ) {
 	/* Allocate storage if not done so before. */
 	switch ( ScalePermstruct->DiagScale ) {
 	    case NOEQUIL:
@@ -654,12 +654,12 @@ pdgssvx(superlu_dist_options_t *options,
 		ScalePermstruct->R = R;
 		ScalePermstruct->C = C;
 		break;
-	    case ROW: 
+	    case ROW:
 	        if ( !(C = (double *) doubleMalloc_dist(n)) )
 		    ABORT("Malloc fails for C[].");
 		ScalePermstruct->C = C;
 		break;
-	    case COL: 
+	    case COL:
 		if ( !(R = (double *) doubleMalloc_dist(m)) )
 		    ABORT("Malloc fails for R[].");
 		ScalePermstruct->R = R;
@@ -728,7 +728,7 @@ pdgssvx(superlu_dist_options_t *options,
 
 	    /* Now iinfo == 0 */
 
-            /* Equilibrate matrix A if it is badly-scaled. 
+            /* Equilibrate matrix A if it is badly-scaled.
                A <-- diag(R)*A*diag(C)                     */
 	    pdlaqgs(A, R, C, rowcnd, colcnd, amax, equed);
 
@@ -795,7 +795,7 @@ pdgssvx(superlu_dist_options_t *options,
 	        if ( options->RowPerm == MY_PERMR ) { /* Use user's perm_r. */
 	            /* Permute the global matrix GA for symbfact() */
 	            for (i = 0; i < colptr[n]; ++i) {
-	            	irow = rowind[i]; 
+	            	irow = rowind[i];
 		    	rowind[i] = perm_r[irow];
 	            }
 	        } else if ( options->RowPerm == LargeDiag_MC64 ) {
@@ -811,7 +811,7 @@ pdgssvx(superlu_dist_options_t *options,
 	            if ( !iam ) { /* Process 0 finds a row permutation */
 		        iinfo = dldperm_dist(job, m, nnz, colptr, rowind, a_GA,
 		                perm_r, R1, C1);
-		
+
                         MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm );
 		        if ( iinfo == 0 ) {
 		            MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm );
@@ -875,7 +875,7 @@ pdgssvx(superlu_dist_options_t *options,
 		            else for (i = 0; i < m; ++i) R[i] = R1[i];
 		            if ( colequ ) for (i = 0; i < n; ++i) C[i] *= C1[i];
 		            else for (i = 0; i < n; ++i) C[i] = C1[i];
-		    
+
 		            ScalePermstruct->DiagScale = BOTH;
 		            rowequ = colequ = 1;
 
@@ -951,14 +951,14 @@ pdgssvx(superlu_dist_options_t *options,
     }
 
     /* ------------------------------------------------------------
-       Perform the LU factorization: symbolic factorization, 
+       Perform the LU factorization: symbolic factorization,
        redistribution, and numerical factorization.
        ------------------------------------------------------------*/
     if ( !factored ) {
 	t = SuperLU_timer_();
 	/*
 	 * Get column permutation vector perm_c[], according to permc_spec:
-	 *   permc_spec = NATURAL:  natural ordering 
+	 *   permc_spec = NATURAL:  natural ordering
 	 *   permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A
 	 *   permc_spec = MMD_ATA:  minimum degree on structure of A'*A
 	 *   permc_spec = METIS_AT_PLUS_A: METIS on structure of A'+A
@@ -982,7 +982,7 @@ pdgssvx(superlu_dist_options_t *options,
 		if ( permc_spec == NATURAL ) {
 		     for (j = 0; j < n; ++j) perm_c[j] = j;
                 }
-		if ( !(sizes = intMalloc_dist(2 * noDomains)) ) 
+		if ( !(sizes = intMalloc_dist(2 * noDomains)) )
 		     ABORT("SUPERLU_MALLOC fails for sizes.");
 		if ( !(fstVtxSep = intMalloc_dist(2 * noDomains)) )
 		    ABORT("SUPERLU_MALLOC fails for fstVtxSep.");
@@ -1001,10 +1001,10 @@ pdgssvx(superlu_dist_options_t *options,
 	if ( permc_spec != MY_PERMC && Fact == DOFACT ) {
           /* Reuse perm_c if Fact == SamePattern, or SamePattern_SameRowPerm */
 	  if ( permc_spec == PARMETIS ) {
-	// #pragma omp parallel  
-    // {  	
+	// #pragma omp parallel
+    // {
 	// #pragma omp master
-	// {	
+	// {
 	      /* Get column permutation vector in perm_c.                    *
 	       * This routine takes as input the distributed input matrix A  *
 	       * and does not modify it.  It also allocates memory for       *
@@ -1038,9 +1038,9 @@ pdgssvx(superlu_dist_options_t *options,
 	        /* Compute the elimination tree of Pc*(A^T+A)*Pc^T or Pc*A^T*A*Pc^T
 	           (a.k.a. column etree), depending on the choice of ColPerm.
 	           Adjust perm_c[] to be consistent with a postorder of etree.
-	           Permute columns of A to form A*Pc'. 
+	           Permute columns of A to form A*Pc'.
 		   After this routine, GAC = GA*Pc^T.  */
-	        sp_colorder(options, &GA, perm_c, etree, &GAC); 
+	        sp_colorder(options, &GA, perm_c, etree, &GAC);
 
 	        /* Form Pc*A*Pc^T to preserve the diagonal of the matrix GAC. */
 	        GACstore = (NCPformat *) GAC.Store;
@@ -1056,7 +1056,7 @@ pdgssvx(superlu_dist_options_t *options,
 
 	        /* Perform a symbolic factorization on Pc*Pr*A*Pc^T and set up
                    the nonzero data structures for L & U. */
-#if ( PRNTlevel>=1 ) 
+#if ( PRNTlevel>=1 )
                 if ( !iam ) {
 		    printf(".. symbfact(): relax " IFMT ", maxsuper " IFMT ", fill " IFMT "\n",
 		          sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6));
@@ -1069,7 +1069,7 @@ pdgssvx(superlu_dist_options_t *options,
 		    ABORT("Malloc fails for Glu_freeable.");
 
 	    	/* Every process does this. */
-	    	iinfo = symbfact(options, iam, &GAC, perm_c, etree, 
+	    	iinfo = symbfact(options, iam, &GAC, perm_c, etree,
 			     	 Glu_persist, Glu_freeable);
 			nnzLU = Glu_freeable->nnzLU;
 	    	stat->utime[SYMBFAC] = SuperLU_timer_() - t;
@@ -1080,11 +1080,11 @@ pdgssvx(superlu_dist_options_t *options,
 		    	printf("\tNo of supers " IFMT "\n", Glu_persist->supno[n-1]+1);
 		    	printf("\tSize of G(L) " IFMT "\n", Glu_freeable->xlsub[n]);
 		    	printf("\tSize of G(U) " IFMT "\n", Glu_freeable->xusub[n]);
-		    	printf("\tint %d, short %d, float %d, double %d\n", 
+		    	printf("\tint %d, short %d, float %d, double %d\n",
 			       (int) sizeof(int_t), (int) sizeof(short),
         		       (int) sizeof(float), (int) sizeof(double));
 		    	printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions " IFMT "\n",
-			   	symb_mem_usage.for_lu*1e-6, 
+			   	symb_mem_usage.for_lu*1e-6,
 			   	symb_mem_usage.total*1e-6,
 			   	symb_mem_usage.expansions);
 			fflush(stdout);
@@ -1102,9 +1102,9 @@ pdgssvx(superlu_dist_options_t *options,
 	    else {  /* parallel symbolic factorization */
 	    	t = SuperLU_timer_();
 	    	flinfo = symbfact_dist(nprocs_num, noDomains, A, perm_c, perm_r,
-				       sizes, fstVtxSep, &Pslu_freeable, 
+				       sizes, fstVtxSep, &Pslu_freeable,
 				       &(grid->comm), &symb_comm,
-				       &symb_mem_usage); 
+				       &symb_mem_usage);
 			nnzLU = Pslu_freeable.nnzLU;
 	    	stat->utime[SYMBFAC] = SuperLU_timer_() - t;
 	    	if (flinfo > 0) {
@@ -1126,7 +1126,7 @@ pdgssvx(superlu_dist_options_t *options,
 
         if (sizes) SUPERLU_FREE (sizes);
         if (fstVtxSep) SUPERLU_FREE (fstVtxSep);
-	if (symb_comm != MPI_COMM_NULL) MPI_Comm_free (&symb_comm); 
+	if (symb_comm != MPI_COMM_NULL) MPI_Comm_free (&symb_comm);
 
 	/* Distribute entries of A into L & U data structures. */
 	//if (parSymbFact == NO || ???? Fact == SamePattern_SameRowPerm) {
@@ -1135,7 +1135,7 @@ pdgssvx(superlu_dist_options_t *options,
   	    /* Apply column permutation to the original distributed A */
 	    for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]];
 
-	    /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc^T into L and U storage. 
+	    /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc^T into L and U storage.
 	       NOTE: the row permutation Pc*Pr is applied internally in the
   	       distribution routine. */
 	    t = SuperLU_timer_();
@@ -1149,7 +1149,7 @@ pdgssvx(superlu_dist_options_t *options,
 	        SUPERLU_FREE(Glu_freeable);
 	    }
 	} else { /* CASE OF PARALLEL SYMBOLIC */
-	    /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. 
+	    /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage.
 	       NOTE: the row permutation Pc*Pr is applied internally in the
 	       distribution routine. */
 	    /* Apply column permutation to the original distributed A */
@@ -1160,7 +1160,7 @@ pdgssvx(superlu_dist_options_t *options,
 		  			   &Pslu_freeable, LUstruct, grid);
 	    if (dist_mem_use > 0)
 	        ABORT ("Not enough memory available for dist_psymbtonum\n");
-            
+
 	    stat->utime[DIST] = SuperLU_timer_() - t;
 	}
 
@@ -1168,16 +1168,16 @@ pdgssvx(superlu_dist_options_t *options,
 
 	/* Perform numerical factorization in parallel. */
 	t = SuperLU_timer_();
-    // #pragma omp parallel  
-    // {  	
+    // #pragma omp parallel
+    // {
 	// #pragma omp master
 	// {
 	pdgstrf(options, m, n, anorm, LUstruct, grid, stat, info);
 	stat->utime[FACT] = SuperLU_timer_() - t;
 	// }
 	// }
-	
-	
+
+
 #if ( PRNTlevel>=2 )
     /* ------------------------------------------------------------
        SUM OVER ALL ENTRIES OF A AND PRINT NNZ AND SIZE OF A.
@@ -1193,14 +1193,14 @@ pdgssvx(superlu_dist_options_t *options,
 	    asum += nzval_a[j];
 	}
     }
-	
+
 	nsupers = Glu_persist->supno[n-1] + 1;
 	nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */
-	
-	
-	
+
+
+
 	lsum=0.0;
-	for (lk=0;lk<nsupers_j;++lk){	
+	for (lk=0;lk<nsupers_j;++lk){
 		lsub = LUstruct->Llu->Lrowind_bc_ptr[lk];
 		lusup = LUstruct->Llu->Lnzval_bc_ptr[lk];
 		if(lsub){
@@ -1208,21 +1208,21 @@ pdgssvx(superlu_dist_options_t *options,
 			knsupc = SuperSize( k );
 			nsupr = lsub[1];
 			for (j=0; j<knsupc; ++j)
-				for (i = 0; i < nsupr; ++i) 
+				for (i = 0; i < nsupr; ++i)
 					lsum +=lusup[j*nsupr+i];
 		}
 	}
-	
-	
+
+
 	MPI_Allreduce( &asum, &asum_tot,1, MPI_DOUBLE, MPI_SUM, grid->comm );
 	MPI_Allreduce( &lsum, &lsum_tot,1, MPI_DOUBLE, MPI_SUM, grid->comm );
-	
+
 
 	MPI_Allreduce( &Astore->rowptr[Astore->m_loc], &nnz_tot,1, mpi_int_t, MPI_SUM, grid->comm );
 	// MPI_Bcast( &nnzLU, 1, mpi_int_t, 0, grid->comm );
-	
+
 	MPI_Comm_rank( MPI_COMM_WORLD, &iam_g );
-	
+
     if (!iam_g) {
 	print_options_dist(options);
 	fflush(stdout);
@@ -1230,8 +1230,8 @@ pdgssvx(superlu_dist_options_t *options,
 
     printf(".. Ainfo mygid %5d   mysid %5d   nnz_loc " IFMT "  sum_loc  %e lsum_loc   %e nnz " IFMT " nnzLU %ld sum %e  lsum %e  N " IFMT "\n", iam_g,iam,Astore->rowptr[Astore->m_loc],asum, lsum, nnz_tot,nnzLU,asum_tot,lsum_tot,A->ncol);
 	fflush(stdout);
-#endif				
-			
+#endif
+
 #if 0
 
 // #ifdef GPU_PROF
@@ -1242,7 +1242,7 @@ pdgssvx(superlu_dist_options_t *options,
 
 //      ttemp = getenv("IO_FILE");
 //      if(ttemp!=NULL)
-//      {   
+//      {
 //          printf("File being opend is %s\n",ttemp );
 //          FILE* fp;
 //          fp = fopen(ttemp,"w");
@@ -1292,7 +1292,7 @@ pdgssvx(superlu_dist_options_t *options,
                              num_mem_usage.for_lu  /* distribution step */
                        );
             }
-            
+
 	    temp = SUPERLU_MAX(temp, num_mem_usage.total);
 
 	    MPI_Reduce( &temp, &max,
@@ -1315,17 +1315,17 @@ pdgssvx(superlu_dist_options_t *options,
 		       for_lu * 1e-6, total * 1e-6);
                 printf("** Total highmark (MB):\n"
 		       "    Sum-of-all : %8.2f | Avg : %8.2f  | Max : %8.2f\n",
-		       avg * 1e-6,  
+		       avg * 1e-6,
 		       avg / grid->nprow / grid->npcol * 1e-6,
 		       max * 1e-6);
 		printf("**************************************************\n");
 		fflush(stdout);
             }
 	} /* end printing stats */
-    
+
     } /* end if (!factored) */
 
-    
+
     if ( options->Fact == DOFACT || options->Fact == SamePattern ) {
 	/* Need to reset the solve's communication pattern,
 	   because perm_r[] and/or perm_c[] is changed.    */
@@ -1338,11 +1338,11 @@ pdgssvx(superlu_dist_options_t *options,
     /* Need to revisit: Why the following is not good enough for X-to-B
        distribution -- inv_perm_c changed */
 	pxgstrs_finalize(SOLVEstruct->gstrs_comm);
-	pxgstrs_init(A->ncol, m_loc, nrhs, fst_row, perm_r, perm_c, grid, 
+	pxgstrs_init(A->ncol, m_loc, nrhs, fst_row, perm_r, perm_c, grid,
 	             LUstruct->Glu_persist, SOLVEstruct);
 #endif
 
-	
+
     /* ------------------------------------------------------------
        Compute the solution matrix X.
        ------------------------------------------------------------*/
@@ -1352,7 +1352,7 @@ pdgssvx(superlu_dist_options_t *options,
 	    ABORT("Malloc fails for b_work[]");
 
 	/* ------------------------------------------------------------
-	   Scale the right-hand side if equilibration was performed. 
+	   Scale the right-hand side if equilibration was performed.
 	   ------------------------------------------------------------*/
 	if ( notran ) {
 	    if ( rowequ ) {
@@ -1403,22 +1403,22 @@ pdgssvx(superlu_dist_options_t *options,
 	       factorization with Fact == DOFACT or SamePattern is asked for. */
 	}
 
-	if ( options->DiagInv==YES && 
+	if ( options->DiagInv==YES &&
              (options->SolveInitialized == NO || Fact == SamePattern ||
               Fact == SamePattern_SameRowPerm) ) {
 	    pdCompute_Diag_Inv(n, LUstruct, grid, stat, info);
 	}
 
 
-    // #pragma omp parallel  
-    // {  	
+    // #pragma omp parallel
+    // {
 	// #pragma omp master
 	// {
-	pdgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc, 
+	pdgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc,
 		fst_row, ldb, nrhs, SOLVEstruct, stat, info);
 	// }
 	// }
-	
+
 	/* ------------------------------------------------------------
 	   Use iterative refinement to improve the computed solution and
 	   compute error bounds and backward error estimates for it.
@@ -1437,7 +1437,7 @@ pdgssvx(superlu_dist_options_t *options,
 		    pdgsmv_finalize(SOLVEstruct->gsmv_comm);
 	        pdgsmv_init(A, SOLVEstruct->row_to_proc, grid,
 			    SOLVEstruct->gsmv_comm);
-	       
+
                 /* Save a copy of the transformed local col indices
 		   in colind_gsmv[]. */
 	        if ( colind_gsmv ) SUPERLU_FREE(colind_gsmv);
@@ -1463,7 +1463,7 @@ pdgssvx(superlu_dist_options_t *options,
 		        }
 		    }
 	        }
-	      
+
 	        /* Re-use the local col indices of A obtained from the
 		   previous call to pdgsmv_init() */
 	        for (i = 0; i < nnz_loc; ++i) colind[i] = colind_gsmv[i];
@@ -1473,10 +1473,10 @@ pdgssvx(superlu_dist_options_t *options,
 	        SOLVEstruct1 = SOLVEstruct;
 	    } else { /* For nrhs > 1, since refinement is performed for RHS
 			one at a time, the communication structure for pdgstrs
-			is different than the solve with nrhs RHS. 
+			is different than the solve with nrhs RHS.
 			So we use SOLVEstruct1 for the refinement step.
 		     */
-	        if ( !(SOLVEstruct1 = (SOLVEstruct_t *) 
+	        if ( !(SOLVEstruct1 = (SOLVEstruct_t *)
 		                       SUPERLU_MALLOC(sizeof(SOLVEstruct_t))) )
 		    ABORT("Malloc fails for SOLVEstruct1");
 	        /* Copy the same stuff */
@@ -1487,12 +1487,12 @@ pdgssvx(superlu_dist_options_t *options,
 	        SOLVEstruct1->diag_len = SOLVEstruct->diag_len;
 	        SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm;
 	        SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv;
-		
+
 		/* Initialize the *gstrs_comm for 1 RHS. */
 		if ( !(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *)
 		       SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))) )
 		    ABORT("Malloc fails for gstrs_comm[]");
-		pxgstrs_init(n, m_loc, 1, fst_row, perm_r, perm_c, grid, 
+		pxgstrs_init(n, m_loc, 1, fst_row, perm_r, perm_c, grid,
 			     Glu_persist, SOLVEstruct1);
 	    }
 
@@ -1517,7 +1517,7 @@ pdgssvx(superlu_dist_options_t *options,
 	for (i = 0; i < m_loc; ++i)
 	  printf("\t(%d)\t%4d\t%.10f\n", iam, i+fst_row, B[i]);
 #endif
-	
+
 	/* Transform the solution matrix X to a solution of the original
 	   system before equilibration. */
 	if ( notran ) {
@@ -1560,10 +1560,10 @@ pdgssvx(superlu_dist_options_t *options,
 	        SUPERLU_FREE(R);
 		SUPERLU_FREE(C);
 		break;
-	    case ROW: 
+	    case ROW:
 		SUPERLU_FREE(C);
 		break;
-	    case COL: 
+	    case COL:
 		SUPERLU_FREE(R);
 		break;
 	}
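For illustration only (not part of the patch): the hunks above touch the equilibration logic of pdgssvx. For a non-transposed solve with row equilibration the right-hand side is scaled by R before the triangular solves and the computed solution is scaled by C afterwards, i.e. the solver works with A1 = diag(R)*A*diag(C), solves A1*y = diag(R)*b, and recovers x = diag(C)*y. A minimal sketch of that transformation with toy data (R, C and b here are hypothetical, not the routine's arrays):

    #include <stdio.h>

    int main(void) {
        double R[3] = {0.5, 1.0, 0.25};   /* row scale factors            */
        double C[3] = {2.0, 1.0, 4.0};    /* column scale factors         */
        double b[3] = {1.0, 2.0, 3.0};    /* right-hand side              */
        double y[3];

        for (int i = 0; i < 3; ++i) b[i] *= R[i];   /* scale RHS (rowequ) */
        /* ... triangular solves with the equilibrated factors give y ... */
        for (int i = 0; i < 3; ++i) y[i] = b[i];    /* stand-in for solve */
        for (int i = 0; i < 3; ++i) y[i] *= C[i];   /* unscale x (colequ) */

        printf("x = %g %g %g\n", y[0], y[1], y[2]);
        return 0;
    }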
diff -pruN 6.1.0+dfsg1-1/SRC/pdgstrf2.c 6.1.1+dfsg1-1/SRC/pdgstrf2.c
--- 6.1.0+dfsg1-1/SRC/pdgstrf2.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/pdgstrf2.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,16 +1,16 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
 
-/*! @file 
+/*! @file
  * \brief Performs panel LU factorization.
  *
  * <pre>
@@ -129,7 +129,7 @@ pdgstrf2_trsm
     u_diag_cnt = 0;
     incy = ld_ujrow;
 
-    if ( U_diag_blk_send_req && 
+    if ( U_diag_blk_send_req &&
 	 U_diag_blk_send_req[myrow] != MPI_REQUEST_NULL ) {
         /* There are pending sends - wait for all Isend to complete */
 #if ( PROFlevel>=1 )
@@ -259,7 +259,7 @@ pdgstrf2_trsm
 	stat->ops[FACT] += (flops_t) nsupc * (nsupc+1) * l;
     } else {  /* non-diagonal process */
         /* ================================================================== *
-         * Receive the diagonal block of U for panel factorization of L(:,k). * 
+         * Receive the diagonal block of U for panel factorization of L(:,k). *
          * Note: we block for panel factorization of L(:,k), but panel        *
 	 * factorization of U(:,k) do not block                               *
          * ================================================================== */
@@ -358,7 +358,7 @@ void pdgstrs2_omp
     nb = usub[0];
     iukp = BR_HEADER;
     rukp = 0;
-    
+
     int* blocks_index_pointers = SUPERLU_MALLOC (3 * nb * sizeof(int));
     int* blocks_value_pointers = blocks_index_pointers + nb;
     int* nsupc_temp = blocks_value_pointers + nb;
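The last hunk above allocates a single buffer of 3*nb ints and carves it into three equal-length arrays by pointer arithmetic. A stand-alone sketch of that idiom (plain malloc/free in place of SUPERLU_MALLOC, and nb is toy data):

    #include <stdio.h>
    #include <stdlib.h>

    int main(void) {
        int nb = 4;                                       /* toy block count */
        int *blocks_index_pointers = malloc(3 * nb * sizeof(int));
        if (!blocks_index_pointers) return 1;
        int *blocks_value_pointers = blocks_index_pointers + nb;  /* 2nd third */
        int *nsupc_temp            = blocks_value_pointers + nb;  /* 3rd third */

        for (int i = 0; i < nb; ++i) {        /* each sub-array holds nb ints */
            blocks_index_pointers[i] = i;
            blocks_value_pointers[i] = 10 * i;
            nsupc_temp[i]            = 100 * i;
        }
        printf("%d %d %d\n", blocks_index_pointers[1],
               blocks_value_pointers[1], nsupc_temp[1]);
        free(blocks_index_pointers);          /* one free releases all three  */
        return 0;
    }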
diff -pruN 6.1.0+dfsg1-1/SRC/pdgstrf.c 6.1.1+dfsg1-1/SRC/pdgstrf.c
--- 6.1.0+dfsg1-1/SRC/pdgstrf.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/pdgstrf.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,9 +1,9 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
@@ -28,11 +28,12 @@ at the top-level directory.
  *   December 31, 2015 rename xMACH to xMACH_DIST.
  *   September 30, 2017 optimization for Intel Knights Landing (KNL) node .
  *   June 1, 2018      add parallel AWPM pivoting; add back arrive_at_ublock()
+ *   February 8, 2019  version 6.1.1
  *
- * Sketch of the algorithm 
+ * Sketch of the algorithm
+ *
+ * =======================
  *
- * ======================= 
- *    
  * The following relations hold:
  *     * A_kk = L_kk * U_kk
  *     * L_ik = Aik * U_kk^(-1)
@@ -116,25 +117,25 @@ at the top-level directory.
 /*#include "cublas_dgemm.h"*/
 // #define NUM_CUDA_STREAMS 16
 // #define NUM_CUDA_STREAMS 16
-#endif 
+#endif
 
 /* Various definitions */
-/* 
-    Name    : SUPERNODE_PROFILE  
+/*
+    Name    : SUPERNODE_PROFILE
     Purpose : For SuperNode Level profiling of various measurements such as gigaflop/sec
     obtained,bandwidth achieved:
-    Overhead : Low 
+    Overhead : Low
 */
-// #define SUPERNODE_PROFILE   
+// #define SUPERNODE_PROFILE
 
-/* 
+/*
     Name    :   BASELINE
     Purpose : baseline to compare performance against
     Overhead : NA : this won't be used for running experiments
 */
 // #define BASELINE
 
-/* 
+/*
     Name    :   PHI_FRAMEWORK
     Purpose : To simulate and test algorithm used for offloading Phi
     Overhead : NA : this won't be used for running experiments
@@ -412,12 +413,12 @@ pdgstrf(superlu_dist_options_t * options
     if (m == 0 || n == 0) return 0;
 
     double tt1 = SuperLU_timer_ ();
- 
-    /* 
-     * Initialization.  
+
+    /*
+     * Initialization.
      */
     iam = grid->iam;
-    Pc = grid->npcol; 
+    Pc = grid->npcol;
     Pr = grid->nprow;
     myrow = MYROW (iam, grid);
     mycol = MYCOL (iam, grid);
@@ -426,7 +427,7 @@ pdgstrf(superlu_dist_options_t * options
     s_eps = smach_dist("Epsilon");
     thresh = s_eps * anorm;
 
-    MPI_Attr_get (MPI_COMM_WORLD, MPI_TAG_UB, &attr_val, &flag);
+    MPI_Comm_get_attr (MPI_COMM_WORLD, MPI_TAG_UB, &attr_val, &flag);
     if (!flag) {
         fprintf (stderr, "Could not get TAG_UB\n");
         return (-1);
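The only functional change in this hunk replaces MPI_Attr_get, deprecated since MPI-2, with MPI_Comm_get_attr; the argument list is identical. A stand-alone sketch of the new call querying the tag upper bound:

    #include <mpi.h>
    #include <stdio.h>

    int main(int argc, char **argv) {
        void *attr_val;
        int flag, tag_ub = 32767;      /* MPI guarantees MPI_TAG_UB >= 32767 */
        MPI_Init(&argc, &argv);
        /* Same arguments as the old MPI_Attr_get(comm, keyval, val, flag). */
        MPI_Comm_get_attr(MPI_COMM_WORLD, MPI_TAG_UB, &attr_val, &flag);
        if (flag) tag_ub = *(int *) attr_val;
        printf("MPI_TAG_UB = %d\n", tag_ub);
        MPI_Finalize();
        return 0;
    }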
@@ -504,9 +505,9 @@ pdgstrf(superlu_dist_options_t * options
         }
     }
 
-    log_memory( (Llu->bufmax[0] + Llu->bufmax[2]) * (num_look_aheads + 1) 
+    log_memory( (Llu->bufmax[0] + Llu->bufmax[2]) * (num_look_aheads + 1)
 		* iword +
-		(Llu->bufmax[1] + Llu->bufmax[3]) * (num_look_aheads + 1) 
+		(Llu->bufmax[1] + Llu->bufmax[3]) * (num_look_aheads + 1)
 		* dword, stat );
 
     /* creating pointers to the look-ahead buffers */
@@ -626,7 +627,7 @@ pdgstrf(superlu_dist_options_t * options
 
 #if ( DEBUGlevel >= 2 )
     PrintInt10("schedule:perm_c_supno", nsupers, perm_c_supno);
-    
+
     /* Turn off static schedule */
     printf("[%d] .. Turn off static schedule for debugging ..\n", iam);
     for (i = 0; i < nsupers; ++i) perm_c_supno[i] = iperm_c_supno[i] = i;
@@ -642,7 +643,7 @@ pdgstrf(superlu_dist_options_t * options
     for (lb = 0; lb < nsupers; lb++) look_ahead_l[lb] = -1; /* vectorized */
     log_memory(3 * nsupers * iword, stat);
 
-    /* Sherry: omp parallel? 
+    /* Sherry: omp parallel?
        not worth doing, due to concurrent write to look_ahead_l[jb] */
     for (lb = 0; lb < nrb; ++lb) { /* go through U-factor */
         ib = lb * Pr + myrow;
@@ -739,12 +740,12 @@ pdgstrf(superlu_dist_options_t * options
         fflush(stdout);
     }
 #endif
-   
+
     Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
     Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
     Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
     Unzval_br_ptr = Llu->Unzval_br_ptr;
-    ToRecv = Llu->ToRecv; 
+    ToRecv = Llu->ToRecv;
     ToSendD = Llu->ToSendD;
     ToSendR = Llu->ToSendR;
 
@@ -757,7 +758,7 @@ pdgstrf(superlu_dist_options_t * options
 
 #if 0
 #if defined _OPENMP  // Sherry: parallel reduction -- seems slower?
-#pragma omp parallel for reduction(max :local_max_row_size) private(lk,lsub) 
+#pragma omp parallel for reduction(max :local_max_row_size) private(lk,lsub)
 #endif
 #endif
     for (int i = mycol; i < nsupers; i += Pc) { /* grab my local columns */
@@ -778,7 +779,7 @@ pdgstrf(superlu_dist_options_t * options
     /* int_t buffer_size =
          SUPERLU_MAX (max_row_size * num_threads * ldt,
                       get_max_buffer_size ());           */
-            
+
 #ifdef GPU_ACC
     int cublas_nb = get_cublas_nb();
     int nstreams = get_num_cuda_streams ();
@@ -817,11 +818,11 @@ pdgstrf(superlu_dist_options_t * options
     /* bigU and bigV are either on CPU or on GPU, not both. */
     double* bigU; /* for storing entire U(k,:) panel, prepare for GEMM.
                      bigU has the same size either on CPU or on GPU. */
-    double* bigV; /* for storing GEMM output matrix, i.e. update matrix. 
+    double* bigV; /* for storing GEMM output matrix, i.e. update matrix.
 	              bigV is large to hold the aggregate GEMM output.*/
     bigU = NULL;
     bigV = NULL;
-				  
+
 #if ( PRNTlevel>=1 )
     if(!iam) {
 	printf("\t.. GEMM buffer size: max_row_size X max_ncols = %d x " IFMT "\n",
@@ -842,7 +843,7 @@ pdgstrf(superlu_dist_options_t * options
 #endif
     if ( checkCuda(cudaHostAlloc((void**)&bigV, bigv_size * sizeof(double) ,cudaHostAllocDefault)) )
         ABORT("Malloc fails for dgemm buffer V");
- 
+
     DisplayHeader();
 
 #if ( PRNTlevel>=1 )
@@ -853,19 +854,19 @@ pdgstrf(superlu_dist_options_t * options
     handle = (cublasHandle_t *) SUPERLU_MALLOC(sizeof(cublasHandle_t)*nstreams);
     for(int i = 0; i < nstreams; i++) handle[i] = create_handle();
 
-    // creating streams 
+    // creating streams
     cudaStream_t *streams;
     streams = (cudaStream_t *) SUPERLU_MALLOC(sizeof(cudaStream_t)*nstreams);
     for (int i = 0; i < nstreams; ++i)
         checkCuda( cudaStreamCreate(&streams[i]) );
-    
-    // allocating data in device 
+
+    // allocating data in device
     double *dA, *dB, *dC;
     cudaError_t cudaStat;
 #if 0
     // cudaStat = cudaMalloc( (void**)&dA, m*k*sizeof(double));
     // HOw much should be the size of dA?
-    // for time being just making it 
+    // for time being just making it
     // cudaStat = cudaMalloc( (void**)&dA, ((max_row_size*sp_ienv_dist(3)))* sizeof(double));
 #endif
 
@@ -889,11 +890,11 @@ pdgstrf(superlu_dist_options_t * options
         return 1;
     }
 
-    stat->gpu_buffer += ( max_row_size * sp_ienv_dist(3) 
+    stat->gpu_buffer += ( max_row_size * sp_ienv_dist(3)
 			  + bigu_size + buffer_size ) * dword;
 
 #else  /* not CUDA */
-    
+
     // for GEMM padding 0
     j = bigu_size / ldt;
     bigu_size += (gemm_k_pad * (j + ldt + gemm_n_pad));
@@ -904,7 +905,7 @@ pdgstrf(superlu_dist_options_t * options
 //    bigV = _mm_malloc(bigv_size * sizeof(double), 1<<12);
 //#else
     if ( !(bigU = doubleMalloc_dist(bigu_size)) )
-        ABORT ("Malloc fails for dgemm U buffer"); 
+        ABORT ("Malloc fails for dgemm U buffer");
           //Maximum size of bigU= sqrt(buffsize) ?
     // int bigv_size = 8 * ldt * ldt * num_threads;
     if ( !(bigV = doubleMalloc_dist(bigv_size)) )
@@ -915,7 +916,7 @@ pdgstrf(superlu_dist_options_t * options
 
     log_memory((bigv_size + bigu_size) * dword, stat);
 
-    // mlock(bigU,(bigu_size) * sizeof (double));   
+    // mlock(bigU,(bigu_size) * sizeof (double));
 
 #if ( PRNTlevel>=1 )
     if(!iam) {
@@ -951,7 +952,7 @@ pdgstrf(superlu_dist_options_t * options
 
     int_t mrb = (nsupers + Pr - 1) / Pr;
     int_t mcb = (nsupers + Pc - 1) / Pc;
-    
+
     RemainStRow     = intMalloc_dist(mrb);
 #if 0
     Remain_lptr     = (int *) _mm_malloc(sizeof(int)*mrb,1);
@@ -960,7 +961,7 @@ pdgstrf(superlu_dist_options_t * options
 #endif
     // mlock(Remain_lptr, sizeof(int)*mrb );
     Remain_ib       = intMalloc_dist(mrb);
-    
+
     Remain_info_t *Remain_info;
 #if 0
     Remain_info = (Remain_info_t *) _mm_malloc(mrb*sizeof(Remain_info_t),64);
@@ -1017,7 +1018,7 @@ pdgstrf(superlu_dist_options_t * options
         PDGSTRF2 (options, k0, k, thresh, Glu_persist, grid, Llu,
                   U_diag_blk_send_req, tag_ub, stat, info);
 
-        pdgstrf2_timer += SuperLU_timer_()-ttt1; 
+        pdgstrf2_timer += SuperLU_timer_()-ttt1;
 
         scp = &grid->rscp;      /* The scope of process row. */
 
@@ -1142,7 +1143,7 @@ pdgstrf(superlu_dist_options_t * options
                     PDGSTRF2 (options, kk0, kk, thresh, Glu_persist,
                               grid, Llu, U_diag_blk_send_req, tag_ub, stat, info);
 
-                     pdgstrf2_timer += SuperLU_timer_() - ttt1; 
+                     pdgstrf2_timer += SuperLU_timer_() - ttt1;
 
                     /* Multicasts numeric values of L(:,kk) to process rows. */
                     /* ttt1 = SuperLU_timer_(); */
@@ -1243,7 +1244,7 @@ pdgstrf(superlu_dist_options_t * options
         kk1 = k0;
         kk2 = SUPERLU_MIN (k0 + num_look_aheads, nsupers - 1);
         for (kk0 = kk1; kk0 < kk2; kk0++) {
-            kk = perm_c_supno[kk0]; /* order determined from static schedule */  
+            kk = perm_c_supno[kk0]; /* order determined from static schedule */
             if (factoredU[kk0] != 1 && look_ahead[kk] < k0) {
 		/* does not depend on current column k */
                 kcol = PCOL (kk, grid);
@@ -1309,7 +1310,7 @@ pdgstrf(superlu_dist_options_t * options
                             PDGSTRS2 (kk0, kk, Glu_persist, grid, Llu,
                                       stat);
                         }
-    
+
                         pdgstrs2_timer += SuperLU_timer_()-ttt2;
                         /* stat->time8 += SuperLU_timer_()-ttt2; */
 
@@ -1415,7 +1416,7 @@ pdgstrf(superlu_dist_options_t * options
                 } else {
                     msgcnt[0] = msgcntsU[look_id][0];
 #if (DEBUGlevel>=2)
-		    printf("\t[%d] k=%d, look_id=%d, recv_req[0] == MPI_REQUEST_NULL, msgcnt[0] = %d\n", 
+		    printf("\t[%d] k=%d, look_id=%d, recv_req[0] == MPI_REQUEST_NULL, msgcnt[0] = %d\n",
 			   iam, k, look_id, msgcnt[0]);
 #endif
                 }
@@ -1427,7 +1428,7 @@ pdgstrf(superlu_dist_options_t * options
                 } else {
                     msgcnt[1] = msgcntsU[look_id][1];
 #if (DEBUGlevel>=2)
-		    printf("\t[%d] k=%d, look_id=%d, recv_req[1] == MPI_REQUEST_NULL, msgcnt[1] = %d\n", 
+		    printf("\t[%d] k=%d, look_id=%d, recv_req[1] == MPI_REQUEST_NULL, msgcnt[1] = %d\n",
 			   iam, k, look_id, msgcnt[1]);
 #endif
                 }
@@ -1467,14 +1468,14 @@ pdgstrf(superlu_dist_options_t * options
             if (factoredU[k0] == -1) {
                 /* Parallel triangular solve across process row *krow* --
                    U(k,j) = L(k,k) \ A(k,j).  */
-                 double ttt2 = SuperLU_timer_(); 
+                 double ttt2 = SuperLU_timer_();
 #ifdef _OPENMP
 /* #pragma omp parallel */ /* Sherry -- parallel done inside pdgstrs2 */
 #endif
                 {
                     PDGSTRS2 (k0, k, Glu_persist, grid, Llu, stat);
                 }
-                pdgstrs2_timer += SuperLU_timer_() - ttt2; 
+                pdgstrs2_timer += SuperLU_timer_() - ttt2;
 
 	        /* Sherry -- need to set factoredU[k0] = 1; ?? */
 
@@ -1496,7 +1497,7 @@ pdgstrf(superlu_dist_options_t * options
                                       SLU_MPI_TAG (2, k0), /* (4*k0+2)%tag_ub */
                                       scp->comm);
                             MPI_Send (uval, msgcnt[3], MPI_DOUBLE, pi,
-                                      SLU_MPI_TAG (3, k0), /* (4*k0+3)%tag_ub */ 
+                                      SLU_MPI_TAG (3, k0), /* (4*k0+3)%tag_ub */
                                       scp->comm);
 #if ( PROFlevel>=1 )
                             TOC (t2, t1);
@@ -1624,9 +1625,9 @@ pdgstrf(superlu_dist_options_t * options
             }
             iukp = iukp0;
 #ifdef ISORT
-            /* iperm_u is sorted based on elimination order; 
+            /* iperm_u is sorted based on elimination order;
                perm_u reorders the U blocks to match the elimination order. */
-            isort (nub, iperm_u, perm_u); 
+            isort (nub, iperm_u, perm_u);
 #else
             qsort (perm_u, (size_t) nub, 2 * sizeof (int_t),
                    &superlu_sort_perm);
@@ -1686,11 +1687,11 @@ pdgstrf(superlu_dist_options_t * options
                         /* Factor diagonal and subdiagonal blocks and
 			   test for exact singularity.  */
                         factored[kk] = 0; /* flag column kk as factored */
-                        double ttt1 = SuperLU_timer_(); 
+                        double ttt1 = SuperLU_timer_();
                         PDGSTRF2 (options, kk0, kk, thresh,
                                   Glu_persist, grid, Llu, U_diag_blk_send_req,
                                   tag_ub, stat, info);
-                        pdgstrf2_timer += SuperLU_timer_() - ttt1; 
+                        pdgstrf2_timer += SuperLU_timer_() - ttt1;
 
                         /* Process column *kcol+1* multicasts numeric
 			   values of L(:,k+1) to process rows. */
@@ -1739,18 +1740,18 @@ pdgstrf(superlu_dist_options_t * options
 
 #include "dSchCompUdt-cuda.c"
 
-#else 
+#else
 
 /*#include "SchCompUdt--Phi-2Ddynamic-alt.c"*/
 //#include "dSchCompUdt-2Ddynamic_v6.c"
 
 #include "dSchCompUdt-2Ddynamic.c"
 
-#endif 
+#endif
 	/*uncomment following to compare against SuperLU 3.3 baseline*/
         /* #include "SchCompUdt--baseline.c"  */
 	/************************************************************************/
-        
+
         NetSchurUpTimer += SuperLU_timer_() - tsch;
 
     }  /* MAIN LOOP for k0 = 0, ... */
@@ -1758,7 +1759,7 @@ pdgstrf(superlu_dist_options_t * options
     /* ##################################################################
        ** END MAIN LOOP: for k0 = ...
        ################################################################## */
-    
+
     pxgstrfTimer = SuperLU_timer_() - pxgstrfTimer;
 
 #if ( PRNTlevel>=2 )
@@ -1779,13 +1780,13 @@ pdgstrf(superlu_dist_options_t * options
         printf("Time in Schur update \t\t %8.2lf seconds\n", NetSchurUpTimer);
         printf(".. Time to Gather L buffer\t %8.2lf  (Separate L panel by Lookahead/Remain)\n", GatherLTimer);
         printf(".. Time to Gather U buffer\t %8.2lf \n", GatherUTimer);
-	       
+
         printf(".. Time in GEMM %8.2lf \n",
 	       LookAheadGEMMTimer + RemainGEMMTimer);
         printf("\t* Look-ahead\t %8.2lf \n", LookAheadGEMMTimer);
-        printf("\t* Remain\t %8.2lf\tFlops %8.2le\tGflops %8.2lf\n", 
+        printf("\t* Remain\t %8.2lf\tFlops %8.2le\tGflops %8.2lf\n",
 	       RemainGEMMTimer, allflops, allflops/RemainGEMMTimer*1e-9);
-        printf(".. Time to Scatter %8.2lf \n", 
+        printf(".. Time to Scatter %8.2lf \n",
 	       LookAheadScatterTimer + RemainScatterTimer);
         printf("\t* Look-ahead\t %8.2lf \n", LookAheadScatterTimer);
         printf("\t* Remain\t %8.2lf \n", RemainScatterTimer);
@@ -1795,7 +1796,7 @@ pdgstrf(superlu_dist_options_t * options
 	printf("GEMM maximum block: %d-%d-%d\n", gemm_max_m, gemm_max_k, gemm_max_n);
     }
 #endif
-    
+
 #if ( DEBUGlevel>=3 )
     for (i = 0; i < Pr * Pc; ++i) {
         if (iam == i) {
@@ -1832,7 +1833,7 @@ pdgstrf(superlu_dist_options_t * options
     log_memory( -((Llu->bufmax[0] + Llu->bufmax[2]) * (num_look_aheads + 1) * iword +
 		  (Llu->bufmax[1] + Llu->bufmax[3]) * (num_look_aheads + 1) * dword),
 		stat );
-    
+
     SUPERLU_FREE (Lsub_buf_2);
     SUPERLU_FREE (Lval_buf_2);
     SUPERLU_FREE (Usub_buf_2);
@@ -1914,7 +1915,7 @@ pdgstrf(superlu_dist_options_t * options
     SUPERLU_FREE(Remain_info);
     SUPERLU_FREE(lookAhead_L_buff);
     SUPERLU_FREE(Remain_L_buff);
-    log_memory( -(3 * mrb * iword + mrb * sizeof(Remain_info_t) + 
+    log_memory( -(3 * mrb * iword + mrb * sizeof(Remain_info_t) +
 		  ldt * ldt * (num_look_aheads + 1) * dword +
 		  Llu->bufmax[1] * dword), stat );
 
@@ -1966,7 +1967,7 @@ pdgstrf(superlu_dist_options_t * options
 	    for (i = 0; i < gemm_count; ++i)
 		fprintf(fgemm, "%8d%8d%8d\t %20.16e\t%8d\n", gemm_stats[i].m, gemm_stats[i].n,
 			gemm_stats[i].k, gemm_stats[i].microseconds, prof_sendR[i]);
-	    
+
 	    fclose(fgemm);
         }
 	SUPERLU_FREE(gemm_stats);
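The factorization code above repeatedly maps supernode blocks onto the Pr x Pc process grid: MYROW/MYCOL place a rank on the grid, block rows are assigned cyclically (ib = lb * Pr + myrow), columns are walked with stride Pc, and mrb = (nsupers + Pr - 1) / Pr is the ceiling count of local block rows. A toy sketch of that block-cyclic arithmetic, assuming the conventional row-major grid layout behind the MYROW/MYCOL macros (worth checking against superlu_defs.h):

    #include <stdio.h>

    int main(void) {
        int Pr = 2, Pc = 3;                  /* 2 x 3 process grid            */
        int nsupers = 10;                    /* toy number of supernodes      */
        int iam = 4;                         /* example MPI rank              */
        int myrow = iam / Pc, mycol = iam % Pc;   /* row-major grid position  */

        int mrb = (nsupers + Pr - 1) / Pr;   /* ceiling: local block rows     */
        printf("rank %d sits at (%d,%d); at most %d local block rows\n",
               iam, myrow, mycol, mrb);

        /* Global block row k lives on process row k % Pr, local index k / Pr. */
        for (int lb = 0; lb * Pr + myrow < nsupers; ++lb)
            printf("local block row %d holds global block %d\n",
                   lb, lb * Pr + myrow);
        return 0;
    }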
diff -pruN 6.1.0+dfsg1-1/SRC/pdgstrs1.c 6.1.1+dfsg1-1/SRC/pdgstrs1.c
--- 6.1.0+dfsg1-1/SRC/pdgstrs1.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/pdgstrs1.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,16 +1,16 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
 
-/*! @file 
+/*! @file
  * \brief Solves a system of distributed linear equations
  *
  * <pre>
@@ -35,7 +35,7 @@ at the top-level directory.
 #ifdef _CRAY
 fortran void STRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, double*,
 		   double*, int*, double*, int*);
-fortran void SGEMM(_fcd, _fcd, int*, int*, int*, double*, double*, 
+fortran void SGEMM(_fcd, _fcd, int*, int*, int*, double*, double*,
 		   int*, double*, int*, double*, double*, int*);
 _fcd ftcs1;
 _fcd ftcs2;
@@ -59,7 +59,7 @@ _fcd ftcs3;
  * This routine is used only in the iterative refinement routine
  * pdgsrfs_ABXglobal, assuming that the right-hand side is already
  * distributed in the diagonal processes.
- * 
+ *
  * Arguments
  * =========
  *
@@ -85,13 +85,13 @@ _fcd ftcs3;
  *        Number of right-hand sides.
  *
  * stat   (output) SuperLUStat_t*
- *        Record the statistics about the triangular solves; 
+ *        Record the statistics about the triangular solves;
  *        See SuperLUStat_t structure defined in util.h.
  *
  * info   (output) int*
  * 	   = 0: successful exit
  *	   < 0: if info = -i, the i-th argument had an illegal value
- * </pre>      
+ * </pre>
  */
 
 void pdgstrs1(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid,
@@ -157,7 +157,7 @@ void pdgstrs1(int_t n, LUstruct_t *LUstr
 	pxerr_dist("PDGSTRS1", grid, -*info);
 	return;
     }
-	
+
     /*
      * Initialization.
      */
@@ -205,7 +205,7 @@ void pdgstrs1(int_t n, LUstruct_t *LUstr
 
     /* Allocate working storage. */
     knsupc = sp_ienv_dist(3);
-    if ( !(lsum = doubleCalloc_dist(((size_t)ldalsum) * nrhs 
+    if ( !(lsum = doubleCalloc_dist(((size_t)ldalsum) * nrhs
         + nlb * LSUM_H)) )
 	ABORT("Calloc fails for lsum[].");
     maxrecvsz = knsupc * nrhs + SUPERLU_MAX(XK_H, LSUM_H);
@@ -214,7 +214,7 @@ void pdgstrs1(int_t n, LUstruct_t *LUstr
     if ( !(rtemp = doubleCalloc_dist(maxrecvsz)) )
 	ABORT("Malloc fails for rtemp[].");
 
-    
+
     /*---------------------------------------------------
      * Forward solve Ly = b.
      *---------------------------------------------------*/
@@ -228,7 +228,7 @@ void pdgstrs1(int_t n, LUstruct_t *LUstr
 	if ( myrow == krow ) {
 	    lk = LBi( k, grid );   /* Local block number. */
 	    il = LSUM_BLK( lk );
-	    lsum[il - LSUM_H] = k; 
+	    lsum[il - LSUM_H] = k;
 	}
     }
 
@@ -250,7 +250,7 @@ void pdgstrs1(int_t n, LUstruct_t *LUstr
 	    }
 	}
 	/*PrintInt10("mod_bit", nlb, mod_bit);*/
-	
+
 	/* Every process receives the count, but it is only useful on the
 	   diagonal processes.  */
 	MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, scp->comm );
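The MPI_Allreduce just above turns per-process contribution flags (mod_bit) into per-block counts (frecv), so each diagonal process knows how many lsum updates to wait for. A toy stand-alone version of that counting step (the block count and flag pattern are made up):

    #include <mpi.h>
    #include <stdio.h>

    int main(int argc, char **argv) {
        int nlb = 4;                          /* toy number of local blocks   */
        int mod_bit[4] = {0, 0, 0, 0};
        int frecv[4];
        int rank;

        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);

        /* Each rank flags the blocks it will send an update for. */
        mod_bit[rank % nlb] = 1;

        /* Summing the flags tells the owner of every block how many
           contributions it should expect to receive. */
        MPI_Allreduce(mod_bit, frecv, nlb, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

        if (rank == 0)
            printf("expected updates per block: %d %d %d %d\n",
                   frecv[0], frecv[1], frecv[2], frecv[3]);
        MPI_Finalize();
        return 0;
    }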
@@ -315,10 +315,10 @@ void pdgstrs1(int_t n, LUstruct_t *LUstr
 		STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
 		      lusup, &nsupr, &x[ii], &knsupc);
 #elif defined (USE_VENDOR_BLAS)
-		dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+		dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha,
 		       lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
 #else
-		dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+		dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha,
 		       lusup, &nsupr, &x[ii], &knsupc);
 #endif
 		/*stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;*/
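The dtrsm_ calls above perform the per-supernode triangular solve L * X = B with a unit lower-triangular diagonal block ("L", "L", "N", "U"); the extra 1, 1, 1, 1 arguments in the USE_VENDOR_BLAS branch are the hidden character-length arguments some Fortran calling conventions expect. A small stand-alone solve with the same options, assuming a Fortran BLAS that provides the dtrsm_ symbol (link with -lblas or similar):

    #include <stdio.h>

    /* Fortran BLAS routine; the hidden string-length arguments are omitted
       here, as in the generic (non-vendor) branch above. */
    extern void dtrsm_(const char *side, const char *uplo, const char *transa,
                       const char *diag, const int *m, const int *n,
                       const double *alpha, const double *a, const int *lda,
                       double *b, const int *ldb);

    int main(void) {
        int m = 3, nrhs = 1, lda = 3, ldb = 3;
        double alpha = 1.0;
        /* Column-major unit lower-triangular L = [1 0 0; 2 1 0; 3 4 1]. */
        double L[9] = {1, 2, 3,  0, 1, 4,  0, 0, 1};
        double B[3] = {1, 4, 14};                        /* right-hand side */
        dtrsm_("L", "L", "N", "U", &m, &nrhs, &alpha, L, &lda, B, &ldb);
        printf("x = %g %g %g\n", B[0], B[1], B[2]);      /* expect 1 2 3    */
        return 0;
    }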
@@ -326,7 +326,7 @@ void pdgstrs1(int_t n, LUstruct_t *LUstr
 #if ( DEBUGlevel>=2 )
 		printf("(%2d) Solve X[%2d]\n", iam, k);
 #endif
-		
+
 		/*
 		 * Send Xk to process column Pc[k].
 		 */
@@ -339,7 +339,7 @@ void pdgstrs1(int_t n, LUstruct_t *LUstr
                                    &send_req[Llu->SolveMsgSent++]);
 #else
 			MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
-				 MPI_DOUBLE, 
+				 MPI_DOUBLE,
                                  pi, Xk, grid->comm );
 #endif
 #if ( DEBUGlevel>=2 )
@@ -347,14 +347,14 @@ void pdgstrs1(int_t n, LUstruct_t *LUstr
 			       iam, x[ii-XK_H], pi);
 #endif
 		    }
-		
+
 		/*
 		 * Perform local block modifications: lsum[i] -= L_i,k * X[k]
 		 */
 		nb = lsub[0] - 1;
 		lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
 		luptr = knsupc; /* Skip diagonal block L(k,k). */
-		
+
 		dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
 			   fmod, nb, lptr, luptr, xsup, grid, Llu,
 			   send_req, stat);
@@ -388,7 +388,7 @@ void pdgstrs1(int_t n, LUstruct_t *LUstr
 #if ( DEBUGlevel>=2 )
 	printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
 #endif
-	
+
 	switch ( status.MPI_TAG ) {
 	  case Xk:
 	      --nfrecvx;
@@ -431,17 +431,17 @@ void pdgstrs1(int_t n, LUstruct_t *LUstr
 		  STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
 			lusup, &nsupr, &x[ii], &knsupc);
 #elif defined (USE_VENDOR_BLAS)
-		  dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+		  dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha,
 			 lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
 #else
-		  dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+		  dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha,
 			 lusup, &nsupr, &x[ii], &knsupc);
 #endif
 		  /*stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;*/
 #if ( DEBUGlevel>=2 )
 		  printf("(%2d) Solve X[%2d]\n", iam, k);
 #endif
-		
+
 		  /*
 		   * Send Xk to process column Pc[k].
 		   */
@@ -623,7 +623,7 @@ void pdgstrs1(int_t n, LUstruct_t *LUstr
     if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) )
         ABORT("Malloc fails for Ucb_valptr[]");
 
-    /* Count number of row blocks in a block column. 
+    /* Count number of row blocks in a block column.
        One pass of the skeleton graph of U. */
     for (lk = 0; lk < nlb; ++lk) {
 	usub = Ufstnz_br_ptr[lk];
@@ -680,7 +680,7 @@ void pdgstrs1(int_t n, LUstruct_t *LUstr
 		    for (i = 0; i < Urbs[lb]; ++i)
 			printf("(%2d) .. row blk %2d:\
                                lbnum %d, indpos %d, valpos %d\n",
-			       iam, i, 
+			       iam, i,
 			       Ucb_indptr[lb][i].lbnum,
 			       Ucb_indptr[lb][i].indpos,
 			       Ucb_valptr[lb][i]);
@@ -733,10 +733,10 @@ void pdgstrs1(int_t n, LUstruct_t *LUstr
 		STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
 		      lusup, &nsupr, &x[ii], &knsupc);
 #elif defined (USE_VENDOR_BLAS)
-		dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+		dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha,
 		       lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
 #else
-		dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+		dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha,
 		       lusup, &nsupr, &x[ii], &knsupc);
 #endif
 		/*stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs;*/
@@ -763,11 +763,11 @@ void pdgstrs1(int_t n, LUstruct_t *LUstr
 			       iam, x[ii-XK_H], pi);
 #endif
 		    }
-		
+
 		/*
 		 * Perform local block modifications: lsum[i] -= U_i,k * X[k]
 		 */
-		if ( Urbs[lk] ) 
+		if ( Urbs[lk] )
 		    dlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs,
 			       Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
 			       send_req, stat);
@@ -782,7 +782,7 @@ void pdgstrs1(int_t n, LUstruct_t *LUstr
     while ( nbrecvx || nbrecvmod ) { /* While not finished. */
 
 	/* Receive a message. */
-	MPI_Recv( recvbuf, maxrecvsz, MPI_DOUBLE, 
+	MPI_Recv( recvbuf, maxrecvsz, MPI_DOUBLE,
                  MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status );
 	k = *recvbuf;
 
@@ -824,10 +824,10 @@ void pdgstrs1(int_t n, LUstruct_t *LUstr
 		    STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
 			  lusup, &nsupr, &x[ii], &knsupc);
 #elif defined (USE_VENDOR_BLAS)
-		    dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+		    dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha,
 			   lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
 #else
-		    dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+		    dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha,
 			   lusup, &nsupr, &x[ii], &knsupc);
 #endif
 		    /*stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs;*/
@@ -854,9 +854,9 @@ void pdgstrs1(int_t n, LUstruct_t *LUstr
 				   iam, x[ii - XK_H], pi);
 #endif
 			}
-		
+
 		    /*
-		     * Perform local block modifications: 
+		     * Perform local block modifications:
 		     *         lsum[i] -= U_i,k * X[k]
 		     */
 		    if ( Urbs[lk] )
@@ -864,14 +864,14 @@ void pdgstrs1(int_t n, LUstruct_t *LUstr
 				   Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
 				   send_req, stat);
 		} /* if becomes solvable */
-		
+
 		break;
 
 #if ( DEBUGlevel>=2 )
 	      default:
 		printf("(%2d) Recv'd wrong message tag %4d\n", iam, status.MPI_TAG);
 		break;
-#endif		
+#endif
 
 	} /* switch */
 
@@ -902,7 +902,7 @@ void pdgstrs1(int_t n, LUstruct_t *LUstr
     for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]);
     SUPERLU_FREE(send_req);
 #endif
-    
+
 #if ( DEBUGlevel>=1 )
     CHECK_MALLOC(iam, "Exit pdgstrs1()");
 #endif
diff -pruN 6.1.0+dfsg1-1/SRC/pdgstrs_Bglobal.c 6.1.1+dfsg1-1/SRC/pdgstrs_Bglobal.c
--- 6.1.0+dfsg1-1/SRC/pdgstrs_Bglobal.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/pdgstrs_Bglobal.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,16 +1,16 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
 
-/*! @file 
+/*! @file
  * \brief Solves a system of distributed linear equations A*X = B with a general N-by-N matrix A using the LU factorization
  *
  * <pre>
@@ -35,7 +35,7 @@ at the top-level directory.
 #ifdef _CRAY
 fortran void STRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, double*,
 		   double*, int*, double*, int*);
-fortran void SGEMM(_fcd, _fcd, int*, int*, int*, double*, double*, 
+fortran void SGEMM(_fcd, _fcd, int*, int*, int*, double*, double*,
 		   int*, double*, int*, double*, double*, int*);
 _fcd ftcs1;
 _fcd ftcs2;
@@ -54,7 +54,7 @@ static void gather_diag_to_all(int_t, in
  * pdgstrs_Bglobal solves a system of distributed linear equations
  * A*X = B with a general N-by-N matrix A using the LU factorization
  * computed by pdgstrf.
- * 
+ *
  * Arguments
  * =========
  *
@@ -81,7 +81,7 @@ static void gather_diag_to_all(int_t, in
  *        On exit, the solution matrix of the possibly equilibrated
  *        and row permuted system if info = 0;
  *
- *        NOTE: Currently, the N-by-NRHS  matrix B must reside on all 
+ *        NOTE: Currently, the N-by-NRHS  matrix B must reside on all
  *              processes when calling this routine.
  *
  * ldb    (input) int (global)
@@ -97,12 +97,12 @@ static void gather_diag_to_all(int_t, in
  * info   (output) int*
  * 	   = 0: successful exit
  *	   < 0: if info = -i, the i-th argument had an illegal value
- * </pre>    
+ * </pre>
  */
 
 void
-pdgstrs_Bglobal(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid, 
-                double *B, int_t ldb, int nrhs, 
+pdgstrs_Bglobal(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid,
+                double *B, int_t ldb, int nrhs,
                 SuperLUStat_t *stat, int *info)
 {
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
@@ -155,7 +155,7 @@ pdgstrs_Bglobal(int_t n, LUstruct_t *LUs
 #endif
 
     int_t *mod_bit = Llu->mod_bit; /* flag contribution from each row block */
- 
+
     t = SuperLU_timer_();
 
     /* Test input parameters. */
@@ -166,7 +166,7 @@ pdgstrs_Bglobal(int_t n, LUstruct_t *LUs
 	pxerr_dist("PDGSTRS_BGLOBAL", grid, -*info);
 	return;
     }
-	
+
     /*
      * Initialization.
      */
@@ -216,10 +216,10 @@ pdgstrs_Bglobal(int_t n, LUstruct_t *LUs
     /* Allocate working storage. */
     knsupc = sp_ienv_dist(3);
     maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H );
-    if ( !(lsum = doubleCalloc_dist(((size_t)ldalsum) * nrhs 
+    if ( !(lsum = doubleCalloc_dist(((size_t)ldalsum) * nrhs
         + nlb * LSUM_H)) )
 	ABORT("Calloc fails for lsum[].");
-    if ( !(x = doubleMalloc_dist(((size_t)ldalsum) * nrhs 
+    if ( !(x = doubleMalloc_dist(((size_t)ldalsum) * nrhs
         + nlb * XK_H)) )
 	ABORT("Malloc fails for x[].");
     if ( !(recvbuf = doubleMalloc_dist(maxrecvsz)) )
@@ -227,7 +227,7 @@ pdgstrs_Bglobal(int_t n, LUstruct_t *LUs
     if ( !(rtemp = doubleCalloc_dist(maxrecvsz)) )
 	ABORT("Malloc fails for rtemp[].");
 
-    
+
     /*---------------------------------------------------
      * Forward solve Ly = b.
      *---------------------------------------------------*/
@@ -272,7 +272,7 @@ pdgstrs_Bglobal(int_t n, LUstruct_t *LUs
 		    mod_bit[lk] = 1;  /* contribution from off-diagonal */
 	    }
 	}
-	
+
 	/* Every process receives the count, but it is only useful on the
 	   diagonal processes.  */
 	MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, scp->comm );
@@ -337,10 +337,10 @@ pdgstrs_Bglobal(int_t n, LUstruct_t *LUs
 		STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
 		      lusup, &nsupr, &x[ii], &knsupc);
 #elif defined (USE_VENDOR_BLAS)
-		dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+		dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha,
 		       lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
 #else
-		dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+		dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha,
 		       lusup, &nsupr, &x[ii], &knsupc);
 #endif
 		stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;
@@ -348,7 +348,7 @@ pdgstrs_Bglobal(int_t n, LUstruct_t *LUs
 #if ( DEBUGlevel>=2 )
 		printf("(%2d) Solve X[%2d]\n", iam, k);
 #endif
-		
+
 		/*
 		 * Send Xk to process column Pc[k].
 		 */
@@ -366,7 +366,7 @@ pdgstrs_Bglobal(int_t n, LUstruct_t *LUs
 #else
 
 			MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
-				  MPI_DOUBLE, 
+				  MPI_DOUBLE,
                                   pi, Xk, grid->comm );
 #endif
 #endif
@@ -382,9 +382,9 @@ pdgstrs_Bglobal(int_t n, LUstruct_t *LUs
 		nb = lsub[0] - 1;
 		lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
 		luptr = knsupc; /* Skip diagonal block L(k,k). */
-		
+
 		dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
-			   fmod, nb, lptr, luptr, xsup, grid, Llu, 
+			   fmod, nb, lptr, luptr, xsup, grid, Llu,
 			   send_req,stat);
 	    }
 	} /* if diagonal process ... */
@@ -418,7 +418,7 @@ pdgstrs_Bglobal(int_t n, LUstruct_t *LUs
 #if ( DEBUGlevel>=2 )
 	printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
 #endif
-	
+
 	switch ( status.MPI_TAG ) {
 	  case Xk:
 	      --nfrecvx;
@@ -435,7 +435,7 @@ pdgstrs_Bglobal(int_t n, LUstruct_t *LUs
 		   * Perform local block modifications: lsum[i] -= L_i,k * X[k]
 		   */
 		  dlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k,
-			     fmod, nb, lptr, luptr, xsup, grid, Llu, 
+			     fmod, nb, lptr, luptr, xsup, grid, Llu,
 			     send_req, stat);
 	      } /* if lsub */
 
@@ -461,10 +461,10 @@ pdgstrs_Bglobal(int_t n, LUstruct_t *LUs
 		  STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
 			lusup, &nsupr, &x[ii], &knsupc);
 #elif defined (USE_VENDOR_BLAS)
-		  dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+		  dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha,
 			 lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
 #else
-		  dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+		  dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha,
 			 lusup, &nsupr, &x[ii], &knsupc);
 #endif
 		  stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;
@@ -472,7 +472,7 @@ pdgstrs_Bglobal(int_t n, LUstruct_t *LUs
 #if ( DEBUGlevel>=2 )
 		  printf("(%2d) Solve X[%2d]\n", iam, k);
 #endif
-		
+
 		  /*
 		   * Send Xk to process column Pc[k].
 		   */
@@ -482,7 +482,7 @@ pdgstrs_Bglobal(int_t n, LUstruct_t *LUs
 			  pi = PNUM( p, kcol, grid );
 #ifdef ISEND_IRECV
 			  MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
-				     MPI_DOUBLE, pi, Xk, grid->comm, 
+				     MPI_DOUBLE, pi, Xk, grid->comm,
 				     &send_req[Llu->SolveMsgSent++]);
 #else
 #ifdef BSEND
@@ -513,7 +513,7 @@ pdgstrs_Bglobal(int_t n, LUstruct_t *LUs
 
 	      break;
 
-#if ( DEBUGlevel>=2 )	      
+#if ( DEBUGlevel>=2 )
 	    default:
 	      printf("(%2d) Recv'd wrong message tag %4d\n", iam, status.MPI_TAG);
 	      break;
@@ -659,7 +659,7 @@ pdgstrs_Bglobal(int_t n, LUstruct_t *LUs
     if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) )
         ABORT("Malloc fails for Ucb_valptr[]");
 
-    /* Count number of row blocks in a block column. 
+    /* Count number of row blocks in a block column.
        One pass of the skeleton graph of U. */
     for (lk = 0; lk < nlb; ++lk) {
 	usub = Ufstnz_br_ptr[lk];
@@ -717,7 +717,7 @@ pdgstrs_Bglobal(int_t n, LUstruct_t *LUs
 		    for (i = 0; i < Urbs[lb]; ++i)
 			printf("(%2d) .. row blk %2d:\
                                lbnum %d, indpos %d, valpos %d\n",
-			       iam, i, 
+			       iam, i,
 			       Ucb_indptr[lb][i].lbnum,
 			       Ucb_indptr[lb][i].indpos,
 			       Ucb_valptr[lb][i]);
@@ -770,10 +770,10 @@ pdgstrs_Bglobal(int_t n, LUstruct_t *LUs
 		STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
 		      lusup, &nsupr, &x[ii], &knsupc);
 #elif defined (USE_VENDOR_BLAS)
-		dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+		dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha,
 		       lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
 #else
-		dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+		dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha,
 		       lusup, &nsupr, &x[ii], &knsupc);
 #endif
 		stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs;
@@ -809,7 +809,7 @@ pdgstrs_Bglobal(int_t n, LUstruct_t *LUs
 		/*
 		 * Perform local block modifications: lsum[i] -= U_i,k * X[k]
 		 */
-		if ( Urbs[lk] ) 
+		if ( Urbs[lk] )
 		    dlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs,
 			       Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
 			       send_req, stat);
@@ -826,7 +826,7 @@ pdgstrs_Bglobal(int_t n, LUstruct_t *LUs
 	/* Receive a message. */
 	MPI_Recv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE,
 		 MPI_ANY_TAG, grid->comm, &status );
-	
+
 	k = *recvbuf;
 
 #if ( DEBUGlevel>=2 )
@@ -842,7 +842,7 @@ pdgstrs_Bglobal(int_t n, LUstruct_t *LUs
 		 *         lsum[i] -= U_i,k * X[k]
 		 */
 		dlsum_bmod(lsum, x, &recvbuf[XK_H], nrhs, k, bmod, Urbs,
-			   Ucb_indptr, Ucb_valptr, xsup, grid, Llu, 
+			   Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
 			   send_req, stat);
 
 	        break;
@@ -867,10 +867,10 @@ pdgstrs_Bglobal(int_t n, LUstruct_t *LUs
 		    STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
 			  lusup, &nsupr, &x[ii], &knsupc);
 #elif defined (USE_VENDOR_BLAS)
-		    dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+		    dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha,
 			   lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
 #else
-		    dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+		    dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha,
 			   lusup, &nsupr, &x[ii], &knsupc);
 #endif
 		    stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs;
@@ -904,7 +904,7 @@ pdgstrs_Bglobal(int_t n, LUstruct_t *LUs
 			}
 		    }
 		    /*
-		     * Perform local block modifications: 
+		     * Perform local block modifications:
 		     *         lsum[i] -= U_i,k * X[k]
 		     */
 		    if ( Urbs[lk] )
@@ -912,14 +912,14 @@ pdgstrs_Bglobal(int_t n, LUstruct_t *LUs
 				   Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
 				   send_req, stat);
 		} /* if becomes solvable */
-		
+
 		break;
 
 #if ( DEBUGlevel>=2 )
 	      default:
 		printf("(%2d) Recv'd wrong message tag %4d\n", iam, status.MPI_TAG);
 		break;
-#endif		
+#endif
 
 	} /* switch */
 
@@ -997,7 +997,7 @@ gather_diag_to_all(int_t n, int_t nrhs,
     int_t *ilsum, *xsup;
     int iam, knsupc, pkk;
     double *x_col, *y_col;
-    
+
     iam = grid->iam;
     nsupers = Glu_persist->supno[n-1] + 1;
     xsup = Glu_persist->xsup;
diff -pruN 6.1.0+dfsg1-1/SRC/pdgstrs.c 6.1.1+dfsg1-1/SRC/pdgstrs.c
--- 6.1.0+dfsg1-1/SRC/pdgstrs.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/pdgstrs.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,24 +1,25 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
 
-/*! @file 
+/*! @file
  * \brief Solves a system of distributed linear equations A*X = B with a
  * general N-by-N matrix A using the LU factors computed previously.
  *
  * <pre>
- * -- Distributed SuperLU routine (version 6.0) --
+ * -- Distributed SuperLU routine (version 6.1) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * October 15, 2008
  * September 18, 2018  version 6.0
+ * February 8, 2019  version 6.1.1
  * </pre>
  */
 #include <math.h>
@@ -36,13 +37,13 @@ at the top-level directory.
  *   while ( not finished ) { .. use message counter to control
  *
 *      receive a message;
- * 	
+ *
  * 	if ( message is Xk ) {
  * 	    perform local block modifications into lsum[];
  *                 lsum[i] -= L_i,k * X[k]
  *          if all local updates done, Isend lsum[] to diagonal process;
  *
- *      } else if ( message is LSUM ) { .. this must be a diagonal process 
+ *      } else if ( message is LSUM ) { .. this must be a diagonal process
  *          accumulate LSUM;
  *          if ( all LSUM are received ) {
  *              perform triangular solve for Xi;
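Purely as an illustration of the loop sketched in this comment (the names below are schematic, not the routine's actual variables), the receive-driven structure mirrors the while (nfrecvx || nfrecvmod) / switch on status.MPI_TAG pattern used in these solve routines:

    #include <mpi.h>

    enum { Xk_TAG = 1, LSUM_TAG = 2 };   /* schematic tags, not the real ones */

    /* Schematic skeleton of the message-driven forward solve described above. */
    static void solve_event_loop(MPI_Comm comm, int nfrecvx, int nfrecvmod,
                                 double *recvbuf, int maxrecvsz)
    {
        MPI_Status status;
        while (nfrecvx || nfrecvmod) {             /* "not finished"          */
            MPI_Recv(recvbuf, maxrecvsz, MPI_DOUBLE,
                     MPI_ANY_SOURCE, MPI_ANY_TAG, comm, &status);
            if (status.MPI_TAG == Xk_TAG) {        /* a solved block X[k]     */
                --nfrecvx;
                /* lsum[i] -= L_{i,k} * X[k]; when block i has no local
                   updates left, Isend its lsum to the diagonal process.    */
            } else if (status.MPI_TAG == LSUM_TAG) {  /* an lsum contribution */
                --nfrecvmod;
                /* accumulate; once all contributions for block i arrived,
                   solve the diagonal block and Isend X[i] down the column. */
            }
        }
    }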
@@ -52,7 +53,7 @@ at the top-level directory.
  *      }
  *   }
  *
- * 
+ *
  * Auxiliary data structures: lsum[] / ilsum (pointer to lsum array)
  * =======================
  *
@@ -67,7 +68,7 @@ at the top-level directory.
  *         | | |  <- header of size 2     ---
  *         --------- <--------------------| |
  *         | | | | |			  ---
- * 	   | | | | |	      |-----------| |		
+ * 	   | | | | |	      |-----------| |
  *         | | | | | 	      |           ---
  *	   ---------          |   |-------| |
  *         | | |  <- header   |   |       ---
@@ -83,7 +84,7 @@ at the top-level directory.
  *         | | | | |                 |
  *	   --------- <---------------|
  */
-  
+
 /*#define ISEND_IRECV*/
 
 /*
@@ -103,7 +104,7 @@ _fcd ftcs3;
  * Purpose
  * =======
  *   Re-distribute B on the diagonal processes of the 2D process mesh.
- * 
+ *
  * Note
  * ====
  *   This routine can only be called after the routine pxgstrs_init(),
@@ -111,7 +112,7 @@ _fcd ftcs3;
  *
  * Arguments
  * =========
- * 
+ *
  * B      (input) double*
  *        The distributed right-hand side matrix of the possibly
  *        equilibrated system.
@@ -198,8 +199,8 @@ pdReDistribute_B_to_X(double *B, int_t m
        NOW COMMUNICATE THE ACTUAL DATA.
        ------------------------------------------------------------*/
 
-	if(procs==1){ // faster memory copy when procs=1 
-	
+	if(procs==1){ // faster memory copy when procs=1
+
 #ifdef _OPENMP
 #pragma omp parallel default (shared)
 #endif
@@ -207,20 +208,20 @@ pdReDistribute_B_to_X(double *B, int_t m
 #ifdef _OPENMP
 #pragma omp master
 #endif
-	{	
+	{
 		// t = SuperLU_timer_();
 #ifdef _OPENMP
-#pragma	omp	taskloop private (i,l,irow,k,j,knsupc) untied 
+#pragma	omp	taskloop private (i,l,irow,k,j,knsupc) untied
 #endif
 		for (i = 0; i < m_loc; ++i) {
 			irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*B */
-	   
+
 			k = BlockNum( irow );
 			knsupc = SuperSize( k );
 			l = X_BLK( k );
-			
+
 			x[l - XK_H] = k;      /* Block number prepended in the header. */
-			
+
 			irow = irow - FstBlockC(k); /* Relative row number in X-block */
 			RHS_ITERATE(j) {
 			x[l + irow + j*knsupc] = B[i + j*ldb];
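The loop above maps a permuted global row to its supernodal block: BlockNum(irow) gives the supernode, FstBlockC(k) its first row, SuperSize(k) its size, and the block number is stashed in the header slot x[l - XK_H]. A toy lookup with a hypothetical supernode partition xsup (the real macros read Glu_persist):

    #include <stdio.h>

    int main(void) {
        /* Hypothetical supernode boundaries: supernode k covers rows
           xsup[k] .. xsup[k+1]-1, so the sizes here are 3, 2 and 4.      */
        int xsup[] = {0, 3, 5, 9};
        int nsuper = 3;
        int irow = 6;                        /* a (permuted) global row    */

        int k = 0;
        while (k + 1 < nsuper && irow >= xsup[k + 1]) ++k;   /* BlockNum   */
        int knsupc = xsup[k + 1] - xsup[k];                  /* SuperSize  */
        int rel    = irow - xsup[k];                         /* - FstBlockC */

        printf("row %d -> block %d (size %d), relative row %d\n",
               irow, k, knsupc, rel);        /* row 6 -> block 2, rel 1    */
        return 0;
    }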
@@ -238,19 +239,19 @@ pdReDistribute_B_to_X(double *B, int_t m
 			ABORT("Malloc fails for send_dbuf[].");
 		recv_dbuf = send_dbuf + k * nrhs;
 		if ( !(req_send = (MPI_Request*) SUPERLU_MALLOC(procs*sizeof(MPI_Request))) )
-			ABORT("Malloc fails for req_send[].");	
+			ABORT("Malloc fails for req_send[].");
 		if ( !(req_recv = (MPI_Request*) SUPERLU_MALLOC(procs*sizeof(MPI_Request))) )
 			ABORT("Malloc fails for req_recv[].");
 		if ( !(status_send = (MPI_Status*) SUPERLU_MALLOC(procs*sizeof(MPI_Status))) )
 			ABORT("Malloc fails for status_send[].");
 		if ( !(status_recv = (MPI_Status*) SUPERLU_MALLOC(procs*sizeof(MPI_Status))) )
 			ABORT("Malloc fails for status_recv[].");
-		
+
 		for (p = 0; p < procs; ++p) {
 			ptr_to_ibuf[p] = sdispls[p];
 			ptr_to_dbuf[p] = sdispls[p] * nrhs;
 		}
-		
+
 		/* Copy the row indices and values to the send buffer. */
 		// t = SuperLU_timer_();
 		for (i = 0, l = fst_row; i < m_loc; ++i, ++l) {
@@ -260,18 +261,18 @@ pdReDistribute_B_to_X(double *B, int_t m
 		k = ptr_to_ibuf[p];
 		send_ibuf[k] = irow;
 		++ptr_to_ibuf[p];
-		
+
 		k = ptr_to_dbuf[p];
 		RHS_ITERATE(j) { /* RHS is stored in row major in the buffer. */
 			send_dbuf[k++] = B[i + j*ldb];
 		}
 		ptr_to_dbuf[p] += nrhs;
 		}
-		
+
 		// t = SuperLU_timer_() - t;
-		// printf(".. copy to send buffer time\t%8.4f\n", t);	
+		// printf(".. copy to send buffer time\t%8.4f\n", t);
 
-#if 0	
+#if 0
 	#if 1
 		/* Communicate the (permuted) row indices. */
 		MPI_Alltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t,
@@ -280,17 +281,17 @@ pdReDistribute_B_to_X(double *B, int_t m
 		MPI_Alltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, MPI_DOUBLE,
 			  recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, MPI_DOUBLE,
 			  grid->comm);
-	#else	
+	#else
  		/* Communicate the (permuted) row indices. */
 		MPI_Ialltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t,
 				recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm, &req_i);
  		/* Communicate the numerical values. */
 		MPI_Ialltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, MPI_DOUBLE,
 				recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, MPI_DOUBLE,
-				grid->comm, &req_d);	
+				grid->comm, &req_d);
 		MPI_Wait(&req_i,&status);
 		MPI_Wait(&req_d,&status);
- 	#endif	 
+ 	#endif
 #endif
 	MPI_Barrier( grid->comm );
 
@@ -304,7 +305,7 @@ pdReDistribute_B_to_X(double *B, int_t m
 		ppr = grid->iam-1+pp;
 		if(ppr>=procs)ppr-=procs;
 		if(ppr<0)ppr+=procs;
-		
+
 		if(SendCnt[pps]>0){
 			MPI_Isend(&send_ibuf[sdispls[pps]], SendCnt[pps], mpi_int_t, pps, 0, grid->comm,
 			&req_send[Nreq_send] );
@@ -314,7 +315,7 @@ pdReDistribute_B_to_X(double *B, int_t m
 			MPI_Irecv(&recv_ibuf[rdispls[ppr]], RecvCnt[ppr], mpi_int_t, ppr, 0, grid->comm,
 			&req_recv[Nreq_recv] );
 			Nreq_recv++;
-		}		
+		}
 	}
 
 
@@ -323,7 +324,7 @@ pdReDistribute_B_to_X(double *B, int_t m
 
 
 	Nreq_send=0;
-	Nreq_recv=0;	
+	Nreq_recv=0;
 	for (pp=0;pp<procs;pp++){
 		pps = grid->iam+1+pp;
 		if(pps>=procs)pps-=procs;
@@ -340,17 +341,17 @@ pdReDistribute_B_to_X(double *B, int_t m
 			MPI_Irecv(&recv_dbuf[rdispls_nrhs[ppr]], RecvCnt_nrhs[ppr], MPI_DOUBLE, ppr, 1, grid->comm,
 			&req_recv[Nreq_recv] );
 			Nreq_recv++;
-		}		
+		}
 	}
 
 	if(Nreq_send>0)MPI_Waitall(Nreq_send,req_send,status_send);
 	if(Nreq_recv>0)MPI_Waitall(Nreq_recv,req_recv,status_recv);
 
-	
+
 		/* ------------------------------------------------------------
 		   Copy buffer into X on the diagonal processes.
 		   ------------------------------------------------------------*/
-		
+
 		// t = SuperLU_timer_();
 		ii = 0;
 		for (p = 0; p < procs; ++p) {
@@ -364,7 +365,7 @@ pdReDistribute_B_to_X(double *B, int_t m
 			lk = LBi( k, grid );  /* Local block number. */
 			l = X_BLK( lk );
 			x[l - XK_H] = k;      /* Block number prepended in the header. */
-			
+
 			irow = irow - FstBlockC(k); /* Relative row number in X-block */
 			RHS_ITERATE(j) {
 				x[l + irow + j*knsupc] = recv_dbuf[jj++];
@@ -374,17 +375,17 @@ pdReDistribute_B_to_X(double *B, int_t m
 		}
 
 		// t = SuperLU_timer_() - t;
-		// printf(".. copy to x time\t%8.4f\n", t);	
-		
+		// printf(".. copy to x time\t%8.4f\n", t);
+
 		SUPERLU_FREE(send_ibuf);
 		SUPERLU_FREE(send_dbuf);
 		SUPERLU_FREE(req_send);
 		SUPERLU_FREE(req_recv);
 		SUPERLU_FREE(status_send);
-		SUPERLU_FREE(status_recv);	
-	}  
+		SUPERLU_FREE(status_recv);
+	}
+
 
-    
 #if ( DEBUGlevel>=1 )
     CHECK_MALLOC(grid->iam, "Exit pdReDistribute_B_to_X()");
 #endif
@@ -427,7 +428,7 @@ pdReDistribute_X_to_B(int_t n, double *B
 	MPI_Request req_i, req_d, *req_send, *req_recv;
 	MPI_Status status, *status_send, *status_recv;
 	int Nreq_recv, Nreq_send, pp,pps,ppr;
-	
+
 #if ( DEBUGlevel>=1 )
     CHECK_MALLOC(grid->iam, "Enter pdReDistribute_X_to_B()");
 #endif
@@ -440,7 +441,7 @@ pdReDistribute_X_to_B(int_t n, double *B
     nsupers = Glu_persist->supno[n-1] + 1;
     iam = grid->iam;
     procs = grid->nprow * grid->npcol;
- 
+
     SendCnt      = gstrs_comm->X_to_B_SendCnt;
     SendCnt_nrhs = gstrs_comm->X_to_B_SendCnt +   procs;
     RecvCnt      = gstrs_comm->X_to_B_SendCnt + 2*procs;
@@ -452,9 +453,9 @@ pdReDistribute_X_to_B(int_t n, double *B
     ptr_to_ibuf  = gstrs_comm->ptr_to_ibuf;
     ptr_to_dbuf  = gstrs_comm->ptr_to_dbuf;
 
-	
+
 	if(procs==1){ //faster memory copy when procs=1
-		
+
 #ifdef _OPENMP
 #pragma omp parallel default (shared)
 #endif
@@ -462,12 +463,12 @@ pdReDistribute_X_to_B(int_t n, double *B
 #ifdef _OPENMP
 #pragma omp master
 #endif
-	{	
+	{
 		// t = SuperLU_timer_();
 #ifdef _OPENMP
-#pragma	omp	taskloop private (k,knsupc,lk,irow,l,i,j) untied 
-#endif		
-		for (k = 0; k < nsupers; k++) { 
+#pragma	omp	taskloop private (k,knsupc,lk,irow,l,i,j) untied
+#endif
+		for (k = 0; k < nsupers; k++) {
 		knsupc = SuperSize( k );
 		lk = LBi( k, grid ); /* Local block number */
 		irow = FstBlockC( k );
@@ -479,7 +480,7 @@ pdReDistribute_X_to_B(int_t n, double *B
 			}
 		}
 	}
-	}	
+	}
 	}else{
 		k = sdispls[procs-1] + SendCnt[procs-1]; /* Total number of sends */
 		l = rdispls[procs-1] + RecvCnt[procs-1]; /* Total number of receives */
@@ -489,13 +490,13 @@ pdReDistribute_X_to_B(int_t n, double *B
 		if ( !(send_dbuf = doubleMalloc_dist((k + l)*nrhs)) )
 			ABORT("Malloc fails for send_dbuf[].");
 		if ( !(req_send = (MPI_Request*) SUPERLU_MALLOC(procs*sizeof(MPI_Request))) )
-			ABORT("Malloc fails for req_send[].");	
+			ABORT("Malloc fails for req_send[].");
 		if ( !(req_recv = (MPI_Request*) SUPERLU_MALLOC(procs*sizeof(MPI_Request))) )
 			ABORT("Malloc fails for req_recv[].");
 		if ( !(status_send = (MPI_Status*) SUPERLU_MALLOC(procs*sizeof(MPI_Status))) )
 			ABORT("Malloc fails for status_send[].");
 		if ( !(status_recv = (MPI_Status*) SUPERLU_MALLOC(procs*sizeof(MPI_Status))) )
-			ABORT("Malloc fails for status_recv[].");	    
+			ABORT("Malloc fails for status_recv[].");
 		recv_dbuf = send_dbuf + k * nrhs;
 		for (p = 0; p < procs; ++p) {
 			ptr_to_ibuf[p] = sdispls[p];
@@ -531,26 +532,26 @@ pdReDistribute_X_to_B(int_t n, double *B
 			}
 		}
 		}
-		
+
 		/* ------------------------------------------------------------
 			COMMUNICATE THE (PERMUTED) ROW INDICES AND NUMERICAL VALUES.
 		   ------------------------------------------------------------*/
-#if 0	
+#if 0
 	#if 1
 		MPI_Alltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t,
 			  recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm);
-		MPI_Alltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs,MPI_DOUBLE, 
+		MPI_Alltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs,MPI_DOUBLE,
 			  recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, MPI_DOUBLE,
 			  grid->comm);
 	#else
 		MPI_Ialltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t,
 				recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm,&req_i);
-		MPI_Ialltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, MPI_DOUBLE, 
+		MPI_Ialltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, MPI_DOUBLE,
 				recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, MPI_DOUBLE,
 				grid->comm,&req_d);
  		MPI_Wait(&req_i,&status);
-		MPI_Wait(&req_d,&status);		 
-	#endif	
+		MPI_Wait(&req_d,&status);
+	#endif
 #endif
 
 	MPI_Barrier( grid->comm );
@@ -572,7 +573,7 @@ pdReDistribute_X_to_B(int_t n, double *B
 			MPI_Irecv(&recv_ibuf[rdispls[ppr]], RecvCnt[ppr], mpi_int_t, ppr, 0, grid->comm,
 			&req_recv[Nreq_recv] );
 			Nreq_recv++;
-		}	
+		}
 	}
 
 
@@ -598,15 +599,15 @@ pdReDistribute_X_to_B(int_t n, double *B
 			MPI_Irecv(&recv_dbuf[rdispls_nrhs[ppr]], RecvCnt_nrhs[ppr], MPI_DOUBLE, ppr, 1, grid->comm,
 			&req_recv[Nreq_recv] );
 			Nreq_recv++;
-		}	
+		}
 	}
 
 
 	if(Nreq_send>0)MPI_Waitall(Nreq_send,req_send,status_send);
 	if(Nreq_recv>0)MPI_Waitall(Nreq_recv,req_recv,status_recv);
 	// MPI_Barrier( grid->comm );
-		
-	
+
+
 		/* ------------------------------------------------------------
 		   COPY THE BUFFER INTO B.
 		   ------------------------------------------------------------*/
@@ -623,7 +624,7 @@ pdReDistribute_X_to_B(int_t n, double *B
 	SUPERLU_FREE(req_send);
 	SUPERLU_FREE(req_recv);
 	SUPERLU_FREE(status_send);
-	SUPERLU_FREE(status_recv);	
+	SUPERLU_FREE(status_recv);
 }
 #if ( DEBUGlevel>=1 )
     CHECK_MALLOC(grid->iam, "Exit pdReDistribute_X_to_B()");
@@ -675,18 +676,18 @@ pdCompute_Diag_Inv(int_t n, LUstruct_t *
 
     double one = 1.0;
     double zero = 0.0;
-	
+
 #if ( PROFlevel>=1 )
     t = SuperLU_timer_();
-#endif 
+#endif
 
-#if ( PRNTlevel>=1 )
+#if ( PRNTlevel>=2 )
     if ( grid->iam==0 ) {
 	printf("computing inverse of diagonal blocks...\n");
 	fflush(stdout);
     }
 #endif
-	
+
     /*
      * Initialization.
      */
@@ -703,7 +704,7 @@ pdCompute_Diag_Inv(int_t n, LUstruct_t *
     Uinv_bc_ptr = Llu->Uinv_bc_ptr;
     Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
     nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */
-    
+
     Llu->inv = 1;
 
     /*---------------------------------------------------
@@ -722,23 +723,23 @@ pdCompute_Diag_Inv(int_t n, LUstruct_t *
 		  lusup = Lnzval_bc_ptr[lk];
 		  Linv = Linv_bc_ptr[lk];
 		  Uinv = Uinv_bc_ptr[lk];
-		  nsupr = lsub[1];	
+		  nsupr = lsub[1];
 		  knsupc = SuperSize( k );
 
 		  for (j=0 ; j<knsupc; j++){
 		      for (i=0 ; i<knsupc; i++){
-		  	  Linv[j*knsupc+i] = zero;	
-			  Uinv[j*knsupc+i] = zero;	
+		  	  Linv[j*knsupc+i] = zero;
+			  Uinv[j*knsupc+i] = zero;
 		      }
 	          }
-				
+
 	   	  for (j=0 ; j<knsupc; j++){
 		      Linv[j*knsupc+j] = one;
 		      for (i=j+1 ; i<knsupc; i++){
-		          Linv[j*knsupc+i] = lusup[j*nsupr+i];	
+		          Linv[j*knsupc+i] = lusup[j*nsupr+i];
 		      }
 		      for (i=0 ; i<j+1; i++){
-			  Uinv[j*knsupc+i] = lusup[j*nsupr+i];	
+			  Uinv[j*knsupc+i] = lusup[j*nsupr+i];
 	              }
  		  }
 
@@ -757,7 +758,7 @@ pdCompute_Diag_Inv(int_t n, LUstruct_t *
 	printf(".. L-diag_inv time\t%10.5f\n", t);
 	fflush(stdout);
     }
-#endif	
+#endif
 
     return;
 #endif /* SLU_HAVE_LAPACK */
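The loops a couple of hunks above copy the knsupc-by-knsupc diagonal block out of the lusup panel (leading dimension nsupr) into Linv, as a unit lower triangle, and Uinv, as the upper triangle, before those blocks are inverted. A small stand-alone version of that copy with toy data:

    #include <stdio.h>

    /* Copy the diagonal block of a column-major panel with leading dimension
       nsupr into a unit-lower factor Linv and an upper factor Uinv, both
       stored with leading dimension knsupc, as the loops above do. */
    static void split_diag_block(const double *lusup, int nsupr, int knsupc,
                                 double *Linv, double *Uinv) {
        for (int j = 0; j < knsupc; j++)
            for (int i = 0; i < knsupc; i++) {       /* clear both blocks    */
                Linv[j*knsupc + i] = 0.0;
                Uinv[j*knsupc + i] = 0.0;
            }
        for (int j = 0; j < knsupc; j++) {
            Linv[j*knsupc + j] = 1.0;                 /* unit diagonal for L  */
            for (int i = j + 1; i < knsupc; i++)      /* strictly lower part  */
                Linv[j*knsupc + i] = lusup[j*nsupr + i];
            for (int i = 0; i <= j; i++)              /* upper part with diag */
                Uinv[j*knsupc + i] = lusup[j*nsupr + i];
        }
    }

    int main(void) {
        /* 2 x 2 block embedded in a 3-row panel (nsupr = 3, knsupc = 2). */
        double lusup[6] = {4, 2, 9,   1, 5, 8};
        double Linv[4], Uinv[4];
        split_diag_block(lusup, 3, 2, Linv, Uinv);
        printf("L = [%g %g; %g %g]\n", Linv[0], Linv[2], Linv[1], Linv[3]);
        printf("U = [%g %g; %g %g]\n", Uinv[0], Uinv[2], Uinv[1], Uinv[3]);
        return 0;
    }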
@@ -779,7 +780,7 @@ pdCompute_Diag_Inv(int_t n, LUstruct_t *
  * and the linear system solved is
  *     A1 * Y = Pc*Pr*B1, where B was overwritten by B1 = diag(R)*B, and
  * the permutation to B1 by Pc*Pr is applied internally in this routine.
- * 
+ *
  * Arguments
  * =========
  *
@@ -820,7 +821,7 @@ pdCompute_Diag_Inv(int_t n, LUstruct_t *
  *
  * nrhs   (input) int (global)
  *        Number of right-hand sides.
- * 
+ *
  * SOLVEstruct (input) SOLVEstruct_t* (global)
  *        Contains the information for the communication during the
  *        solution phase.
@@ -832,11 +833,11 @@ pdCompute_Diag_Inv(int_t n, LUstruct_t *
  * info   (output) int*
  * 	   = 0: successful exit
  *	   < 0: if info = -i, the i-th argument had an illegal value
- * </pre>       
+ * </pre>
  */
 
 void
-pdgstrs(int_t n, LUstruct_t *LUstruct, 
+pdgstrs(int_t n, LUstruct_t *LUstruct,
 	ScalePermstruct_t *ScalePermstruct,
 	gridinfo_t *grid, double *B,
 	int_t m_loc, int_t fst_row, int_t ldb, int nrhs,
@@ -846,7 +847,7 @@ pdgstrs(int_t n, LUstruct_t *LUstruct,
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
     LocalLU_t *Llu = LUstruct->Llu;
     double alpha = 1.0;
-	double beta = 0.0;	
+	double beta = 0.0;
     double zero = 0.0;
     double *lsum;  /* Local running sum of the updates to B-components */
     double *x;     /* X component at step k. */
@@ -857,7 +858,7 @@ pdgstrs(int_t n, LUstruct_t *LUstruct,
     double *rtemp, *rtemp_loc; /* Result of full matrix-vector multiply. */
     double *Linv; /* Inverse of diagonal block */
     double *Uinv; /* Inverse of diagonal block */
-    int *ipiv; 
+    int *ipiv;
     int_t *leaf_send;
     int_t nleaf_send, nleaf_send_tmp;
     int_t *root_send;
@@ -867,8 +868,8 @@ pdgstrs(int_t n, LUstruct_t *LUstruct,
     BcTree  *LBtree_ptr = Llu->LBtree_ptr;
     RdTree  *LRtree_ptr = Llu->LRtree_ptr;
     BcTree  *UBtree_ptr = Llu->UBtree_ptr;
-    RdTree  *URtree_ptr = Llu->URtree_ptr;	
-    int_t  *Urbs1, *Urbs2; /* Number of row blocks in each block column of U. */
+    RdTree  *URtree_ptr = Llu->URtree_ptr;
+    int_t  *Urbs1; /* Number of row blocks in each block column of U. */
     int_t  *Urbs = Llu->Urbs; /* Number of row blocks in each block column of U. */
     Ucb_indptr_t **Ucb_indptr = Llu->Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
     int_t  **Ucb_valptr = Llu->Ucb_valptr;      /* Vertical linked list pointing to Unzval[] */
@@ -888,7 +889,6 @@ pdgstrs(int_t n, LUstruct_t *LUstruct,
     double **Uinv_bc_ptr;
     double sum;
     MPI_Status status,status_on,statusx,statuslsum;
-    MPI_Request *send_req, recv_req, req;
     pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm;
     SuperLUStat_t **stat_loc;
 
@@ -900,9 +900,9 @@ pdgstrs(int_t n, LUstruct_t *LUstruct,
     int_t fmod_tmp;
     int_t  **fsendx_plist = Llu->fsendx_plist;
     int_t  nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */
-    int_t  nfrecvx_buf=0;						 	    			 
+    int_t  nfrecvx_buf=0;
     int_t  *frecv;        /* Count of lsum[lk] contributions to be received
-    			 from processes in this row. 
+    			 from processes in this row.
     			 It is only valid on the diagonal processes. */
     int_t  frecv_tmp;
     int_t  nfrecvmod = 0; /* Count of total modifications to be recv'd. */
@@ -916,12 +916,12 @@ pdgstrs(int_t n, LUstruct_t *LUstruct,
     int_t  bmod_tmp;
     int_t  **bsendx_plist = Llu->bsendx_plist;
     int_t  nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */
-    int_t  nbrecvx_buf=0;		
+    int_t  nbrecvx_buf=0;
     int_t  *brecv;        /* Count of modifications to be recv'd from
     			 processes in this row. */
     int_t  nbrecvmod = 0; /* Count of total modifications to be recv'd. */
     int_t flagx,flaglsum,flag;
-    int_t *LBTree_active, *LRTree_active, *LBTree_finish, *LRTree_finish, *leafsups, *rootsups; 
+    int_t *LBTree_active, *LRTree_active, *LBTree_finish, *LRTree_finish, *leafsups, *rootsups;
     int_t TAG;
     double t1_sol, t2_sol, t;
 #if ( DEBUGlevel>=2 )
@@ -929,7 +929,7 @@ pdgstrs(int_t n, LUstruct_t *LUstruct,
 #endif
 
     int_t gik,iklrow,fnz;
-    
+
     int_t *mod_bit = Llu->mod_bit; /* flag contribution from each row block */
     int INFO, pad;
     int_t tmpresult;
@@ -937,7 +937,7 @@ pdgstrs(int_t n, LUstruct_t *LUstruct,
     // #if ( PROFlevel>=1 )
     double t1, t2;
     float msg_vol = 0, msg_cnt = 0;
-    // #endif 
+    // #endif
 
     int_t msgcnt[4]; /* Count the size of the message xfer'd in each buffer:
 		      *     0 : transferred in Lsub_buf[]
@@ -946,14 +946,14 @@ pdgstrs(int_t n, LUstruct_t *LUstruct,
 		      *     3 : transferred in Uval_buf[]
 		      */
     int iword = sizeof (int_t);
-    int dword = sizeof (double);	
+    int dword = sizeof (double);
     int Nwork;
 	int_t procs = grid->nprow * grid->npcol;
     	yes_no_t done;
     yes_no_t startforward;
     	int nbrow;
     int_t  ik, rel, idx_r, jb, nrbl, irow, pc,iknsupc;
-    int_t  lptr1_tmp, idx_i, idx_v,m; 
+    int_t  lptr1_tmp, idx_i, idx_v,m;
     	int_t ready;
     	static int thread_id;
     yes_no_t empty;
@@ -961,10 +961,10 @@ pdgstrs(int_t n, LUstruct_t *LUstruct,
     aln_d = ceil(CACHELINE/(double)dword);
     aln_i = ceil(CACHELINE/(double)iword);
     int num_thread = 1;
-	
+
 	maxsuper = sp_ienv_dist(3);
-	
-#ifdef _OPENMP	
+
+#ifdef _OPENMP
 	#pragma omp threadprivate(thread_id)
 #endif
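
For reference, aln_d and aln_i computed just above round a double and an int_t up to a whole cache line (CACHELINE is defined as 64 bytes in pdgstrs_lsum.c further below), so per-block counters such as fmod[lk*aln_i] and the per-thread slices of lsum[] (whose size is rounded up to a multiple of aln_d) land on distinct cache lines and threads do not false-share them. A minimal sketch of the computation, illustrative only and not part of the patch; long is used here as a stand-in for int_t, whose width depends on the build:

#include <math.h>
#include <stdio.h>

#define CACHELINE 64   /* bytes, as in pdgstrs_lsum.c */

int main(void)
{
    int dword = (int)sizeof(double);   /* 8 bytes */
    int iword = (int)sizeof(long);     /* stand-in for int_t */
    int aln_d = (int)ceil(CACHELINE / (double)dword);
    int aln_i = (int)ceil(CACHELINE / (double)iword);

    /* e.g. fmod is indexed as fmod[lk*aln_i]: each counter owns a cache line */
    printf("aln_d = %d doubles per line, aln_i = %d indices per line\n",
           aln_d, aln_i);
    return 0;
}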
 
@@ -984,7 +984,7 @@ pdgstrs(int_t n, LUstruct_t *LUstruct,
 	fflush(stdout);
     }
 #endif
-	
+
     MPI_Barrier( grid->comm );
     t1_sol = SuperLU_timer_();
     t = SuperLU_timer_();
@@ -997,7 +997,7 @@ pdgstrs(int_t n, LUstruct_t *LUstruct,
 	pxerr_dist("PDGSTRS", grid, -*info);
 	return;
     }
-	
+
     /*
      * Initialization.
      */
@@ -1012,14 +1012,14 @@ pdgstrs(int_t n, LUstruct_t *LUstruct,
     Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
     Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
     Linv_bc_ptr = Llu->Linv_bc_ptr;
-    Uinv_bc_ptr = Llu->Uinv_bc_ptr;	
+    Uinv_bc_ptr = Llu->Uinv_bc_ptr;
     nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */
 
     stat->utime[SOL_COMM] = 0.0;
     stat->utime[SOL_GEMM] = 0.0;
     stat->utime[SOL_TRSM] = 0.0;
-    stat->utime[SOL_TOT] = 0.0;	
-	
+    stat->utime[SOL_TOT] = 0.0;
+
 #if ( DEBUGlevel>=1 )
     CHECK_MALLOC(iam, "Enter pdgstrs()");
 #endif
@@ -1059,39 +1059,39 @@ pdgstrs(int_t n, LUstruct_t *LUstruct,
     maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H );
     sizelsum = (((size_t)ldalsum)*nrhs + nlb*LSUM_H);
     sizelsum = ((sizelsum + (aln_d - 1)) / aln_d) * aln_d;
-	
+
 #ifdef _OPENMP
     if ( !(lsum = (double*)SUPERLU_MALLOC(sizelsum*num_thread * sizeof(double))))
-	ABORT("Malloc fails for lsum[].");	
+	ABORT("Malloc fails for lsum[].");
 #pragma omp parallel default(shared) private(ii)
     {
 	for (ii=0; ii<sizelsum; ii++)
     	lsum[thread_id*sizelsum+ii]=zero;
     }
-#else	
+#else
     if ( !(lsum = (double*)SUPERLU_MALLOC(sizelsum*num_thread * sizeof(double))))
   	    ABORT("Malloc fails for lsum[].");
     for ( ii=0; ii < sizelsum*num_thread; ii++ )
-	lsum[ii]=zero;		
-#endif	
-    if ( !(x = (double*)SUPERLU_MALLOC((ldalsum * nrhs + nlb * XK_H) * sizeof(double))) ) 	
+	lsum[ii]=zero;
+#endif
+    if ( !(x = (double*)SUPERLU_MALLOC((ldalsum * nrhs + nlb * XK_H) * sizeof(double))) )
 	ABORT("Calloc fails for x[].");
-    
-	
+
+
     sizertemp=ldalsum * nrhs;
     sizertemp = ((sizertemp + (aln_d - 1)) / aln_d) * aln_d;
     if ( !(rtemp = (double*)SUPERLU_MALLOC((sizertemp*num_thread + 1) * sizeof(double))) )
-	ABORT("Malloc fails for rtemp[].");		
+	ABORT("Malloc fails for rtemp[].");
 #ifdef _OPENMP
 #pragma omp parallel default(shared) private(ii)
     {
 	for ( ii=0; ii<sizertemp; ii++ )
-		rtemp[thread_id*sizertemp+ii]=zero;			
+		rtemp[thread_id*sizertemp+ii]=zero;
     }
-#else	
+#else
     for ( ii=0; ii<sizertemp*num_thread; ii++ )
-	rtemp[ii]=zero;			
-#endif	
+	rtemp[ii]=zero;
+#endif
 
     if ( !(stat_loc = (SuperLUStat_t**) SUPERLU_MALLOC(num_thread*sizeof(SuperLUStat_t*))) )
 	ABORT("Malloc fails for stat_loc[].");
@@ -1101,7 +1101,7 @@ pdgstrs(int_t n, LUstruct_t *LUstruct,
 	PStatInit(stat_loc[i]);
     }
 
-#if ( DEBUGlevel>=2 )   
+#if ( DEBUGlevel>=2 )
     /* Dump the L factor using matlab triplet format. */
     dDumpLblocks(iam, nsupers, grid, Glu_persist, Llu);
 #endif
@@ -1110,7 +1110,7 @@ pdgstrs(int_t n, LUstruct_t *LUstruct,
      * Forward solve Ly = b.
      *---------------------------------------------------*/
     /* Redistribute B into X on the diagonal processes. */
-    pdReDistribute_B_to_X(B, m_loc, nrhs, ldb, fst_row, ilsum, x, 
+    pdReDistribute_B_to_X(B, m_loc, nrhs, ldb, fst_row, ilsum, x,
 			  ScalePermstruct, Glu_persist, grid, SOLVEstruct);
 
 #if ( PRNTlevel>=2 )
@@ -1118,12 +1118,12 @@ pdgstrs(int_t n, LUstruct_t *LUstruct,
     if ( !iam) printf(".. B to X redistribute time\t%8.4f\n", t);
     fflush(stdout);
     t = SuperLU_timer_();
-#endif	
+#endif
 
     /* Set up the headers in lsum[]. */
-#ifdef _OPENMP	
+#ifdef _OPENMP
 	#pragma omp simd lastprivate(krow,lk,il)
-#endif		
+#endif
     for (k = 0; k < nsupers; ++k) {
 	krow = PROW( k, grid );
 	if ( myrow == krow ) {
@@ -1135,16 +1135,16 @@ pdgstrs(int_t n, LUstruct_t *LUstruct,
 
 	/* ---------------------------------------------------------
 	   Initialize the async Bcast trees on all processes.
-	   --------------------------------------------------------- */		
+	   --------------------------------------------------------- */
 	nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */
 
 	nbtree = 0;
 	for (lk=0;lk<nsupers_j;++lk){
 		if(LBtree_ptr[lk]!=NULL){
-			// printf("LBtree_ptr lk %5d\n",lk); 
-			if(BcTree_IsRoot(LBtree_ptr[lk],'d')==NO){			
+			// printf("LBtree_ptr lk %5d\n",lk);
+			if(BcTree_IsRoot(LBtree_ptr[lk],'d')==NO){
 				nbtree++;
-				if(BcTree_getDestCount(LBtree_ptr[lk],'d')>0)nfrecvx_buf++;				  
+				if(BcTree_getDestCount(LBtree_ptr[lk],'d')>0)nfrecvx_buf++;
 			}
 			BcTree_allocateRequest(LBtree_ptr[lk],'d');
 		}
@@ -1157,24 +1157,24 @@ pdgstrs(int_t n, LUstruct_t *LUstruct,
 	nrtree = 0;
 	nleaf=0;
 	nfrecvmod=0;
-	
-	
-	
+
+
+
 if(procs==1){
 	for (lk=0;lk<nsupers_i;++lk){
 		gb = myrow+lk*grid->nprow;  /* not sure */
 		if(gb<nsupers){
 			if (fmod[lk*aln_i]==0){
-				leafsups[nleaf]=gb;				
+				leafsups[nleaf]=gb;
 				++nleaf;
 			}
 		}
 	}
-}else{	
+}else{
 	for (lk=0;lk<nsupers_i;++lk){
 		if(LRtree_ptr[lk]!=NULL){
 			nrtree++;
-			RdTree_allocateRequest(LRtree_ptr[lk],'d');			
+			RdTree_allocateRequest(LRtree_ptr[lk],'d');
 			frecv[lk] = RdTree_GetDestCount(LRtree_ptr[lk],'d');
 			nfrecvmod += frecv[lk];
 		}else{
@@ -1183,27 +1183,29 @@ if(procs==1){
 				kcol = PCOL( gb, grid );
 				if(mycol==kcol) { /* Diagonal process */
 					if (fmod[lk*aln_i]==0){
-						leafsups[nleaf]=gb;				
+						leafsups[nleaf]=gb;
 						++nleaf;
 					}
 				}
 			}
 		}
-	}	
-}	
-	
-	
-#ifdef _OPENMP	
+	}
+}
+
+
+#ifdef _OPENMP
 #pragma omp simd
 #endif
 	for (i = 0; i < nlb; ++i) fmod[i*aln_i] += frecv[i];
 
 	if ( !(recvbuf_BC_fwd = (double*)SUPERLU_MALLOC(maxrecvsz*(nfrecvx+1) * sizeof(double))) )  // this needs to be optimized for 1D row mapping
-		ABORT("Malloc fails for recvbuf_BC_fwd[].");	
-	nfrecvx_buf=0;			
+		ABORT("Malloc fails for recvbuf_BC_fwd[].");
+	nfrecvx_buf=0;
 
 	log_memory(nlb*aln_i*iword+nlb*iword+(CEILING( nsupers, Pr )+CEILING( nsupers, Pc ))*aln_i*2.0*iword+ nsupers_i*iword + sizelsum*num_thread * dword + (ldalsum * nrhs + nlb * XK_H) *dword + (sizertemp*num_thread + 1)*dword+maxrecvsz*(nfrecvx+1)*dword, stat);	//account for fmod, frecv, leaf_send, root_send, leafsups, recvbuf_BC_fwd	, lsum, x, rtemp
-	
+
+
+
 #if ( DEBUGlevel>=2 )
 	printf("(%2d) nfrecvx %4d,  nfrecvmod %4d,  nleaf %4d\n,  nbtree %4d\n,  nrtree %4d\n",
 			iam, nfrecvx, nfrecvmod, nleaf, nbtree, nrtree);
@@ -1214,13 +1216,13 @@ if(procs==1){
 	t = SuperLU_timer_() - t;
 	if ( !iam) printf(".. Setup L-solve time\t%8.4f\n", t);
 	fflush(stdout);
-	MPI_Barrier( grid->comm );	
+	MPI_Barrier( grid->comm );
 	t = SuperLU_timer_();
 #endif
 
 #if ( VAMPIR>=1 )
-	// VT_initialize(); 
-	VT_traceon();	
+	// VT_initialize();
+	VT_traceon();
 #endif
 
 #ifdef USE_VTUNE
@@ -1238,27 +1240,27 @@ if(procs==1){
 
 
 #ifdef _OPENMP
-#pragma omp parallel default (shared) 
+#pragma omp parallel default (shared)
 #endif
-	{	
+	{
 		{
-		
+
             if (Llu->inv == 1) { /* Diagonal is inverted. */
 
 #ifdef _OPENMP
-#pragma	omp	for firstprivate(nrhs,beta,alpha,x,rtemp,ldalsum) private (ii,k,knsupc,lk,luptr,lsub,nsupr,lusup,t1,t2,Linv,i,lib,rtemp_loc,nleaf_send_tmp) nowait	
+#pragma	omp	for firstprivate(nrhs,beta,alpha,x,rtemp,ldalsum) private (ii,k,knsupc,lk,luptr,lsub,nsupr,lusup,t1,t2,Linv,i,lib,rtemp_loc,nleaf_send_tmp) nowait
 #endif
 			for (jj=0;jj<nleaf;jj++){
 				k=leafsups[jj];
 
 				// #ifdef _OPENMP
-				// #pragma	omp	task firstprivate (k,nrhs,beta,alpha,x,rtemp,ldalsum) private (ii,knsupc,lk,luptr,lsub,nsupr,lusup,thread_id,t1,t2,Linv,i,lib,rtemp_loc)	 	
+				// #pragma	omp	task firstprivate (k,nrhs,beta,alpha,x,rtemp,ldalsum) private (ii,knsupc,lk,luptr,lsub,nsupr,lusup,thread_id,t1,t2,Linv,i,lib,rtemp_loc)
 				// #endif
 				{
 
 #if ( PROFlevel>=1 )
 					TIC(t1);
-#endif	 
+#endif
 					rtemp_loc = &rtemp[sizertemp* thread_id];
 
 
@@ -1285,15 +1287,15 @@ if(procs==1){
 					dgemm_( "N", "N", &knsupc, &nrhs, &knsupc,
 							&alpha, Linv, &knsupc, &x[ii],
 							&knsupc, &beta, rtemp_loc, &knsupc );
-#endif	
+#endif
 
 				#ifdef _OPENMP
 					#pragma omp simd
-				#endif		   
+				#endif
 					for (i=0 ; i<knsupc*nrhs ; i++){
 						x[ii+i] = rtemp_loc[i];
-					}		
-					
+					}
+
 					// for (i=0 ; i<knsupc*nrhs ; i++){
 					// printf("x_l: %f\n",x[ii+i]);
 					// fflush(stdout);
@@ -1304,11 +1306,11 @@ if(procs==1){
 					TOC(t2, t1);
 					stat_loc[thread_id]->utime[SOL_TRSM] += t2;
 
-#endif	
+#endif
 
 					stat_loc[thread_id]->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;
-			
-					
+
+
 					// --nleaf;
 #if ( DEBUGlevel>=2 )
 					printf("(%2d) Solve X[%2d]\n", iam, k);
@@ -1318,9 +1320,9 @@ if(procs==1){
 					 * Send Xk to process column Pc[k].
 					 */
 
-					if(LBtree_ptr[lk]!=NULL){ 
+					if(LBtree_ptr[lk]!=NULL){
 						lib = LBi( k, grid ); /* Local block number, row-wise. */
-						ii = X_BLK( lib );	
+						ii = X_BLK( lib );
 
 #ifdef _OPENMP
 #pragma omp atomic capture
@@ -1329,11 +1331,11 @@ if(procs==1){
 						leaf_send[(nleaf_send_tmp-1)*aln_i] = lk;
 						// BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],'d');
 					}
-				}		
+				}
 			}
 	} else { /* Diagonal is not inverted. */
 #ifdef _OPENMP
-#pragma	omp	for firstprivate (nrhs,beta,alpha,x,rtemp,ldalsum) private (ii,k,knsupc,lk,luptr,lsub,nsupr,lusup,t1,t2,Linv,i,lib,rtemp_loc,nleaf_send_tmp) nowait	
+#pragma	omp	for firstprivate (nrhs,beta,alpha,x,rtemp,ldalsum) private (ii,k,knsupc,lk,luptr,lsub,nsupr,lusup,t1,t2,Linv,i,lib,rtemp_loc,nleaf_send_tmp) nowait
 #endif
 	    for (jj=0;jj<nleaf;jj++) {
 		k=leafsups[jj];
@@ -1341,7 +1343,7 @@ if(procs==1){
 
 #if ( PROFlevel>=1 )
 		    TIC(t1);
-#endif	 
+#endif
 		    rtemp_loc = &rtemp[sizertemp* thread_id];
 
 		    knsupc = SuperSize( k );
@@ -1358,13 +1360,13 @@ if(procs==1){
    		    STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
 				lusup, &nsupr, &x[ii], &knsupc);
 #elif defined (USE_VENDOR_BLAS)
-		    dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
-				lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);	
+		    dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha,
+				lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
 #else
- 		    dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+ 		    dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha,
 					lusup, &nsupr, &x[ii], &knsupc);
 #endif
-				
+
 		// for (i=0 ; i<knsupc*nrhs ; i++){
 		// printf("x_l: %f\n",x[ii+i]);
 		// fflush(stdout);
@@ -1375,10 +1377,10 @@ if(procs==1){
 		    TOC(t2, t1);
 		    stat_loc[thread_id]->utime[SOL_TRSM] += t2;
 
-#endif	
+#endif
 
 		    stat_loc[thread_id]->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;
-			
+
 		    // --nleaf;
 #if ( DEBUGlevel>=2 )
 		    printf("(%2d) Solve X[%2d]\n", iam, k);
@@ -1388,9 +1390,9 @@ if(procs==1){
 		     * Send Xk to process column Pc[k].
 		     */
 
-		    if (LBtree_ptr[lk]!=NULL) { 
+		    if (LBtree_ptr[lk]!=NULL) {
 			lib = LBi( k, grid ); /* Local block number, row-wise. */
-			ii = X_BLK( lib );	
+			ii = X_BLK( lib );
 
 #ifdef _OPENMP
 #pragma omp atomic capture
@@ -1398,10 +1400,10 @@ if(procs==1){
 			nleaf_send_tmp = ++nleaf_send;
 			leaf_send[(nleaf_send_tmp-1)*aln_i] = lk;
 		    }
-		    } /* end a block */		
+		    } /* end a block */
 		} /* end for jj ... */
 	    } /* end else ... diagonal is not inverted */
-	  }	
+	  }
 	}
 
 	jj=0;
@@ -1422,7 +1424,7 @@ if(procs==1){
 #endif
 
 					for (jj=0;jj<nleaf;jj++){
-						k=leafsups[jj];		
+						k=leafsups[jj];
 
 						{
 							/* Diagonal process */
@@ -1432,8 +1434,8 @@ if(procs==1){
 							 * Perform local block modifications: lsum[i] -= L_i,k * X[k]
 							 */
 							dlsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, k,
-									fmod, xsup, grid, Llu, 
-									stat_loc, leaf_send, &nleaf_send,sizelsum,sizertemp,0,maxsuper,thread_id,num_thread);	
+									fmod, xsup, grid, Llu,
+									stat_loc, leaf_send, &nleaf_send,sizelsum,sizertemp,0,maxsuper,thread_id,num_thread);
 						}
 
 						// } /* if diagonal process ... */
@@ -1447,7 +1449,7 @@ if(procs==1){
 				if(lk>=0){ // this is a bcast forwarding
 					gb = mycol+lk*grid->npcol;  /* not sure */
 					lib = LBi( gb, grid ); /* Local block number, row-wise. */
-					ii = X_BLK( lib );			
+					ii = X_BLK( lib );
 					BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],BcTree_GetMsgSize(LBtree_ptr[lk],'d')*nrhs+XK_H,'d');
 				}else{ // this is a reduce forwarding
 					lk = -lk - 1;
@@ -1468,25 +1470,25 @@ if(procs==1){
 			   ----------------------------------------------------------- */
 
 #ifdef _OPENMP
-#pragma omp parallel default (shared) 
+#pragma omp parallel default (shared)
 #endif
-			{	
+			{
 #ifdef _OPENMP
-#pragma omp master 
+#pragma omp master
 #endif
-				{									 
+				{
 					for ( nfrecv =0; nfrecv<nfrecvx+nfrecvmod;nfrecv++) { /* While not finished. */
 						thread_id = 0;
 #if ( PROFlevel>=1 )
 						TIC(t1);
 						// msgcnt[1] = maxrecvsz;
-#endif	
+#endif
 
 						recvbuf0 = &recvbuf_BC_fwd[nfrecvx_buf*maxrecvsz];
 
 						/* Receive a message. */
 						MPI_Recv( recvbuf0, maxrecvsz, MPI_DOUBLE,
-								MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status );	 	
+								MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status );
 						// MPI_Irecv(recvbuf0,maxrecvsz,MPI_DOUBLE,MPI_ANY_SOURCE,MPI_ANY_TAG,grid->comm,&req);
 						// ready=0;
 						// while(ready==0){
@@ -1494,18 +1496,18 @@ if(procs==1){
 						// #pragma omp taskyield
 						// }
 
-#if ( PROFlevel>=1 )		 
+#if ( PROFlevel>=1 )
 						TOC(t2, t1);
 						stat_loc[thread_id]->utime[SOL_COMM] += t2;
 
 						msg_cnt += 1;
-						msg_vol += maxrecvsz * dword;			
-#endif					  
+						msg_vol += maxrecvsz * dword;
+#endif
+
+						{
 
-						{  
-							
 							k = *recvbuf0;
-		
+
 #if ( DEBUGlevel>=2 )
 							printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
 #endif
@@ -1518,13 +1520,13 @@ if(procs==1){
 
 									if(BcTree_getDestCount(LBtree_ptr[lk],'d')>0){
 
-										BcTree_forwardMessageSimple(LBtree_ptr[lk],recvbuf0,BcTree_GetMsgSize(LBtree_ptr[lk],'d')*nrhs+XK_H,'d');	
+										BcTree_forwardMessageSimple(LBtree_ptr[lk],recvbuf0,BcTree_GetMsgSize(LBtree_ptr[lk],'d')*nrhs+XK_H,'d');
 										// nfrecvx_buf++;
 									}
 
 									/*
 									 * Perform local block modifications: lsum[i] -= L_i,k * X[k]
-									 */	  
+									 */
 
 									lk = LBj( k, grid ); /* Local block number, column-wise. */
 									lsub = Lrowind_bc_ptr[lk];
@@ -1539,28 +1541,28 @@ if(procs==1){
 										}else{
 											nb   = lsub[0];
 											knsupc = SuperSize( k );
-											xin = &recvbuf0[XK_H] ;					
+											xin = &recvbuf0[XK_H] ;
 										}
 
 										dlsum_fmod_inv_master(lsum, x, xin, rtemp, nrhs, knsupc, k,
 												fmod, nb, xsup, grid, Llu,
-												stat_loc,sizelsum,sizertemp,0,maxsuper,thread_id,num_thread);	
+												stat_loc,sizelsum,sizertemp,0,maxsuper,thread_id,num_thread);
 
 									} /* if lsub */
 								}
 
 							}else if(status.MPI_TAG==RD_L){
-								// --nfrecvmod;		  
+								// --nfrecvmod;
 								lk = LBi( k, grid ); /* Local block number, row-wise. */
 
 								knsupc = SuperSize( k );
 								tempv = &recvbuf0[LSUM_H];
-								il = LSUM_BLK( lk );		  
+								il = LSUM_BLK( lk );
 								RHS_ITERATE(j) {
 									for (i = 0; i < knsupc; ++i)
 										lsum[i + il + j*knsupc + thread_id*sizelsum] += tempv[i + j*knsupc];
-										
-								}			
+
+								}
 
 								// #ifdef _OPENMP
 								// #pragma omp atomic capture
@@ -1569,14 +1571,14 @@ if(procs==1){
 								{
 									thread_id = 0;
 									rtemp_loc = &rtemp[sizertemp* thread_id];
-									if ( fmod_tmp==0 ) {	  
+									if ( fmod_tmp==0 ) {
 										if(RdTree_IsRoot(LRtree_ptr[lk],'d')==YES){
 											// ii = X_BLK( lk );
 											knsupc = SuperSize( k );
 											for (ii=1;ii<num_thread;ii++)
 											#ifdef _OPENMP
 												#pragma omp simd
-											#endif	
+											#endif
 												for (jj=0;jj<knsupc*nrhs;jj++)
 													lsum[il + jj ] += lsum[il + jj + ii*sizelsum];
 
@@ -1584,8 +1586,8 @@ if(procs==1){
 											RHS_ITERATE(j)
 												#ifdef _OPENMP
 													#pragma omp simd
-												#endif												
-												for (i = 0; i < knsupc; ++i)	
+												#endif
+												for (i = 0; i < knsupc; ++i)
 													x[i + ii + j*knsupc] += lsum[i + il + j*knsupc ];
 
 											// fmod[lk] = -1; /* Do not solve X[k] in the future. */
@@ -1596,10 +1598,10 @@ if(procs==1){
 
 #if ( PROFlevel>=1 )
 											TIC(t1);
-#endif			  
+#endif
 
 											if(Llu->inv == 1){
-												Linv = Linv_bc_ptr[lk];		  
+												Linv = Linv_bc_ptr[lk];
 #ifdef _CRAY
 												SGEMM( ftcs2, ftcs2, &knsupc, &nrhs, &knsupc,
 														&alpha, Linv, &knsupc, &x[ii],
@@ -1612,23 +1614,23 @@ if(procs==1){
 												dgemm_( "N", "N", &knsupc, &nrhs, &knsupc,
 														&alpha, Linv, &knsupc, &x[ii],
 														&knsupc, &beta, rtemp_loc, &knsupc );
-#endif			   
+#endif
 												#ifdef _OPENMP
 													#pragma omp simd
 												#endif
 												for (i=0 ; i<knsupc*nrhs ; i++){
 													x[ii+i] = rtemp_loc[i];
-												}		
+												}
 											}
 											else{
 #ifdef _CRAY
 												STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
 														lusup, &nsupr, &x[ii], &knsupc);
 #elif defined (USE_VENDOR_BLAS)
-												dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
-														lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);		
+												dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha,
+														lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
 #else
-												dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+												dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha,
 														lusup, &nsupr, &x[ii], &knsupc);
 #endif
 											}
@@ -1636,7 +1638,7 @@ if(procs==1){
 #if ( PROFlevel>=1 )
 											TOC(t2, t1);
 											stat_loc[thread_id]->utime[SOL_TRSM] += t2;
-#endif	
+#endif
 
 											stat_loc[thread_id]->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;
 #if ( DEBUGlevel>=2 )
@@ -1645,10 +1647,10 @@ if(procs==1){
 
 											/*
 											 * Send Xk to process column Pc[k].
-											 */						  
-											if(LBtree_ptr[lk]!=NULL){ 
+											 */
+											if(LBtree_ptr[lk]!=NULL){
 												BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],BcTree_GetMsgSize(LBtree_ptr[lk],'d')*nrhs+XK_H,'d');
-											}		  
+											}
 
 
 											/*
@@ -1662,31 +1664,31 @@ if(procs==1){
 												nb = lsub[0] - 1;
 												knsupc = SuperSize( k );
 												ii = X_BLK( LBi( k, grid ) );
-												xin = &x[ii];		
+												xin = &x[ii];
 												dlsum_fmod_inv_master(lsum, x, xin, rtemp, nrhs, knsupc, k,
 														fmod, nb, xsup, grid, Llu,
-														stat_loc,sizelsum,sizertemp,0,maxsuper,thread_id,num_thread);	
+														stat_loc,sizelsum,sizertemp,0,maxsuper,thread_id,num_thread);
 											} /* if lsub */
 											// }
 
 									}else{
 
-										il = LSUM_BLK( lk );		  
+										il = LSUM_BLK( lk );
 										knsupc = SuperSize( k );
 
 										for (ii=1;ii<num_thread;ii++)
 											#ifdef _OPENMP
 												#pragma omp simd
-											#endif										
+											#endif
 											for (jj=0;jj<knsupc*nrhs;jj++)
 												lsum[il + jj] += lsum[il + jj + ii*sizelsum];
-										RdTree_forwardMessageSimple(LRtree_ptr[lk],&lsum[il-LSUM_H],RdTree_GetMsgSize(LRtree_ptr[lk],'d')*nrhs+LSUM_H,'d'); 
-									}  
+										RdTree_forwardMessageSimple(LRtree_ptr[lk],&lsum[il-LSUM_H],RdTree_GetMsgSize(LRtree_ptr[lk],'d')*nrhs+LSUM_H,'d');
+									}
 
 								}
 
-							}					
-						} /* check Tag */		  
+							}
+						} /* check Tag */
 					}
 
 				} /* while not finished ... */
@@ -1706,9 +1708,9 @@ if(procs==1){
 		MPI_Reduce (&t, &tmax, 1, MPI_DOUBLE,
 				MPI_MAX, 0, grid->comm);
 		if ( !iam ) {
-			printf(".. L-solve time (MAX) \t%8.4f\n", tmax);	
+			printf(".. L-solve time (MAX) \t%8.4f\n", tmax);
 			fflush(stdout);
-		}	
+		}
 
 
 		t = SuperLU_timer_();
@@ -1739,29 +1741,28 @@ if(procs==1){
 		SUPERLU_FREE(leaf_send);
 		SUPERLU_FREE(leafsups);
 		SUPERLU_FREE(recvbuf_BC_fwd);
+		log_memory(-nlb*aln_i*iword-nlb*iword-(CEILING( nsupers, Pr )+CEILING( nsupers, Pc ))*aln_i*iword- nsupers_i*iword -maxrecvsz*(nfrecvx+1)*dword, stat);	//account for fmod, frecv, leaf_send, leafsups, recvbuf_BC_fwd
 
-		log_memory(-nlb*aln_i*iword-nlb*iword-(CEILING( nsupers, Pr )-CEILING( nsupers, Pc ))*aln_i*iword- nsupers_i*iword -maxrecvsz*(nfrecvx+1)*dword, stat);	//account for fmod, frecv, leaf_send, leafsups, recvbuf_BC_fwd				
-		
 		for (lk=0;lk<nsupers_j;++lk){
 			if(LBtree_ptr[lk]!=NULL){
-				// if(BcTree_IsRoot(LBtree_ptr[lk],'d')==YES){			
-				BcTree_waitSendRequest(LBtree_ptr[lk],'d');		
+				// if(BcTree_IsRoot(LBtree_ptr[lk],'d')==YES){
+				BcTree_waitSendRequest(LBtree_ptr[lk],'d');
 				// }
 				// deallocate requests here
 			}
 		}
 
 		for (lk=0;lk<nsupers_i;++lk){
-			if(LRtree_ptr[lk]!=NULL){		
-				RdTree_waitSendRequest(LRtree_ptr[lk],'d');		
+			if(LRtree_ptr[lk]!=NULL){
+				RdTree_waitSendRequest(LRtree_ptr[lk],'d');
 				// deallocate requests here
 			}
-		}		
+		}
 		MPI_Barrier( grid->comm );
 
-#if ( VAMPIR>=1 )	
-		VT_traceoff();	
-		VT_finalize(); 
+#if ( VAMPIR>=1 )
+		VT_traceoff();
+		VT_finalize();
 #endif
 
 
@@ -1771,8 +1772,8 @@ if(procs==1){
 		 * The Y components from the forward solve is already
 		 * on the diagonal processes.
 	 *---------------------------------------------------*/
-		 
-		 
+
+
 		/* Save the count to be altered so it can be used by
 		   subsequent call to PDGSTRS. */
 		if ( !(bmod = intMalloc_dist(nlb*aln_i)) )
@@ -1785,7 +1786,7 @@ if(procs==1){
 		k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb;
 
 		/* Re-initialize lsum to zero. Each block header is already in place. */
-		
+
 #ifdef _OPENMP
 
 #pragma omp parallel default(shared) private(ii)
@@ -1794,9 +1795,9 @@ if(procs==1){
 			lsum[thread_id*sizelsum+ii]=zero;
 	}
     /* Set up the headers in lsum[]. */
-#ifdef _OPENMP	
+#ifdef _OPENMP
 	#pragma omp simd lastprivate(krow,lk,il)
-#endif		
+#endif
     for (k = 0; k < nsupers; ++k) {
 	krow = PROW( k, grid );
 	if ( myrow == krow ) {
@@ -1804,9 +1805,9 @@ if(procs==1){
 	    il = LSUM_BLK( lk );
 	    lsum[il - LSUM_H] = k; /* Block number prepended in the header. */
 	}
-    }	
+    }
 
-#else	
+#else
 	for (k = 0; k < nsupers; ++k) {
 		krow = PROW( k, grid );
 		if ( myrow == krow ) {
@@ -1814,15 +1815,15 @@ if(procs==1){
 			lk = LBi( k, grid );
 			il = LSUM_BLK( lk );
 			dest = &lsum[il];
-			
-			for (jj = 0; jj < num_thread; ++jj) {						
+
+			for (jj = 0; jj < num_thread; ++jj) {
 				RHS_ITERATE(j) {
 					for (i = 0; i < knsupc; ++i) dest[i + j*knsupc + jj*sizelsum] = zero;
-				}	
-			}	
+				}
+			}
 		}
 	}
-#endif		
+#endif
 
 #if ( DEBUGlevel>=2 )
 		for (p = 0; p < Pr*Pc; ++p) {
@@ -1835,7 +1836,7 @@ if(procs==1){
 						for (i = 0; i < Urbs[lb]; ++i)
 							printf("(%2d) .. row blk %2d:\
 									lbnum %d, indpos %d, valpos %d\n",
-									iam, i, 
+									iam, i,
 									Ucb_indptr[lb][i].lbnum,
 									Ucb_indptr[lb][i].indpos,
 									Ucb_valptr[lb][i]);
@@ -1863,16 +1864,16 @@ if(procs==1){
 
 	/* ---------------------------------------------------------
 	   Initialize the async Bcast trees on all processes.
-	   --------------------------------------------------------- */		
+	   --------------------------------------------------------- */
 	nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */
 
 	nbtree = 0;
 	for (lk=0;lk<nsupers_j;++lk){
 		if(UBtree_ptr[lk]!=NULL){
-			// printf("UBtree_ptr lk %5d\n",lk); 
-			if(BcTree_IsRoot(UBtree_ptr[lk],'d')==NO){			
+			// printf("UBtree_ptr lk %5d\n",lk);
+			if(BcTree_IsRoot(UBtree_ptr[lk],'d')==NO){
 				nbtree++;
-				if(BcTree_getDestCount(UBtree_ptr[lk],'d')>0)nbrecvx_buf++;				  
+				if(BcTree_getDestCount(UBtree_ptr[lk],'d')>0)nbrecvx_buf++;
 			}
 			BcTree_allocateRequest(UBtree_ptr[lk],'d');
 		}
@@ -1889,7 +1890,7 @@ if(procs==1){
 			// printf("here lk %5d myid %5d\n",lk,iam);
 			// fflush(stdout);
 			nrtree++;
-			RdTree_allocateRequest(URtree_ptr[lk],'d');			
+			RdTree_allocateRequest(URtree_ptr[lk],'d');
 			brecv[lk] = RdTree_GetDestCount(URtree_ptr[lk],'d');
 			nbrecvmod += brecv[lk];
 		}else{
@@ -1898,27 +1899,27 @@ if(procs==1){
 				kcol = PCOL( gb, grid );
 				if(mycol==kcol) { /* Diagonal process */
 					if (bmod[lk*aln_i]==0){
-						rootsups[nroot]=gb;				
+						rootsups[nroot]=gb;
 						++nroot;
 					}
 				}
 			}
 		}
-	}	
+	}
 
-	#ifdef _OPENMP	
+	#ifdef _OPENMP
 	#pragma omp simd
 	#endif
 	for (i = 0; i < nlb; ++i) bmod[i*aln_i] += brecv[i];
 	// for (i = 0; i < nlb; ++i)printf("bmod[i]: %5d\n",bmod[i]);
-	
+
 
 	if ( !(recvbuf_BC_fwd = (double*)SUPERLU_MALLOC(maxrecvsz*(nbrecvx+1) * sizeof(double))) )  // this needs to be optimized for 1D row mapping
-		ABORT("Malloc fails for recvbuf_BC_fwd[].");	
-	nbrecvx_buf=0;			
+		ABORT("Malloc fails for recvbuf_BC_fwd[].");
+	nbrecvx_buf=0;
+
+	log_memory(nlb*aln_i*iword+nlb*iword + nsupers_i*iword + maxrecvsz*(nbrecvx+1)*dword, stat);	//account for bmod, brecv, rootsups, recvbuf_BC_fwd
 
-	log_memory(nlb*aln_i*iword+nlb*iword + nsupers_i*iword + maxrecvsz*(nbrecvx+1)*dword, stat);	//account for bmod, brecv, rootsups, recvbuf_BC_fwd	
-	
 #if ( DEBUGlevel>=2 )
 	printf("(%2d) nbrecvx %4d,  nbrecvmod %4d,  nroot %4d\n,  nbtree %4d\n,  nrtree %4d\n",
 			iam, nbrecvx, nbrecvmod, nroot, nbtree, nrtree);
@@ -1930,7 +1931,7 @@ if(procs==1){
 	t = SuperLU_timer_() - t;
 	if ( !iam) printf(".. Setup U-solve time\t%8.4f\n", t);
 	fflush(stdout);
-	MPI_Barrier( grid->comm );	
+	MPI_Barrier( grid->comm );
 	t = SuperLU_timer_();
 #endif
 
@@ -1939,35 +1940,35 @@ if(procs==1){
 		 */
 #if ( DEBUGlevel>=2 )
 		printf("(%2d) nroot %4d\n", iam, nroot);
-		fflush(stdout);				
+		fflush(stdout);
 #endif
-		
-		
+
+
 
 #ifdef _OPENMP
-#pragma omp parallel default (shared) 
+#pragma omp parallel default (shared)
 #endif
-	{	
+	{
 #ifdef _OPENMP
 #pragma omp master
 #endif
 		{
 #ifdef _OPENMP
-#pragma	omp	taskloop firstprivate (nrhs,beta,alpha,x,rtemp,ldalsum) private (ii,jj,k,knsupc,lk,luptr,lsub,nsupr,lusup,t1,t2,Uinv,i,lib,rtemp_loc,nroot_send_tmp) nogroup		
-#endif		
+#pragma	omp	taskloop firstprivate (nrhs,beta,alpha,x,rtemp,ldalsum) private (ii,jj,k,knsupc,lk,luptr,lsub,nsupr,lusup,t1,t2,Uinv,i,lib,rtemp_loc,nroot_send_tmp) nogroup
+#endif
 		for (jj=0;jj<nroot;jj++){
-			k=rootsups[jj];	
+			k=rootsups[jj];
 
 #if ( PROFlevel>=1 )
 			TIC(t1);
-#endif	
+#endif
 
 			rtemp_loc = &rtemp[sizertemp* thread_id];
 
 
-			
+
 			knsupc = SuperSize( k );
-			lk = LBi( k, grid ); /* Local block number, row-wise. */		
+			lk = LBi( k, grid ); /* Local block number, row-wise. */
 
 			// bmod[lk] = -1;       /* Do not solve X[k] in the future. */
 			ii = X_BLK( lk );
@@ -1992,22 +1993,22 @@ if(procs==1){
 				dgemm_( "N", "N", &knsupc, &nrhs, &knsupc,
 						&alpha, Uinv, &knsupc, &x[ii],
 						&knsupc, &beta, rtemp_loc, &knsupc );
-#endif			   
+#endif
 				#ifdef _OPENMP
 					#pragma omp simd
 				#endif
 				for (i=0 ; i<knsupc*nrhs ; i++){
 					x[ii+i] = rtemp_loc[i];
-				}		
+				}
 			}else{
 #ifdef _CRAY
 				STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
 						lusup, &nsupr, &x[ii], &knsupc);
 #elif defined (USE_VENDOR_BLAS)
-				dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
-						lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);	
+				dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha,
+						lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
 #else
-				dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+				dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha,
 						lusup, &nsupr, &x[ii], &knsupc);
 #endif
 			}
@@ -2024,7 +2025,7 @@ if(procs==1){
 #if ( PROFlevel>=1 )
 			TOC(t2, t1);
 			stat_loc[thread_id]->utime[SOL_TRSM] += t2;
-#endif	
+#endif
 			stat_loc[thread_id]->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs;
 
 #if ( DEBUGlevel>=2 )
@@ -2035,46 +2036,46 @@ if(procs==1){
 			 * Send Xk to process column Pc[k].
 			 */
 
-			if(UBtree_ptr[lk]!=NULL){ 
+			if(UBtree_ptr[lk]!=NULL){
 #ifdef _OPENMP
 #pragma omp atomic capture
 #endif
 				nroot_send_tmp = ++nroot_send;
 				root_send[(nroot_send_tmp-1)*aln_i] = lk;
-				
+
 			}
 		} /* for k ... */
 	}
 }
 
-		
+
 #ifdef _OPENMP
-#pragma omp parallel default (shared) 
+#pragma omp parallel default (shared)
 #endif
-	{			
+	{
 #ifdef _OPENMP
 #pragma omp master
 #endif
 		{
 #ifdef _OPENMP
-#pragma	omp	taskloop private (ii,jj,k,lk) nogroup		
-#endif		
+#pragma	omp	taskloop private (ii,jj,k,lk) nogroup
+#endif
 		for (jj=0;jj<nroot;jj++){
-			k=rootsups[jj];	
-			lk = LBi( k, grid ); /* Local block number, row-wise. */		
+			k=rootsups[jj];
+			lk = LBi( k, grid ); /* Local block number, row-wise. */
 			ii = X_BLK( lk );
 			lk = LBj( k, grid ); /* Local block number, column-wise */
 
 			/*
 			 * Perform local block modifications: lsum[i] -= U_i,k * X[k]
 			 */
-			if ( Urbs[lk] ) 
-				dlsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, k, bmod, Urbs,Urbs2, 
+			if ( Urbs[lk] )
+				dlsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, k, bmod, Urbs,
 						Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
-						send_req, stat_loc, root_send, &nroot_send, sizelsum,sizertemp,thread_id,num_thread);
-									
+						stat_loc, root_send, &nroot_send, sizelsum,sizertemp,thread_id,num_thread);
+
 		} /* for k ... */
-		
+
 	}
 }
 
@@ -2083,7 +2084,7 @@ for (i=0;i<nroot_send;i++){
 	if(lk>=0){ // this is a bcast forwarding
 		gb = mycol+lk*grid->npcol;  /* not sure */
 		lib = LBi( gb, grid ); /* Local block number, row-wise. */
-		ii = X_BLK( lib );			
+		ii = X_BLK( lib );
 		BcTree_forwardMessageSimple(UBtree_ptr[lk],&x[ii - XK_H],BcTree_GetMsgSize(UBtree_ptr[lk],'d')*nrhs+XK_H,'d');
 	}else{ // this is a reduce forwarding
 		lk = -lk - 1;
@@ -2098,38 +2099,38 @@ for (i=0;i<nroot_send;i++){
 		 */
 
 #ifdef _OPENMP
-#pragma omp parallel default (shared) 
+#pragma omp parallel default (shared)
 #endif
-	{	
+	{
 #ifdef _OPENMP
-#pragma omp master 
-#endif		 
+#pragma omp master
+#endif
 		for ( nbrecv =0; nbrecv<nbrecvx+nbrecvmod;nbrecv++) { /* While not finished. */
 
 			// printf("iam %4d nbrecv %4d nbrecvx %4d nbrecvmod %4d\n", iam, nbrecv, nbrecvxnbrecvmod);
-			// fflush(stdout);			
-			
-			
-			
+			// fflush(stdout);
+
+
+
 			thread_id = 0;
 #if ( PROFlevel>=1 )
 			TIC(t1);
-#endif	
+#endif
 
 			recvbuf0 = &recvbuf_BC_fwd[nbrecvx_buf*maxrecvsz];
 
 			/* Receive a message. */
 			MPI_Recv( recvbuf0, maxrecvsz, MPI_DOUBLE,
-					MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status );	 	
+					MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status );
 
-#if ( PROFlevel>=1 )		 
+#if ( PROFlevel>=1 )
 			TOC(t2, t1);
 			stat_loc[thread_id]->utime[SOL_COMM] += t2;
 
 			msg_cnt += 1;
-			msg_vol += maxrecvsz * dword;			
-#endif	
-		 
+			msg_vol += maxrecvsz * dword;
+#endif
+
 			k = *recvbuf0;
 #if ( DEBUGlevel>=2 )
 			printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
@@ -2139,63 +2140,63 @@ for (i=0;i<nroot_send;i++){
 			if(status.MPI_TAG==BC_U){
 				// --nfrecvx;
 				nbrecvx_buf++;
-				
+
 				lk = LBj( k, grid );    /* local block number */
 
 				if(BcTree_getDestCount(UBtree_ptr[lk],'d')>0){
 
-					BcTree_forwardMessageSimple(UBtree_ptr[lk],recvbuf0,BcTree_GetMsgSize(UBtree_ptr[lk],'d')*nrhs+XK_H,'d');	
+					BcTree_forwardMessageSimple(UBtree_ptr[lk],recvbuf0,BcTree_GetMsgSize(UBtree_ptr[lk],'d')*nrhs+XK_H,'d');
 					// nfrecvx_buf++;
 				}
 
 				/*
 				 * Perform local block modifications: lsum[i] -= L_i,k * X[k]
-				 */	  
+				 */
 
 				lk = LBj( k, grid ); /* Local block number, column-wise. */
-				dlsum_bmod_inv_master(lsum, x, &recvbuf0[XK_H], rtemp, nrhs, k, bmod, Urbs,Urbs2,
-						Ucb_indptr, Ucb_valptr, xsup, grid, Llu, 
-						send_req, stat_loc, sizelsum,sizertemp,thread_id,num_thread);
+				dlsum_bmod_inv_master(lsum, x, &recvbuf0[XK_H], rtemp, nrhs, k, bmod, Urbs,
+						Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
+						stat_loc, sizelsum,sizertemp,thread_id,num_thread);
 			}else if(status.MPI_TAG==RD_U){
 
 				lk = LBi( k, grid ); /* Local block number, row-wise. */
-				
+
 				knsupc = SuperSize( k );
 				tempv = &recvbuf0[LSUM_H];
-				il = LSUM_BLK( lk );		  
+				il = LSUM_BLK( lk );
 				RHS_ITERATE(j) {
 					#ifdef _OPENMP
 						#pragma omp simd
-					#endif				
+					#endif
 					for (i = 0; i < knsupc; ++i)
 						lsum[i + il + j*knsupc + thread_id*sizelsum] += tempv[i + j*knsupc];
-							
-				}					
+
+				}
 			// #ifdef _OPENMP
 			// #pragma omp atomic capture
 			// #endif
 				bmod_tmp=--bmod[lk*aln_i];
-				thread_id = 0;									
+				thread_id = 0;
 				rtemp_loc = &rtemp[sizertemp* thread_id];
 				if ( bmod_tmp==0 ) {
-					if(RdTree_IsRoot(URtree_ptr[lk],'d')==YES){							
-						
+					if(RdTree_IsRoot(URtree_ptr[lk],'d')==YES){
+
 						knsupc = SuperSize( k );
 						for (ii=1;ii<num_thread;ii++)
 							#ifdef _OPENMP
 								#pragma omp simd
-							#endif							
+							#endif
 							for (jj=0;jj<knsupc*nrhs;jj++)
-								lsum[il+ jj ] += lsum[il + jj + ii*sizelsum];	
-								
+								lsum[il+ jj ] += lsum[il + jj + ii*sizelsum];
+
 						ii = X_BLK( lk );
 						RHS_ITERATE(j)
 							#ifdef _OPENMP
 								#pragma omp simd
-							#endif							
-							for (i = 0; i < knsupc; ++i)	
+							#endif
+							for (i = 0; i < knsupc; ++i)
 								x[i + ii + j*knsupc] += lsum[i + il + j*knsupc ];
-					
+
 						lk = LBj( k, grid ); /* Local block number, column-wise. */
 						lsub = Lrowind_bc_ptr[lk];
 						lusup = Lnzval_bc_ptr[lk];
@@ -2217,23 +2218,23 @@ for (i=0;i<nroot_send;i++){
 							dgemm_( "N", "N", &knsupc, &nrhs, &knsupc,
 									&alpha, Uinv, &knsupc, &x[ii],
 									&knsupc, &beta, rtemp_loc, &knsupc );
-#endif		
+#endif
 
 							#ifdef _OPENMP
 								#pragma omp simd
 							#endif
 							for (i=0 ; i<knsupc*nrhs ; i++){
 								x[ii+i] = rtemp_loc[i];
-							}		
+							}
 						}else{
 #ifdef _CRAY
 							STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
 									lusup, &nsupr, &x[ii], &knsupc);
 #elif defined (USE_VENDOR_BLAS)
-							dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
-									lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);		
+							dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha,
+									lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
 #else
-							dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+							dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha,
 									lusup, &nsupr, &x[ii], &knsupc);
 #endif
 						}
@@ -2241,44 +2242,44 @@ for (i=0;i<nroot_send;i++){
 #if ( PROFlevel>=1 )
 							TOC(t2, t1);
 							stat_loc[thread_id]->utime[SOL_TRSM] += t2;
-#endif	
+#endif
 							stat_loc[thread_id]->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs;
-		
+
 #if ( DEBUGlevel>=2 )
 						printf("(%2d) Solve X[%2d]\n", iam, k);
 #endif
 
 						/*
 						 * Send Xk to process column Pc[k].
-						 */						
-						if(UBtree_ptr[lk]!=NULL){ 
+						 */
+						if(UBtree_ptr[lk]!=NULL){
 							BcTree_forwardMessageSimple(UBtree_ptr[lk],&x[ii - XK_H],BcTree_GetMsgSize(UBtree_ptr[lk],'d')*nrhs+XK_H,'d');
-						}							
-						
+						}
+
 
 						/*
-						 * Perform local block modifications: 
+						 * Perform local block modifications:
 						 *         lsum[i] -= U_i,k * X[k]
 						 */
 						if ( Urbs[lk] )
-							dlsum_bmod_inv_master(lsum, x, &x[ii], rtemp, nrhs, k, bmod, Urbs,Urbs2,
+							dlsum_bmod_inv_master(lsum, x, &x[ii], rtemp, nrhs, k, bmod, Urbs,
 									Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
-									send_req, stat_loc, sizelsum,sizertemp,thread_id,num_thread);
+									stat_loc, sizelsum,sizertemp,thread_id,num_thread);
 
 					}else{
-						il = LSUM_BLK( lk );		  
+						il = LSUM_BLK( lk );
 						knsupc = SuperSize( k );
 
 						for (ii=1;ii<num_thread;ii++)
 							#ifdef _OPENMP
 								#pragma omp simd
-							#endif						
+							#endif
 							for (jj=0;jj<knsupc*nrhs;jj++)
-								lsum[il+ jj ] += lsum[il + jj + ii*sizelsum];	
-												
-						RdTree_forwardMessageSimple(URtree_ptr[lk],&lsum[il-LSUM_H],RdTree_GetMsgSize(URtree_ptr[lk],'d')*nrhs+LSUM_H,'d'); 
-					}						
-				
+								lsum[il+ jj ] += lsum[il + jj + ii*sizelsum];
+
+						RdTree_forwardMessageSimple(URtree_ptr[lk],&lsum[il-LSUM_H],RdTree_GetMsgSize(URtree_ptr[lk],'d')*nrhs+LSUM_H,'d');
+					}
+
 				}
 			}
 		} /* while not finished ... */
@@ -2290,10 +2291,10 @@ for (i=0;i<nroot_send;i++){
 		MPI_Reduce (&t, &tmax, 1, MPI_DOUBLE,
 				MPI_MAX, 0, grid->comm);
 		if ( !iam ) {
-			printf(".. U-solve time (MAX) \t%8.4f\n", tmax);	
+			printf(".. U-solve time (MAX) \t%8.4f\n", tmax);
 			fflush(stdout);
-		}			
-		t = SuperLU_timer_();			
+		}
+		t = SuperLU_timer_();
 #endif
 
 
@@ -2329,14 +2330,14 @@ for (i=0;i<nroot_send;i++){
 				ScalePermstruct, Glu_persist, grid, SOLVEstruct);
 
 
-#if ( PRNTlevel>=1 )
+#if ( PRNTlevel>=2 )
 		t = SuperLU_timer_() - t;
 		if ( !iam) printf(".. X to B redistribute time\t%8.4f\n", t);
 		t = SuperLU_timer_();
-#endif	
+#endif
 
 
-		double tmp1=0; 
+		double tmp1=0;
 		double tmp2=0;
 		double tmp3=0;
 		double tmp4=0;
@@ -2347,14 +2348,14 @@ for (i=0;i<nroot_send;i++){
 			tmp4 += stat_loc[i]->ops[SOLVE];
 #if ( PRNTlevel>=2 )
 			if(iam==0)printf("thread %5d gemm %9.5f\n",i,stat_loc[i]->utime[SOL_GEMM]);
-#endif	
+#endif
 		}
 
 
 		stat->utime[SOL_TRSM] += tmp1;
 		stat->utime[SOL_GEMM] += tmp2;
 		stat->utime[SOL_COMM] += tmp3;
-		stat->ops[SOLVE]+= tmp4;	  
+		stat->ops[SOLVE]+= tmp4;
 
 
 		/* Deallocate storage. */
@@ -2362,40 +2363,38 @@ for (i=0;i<nroot_send;i++){
 			PStatFree(stat_loc[i]);
 			SUPERLU_FREE(stat_loc[i]);
 		}
-		SUPERLU_FREE(stat_loc);		
+		SUPERLU_FREE(stat_loc);
 		SUPERLU_FREE(rtemp);
 		SUPERLU_FREE(lsum);
 		SUPERLU_FREE(x);
-		
-		
+
+
 		SUPERLU_FREE(bmod);
 		SUPERLU_FREE(brecv);
 		SUPERLU_FREE(root_send);
-		
+
 		SUPERLU_FREE(rootsups);
-		SUPERLU_FREE(recvbuf_BC_fwd);		
+		SUPERLU_FREE(recvbuf_BC_fwd);
+
+		log_memory(-nlb*aln_i*iword-nlb*iword - nsupers_i*iword - (CEILING( nsupers, Pr )+CEILING( nsupers, Pc ))*aln_i*iword - maxrecvsz*(nbrecvx+1)*dword - sizelsum*num_thread * dword - (ldalsum * nrhs + nlb * XK_H) *dword - (sizertemp*num_thread + 1)*dword, stat);	//account for bmod, brecv, root_send, rootsups, recvbuf_BC_fwd,rtemp,lsum,x
 
-		log_memory(-nlb*aln_i*iword-nlb*iword - nsupers_i*iword - (CEILING( nsupers, Pr )+CEILING( nsupers, Pc ))*aln_i*iword - maxrecvsz*(nbrecvx+1)*dword - sizelsum*num_thread * dword - (ldalsum * nrhs + nlb * XK_H) *dword + (sizertemp*num_thread - 1)*dword, stat);	//account for bmod, brecv, root_send, rootsups, recvbuf_BC_fwd,rtemp,lsum,x			
-		
 		for (lk=0;lk<nsupers_j;++lk){
 			if(UBtree_ptr[lk]!=NULL){
-				// if(BcTree_IsRoot(LBtree_ptr[lk],'d')==YES){			
-				BcTree_waitSendRequest(UBtree_ptr[lk],'d');		
+				// if(BcTree_IsRoot(LBtree_ptr[lk],'d')==YES){
+				BcTree_waitSendRequest(UBtree_ptr[lk],'d');
 				// }
 				// deallocate requests here
 			}
 		}
 
 		for (lk=0;lk<nsupers_i;++lk){
-			if(URtree_ptr[lk]!=NULL){		
-				RdTree_waitSendRequest(URtree_ptr[lk],'d');		
+			if(URtree_ptr[lk]!=NULL){
+				RdTree_waitSendRequest(URtree_ptr[lk],'d');
 				// deallocate requests here
 			}
-		}		
+		}
 		MPI_Barrier( grid->comm );
 
-		/*for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]);*/
-
 
 #if ( PROFlevel>=2 )
 		{
@@ -2417,7 +2416,7 @@ for (i=0;i<nroot_send;i++){
 						msg_vol_sum / Pr / Pc * 1e-6, msg_vol_max * 1e-6);
 			}
 		}
-#endif	
+#endif
 
     stat->utime[SOLVE] = SuperLU_timer_() - t1_sol;
 
@@ -2426,10 +2425,10 @@ for (i=0;i<nroot_send;i++){
 #endif
 
 
-#if ( PRNTlevel>=2 )	
+#if ( PRNTlevel>=2 )
 	    float for_lu, total, max, avg, temp;
 		superlu_dist_mem_usage_t num_mem_usage;
-		
+
 	    dQuerySpace_dist(n, LUstruct, grid, stat, &num_mem_usage);
 	    temp = num_mem_usage.total;
 
@@ -2441,13 +2440,13 @@ for (i=0;i<nroot_send;i++){
 		printf("\n** Memory Usage **********************************\n");
                 printf("** Total highmark (MB):\n"
 		       "    Sum-of-all : %8.2f | Avg : %8.2f  | Max : %8.2f\n",
-		       avg * 1e-6,  
+		       avg * 1e-6,
 		       avg / grid->nprow / grid->npcol * 1e-6,
 		       max * 1e-6);
 		printf("**************************************************\n");
 		fflush(stdout);
             }
-#endif	
+#endif
 
 
     return;
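
Beyond the whitespace cleanup, the substantive changes to pdgstrs.c above drop the unused Urbs2 and send_req variables (and the corresponding arguments in the dlsum_bmod_inv and dlsum_bmod_inv_master calls) and correct the log_memory bookkeeping so that the bytes subtracted when buffers are freed use the same expressions as the bytes added when they were allocated, for example the (sizertemp*num_thread + 1)*dword term for rtemp. A minimal sketch of that symmetric accounting pattern follows; it is illustrative only, and account_bytes is a local stand-in rather than the SuperLU_DIST log_memory interface:

#include <stdio.h>
#include <stdlib.h>

static long long bytes_logged = 0;                  /* running total of logged bytes */

static void account_bytes(long long b) { bytes_logged += b; }   /* stand-in, not log_memory() */

int main(void)
{
    int num_thread = 4, sizertemp = 1024, dword = (int)sizeof(double);
    long long rtemp_bytes = ((long long)sizertemp * num_thread + 1) * dword;

    double *rtemp = malloc((size_t)rtemp_bytes);
    if (!rtemp) return 1;
    account_bytes(+rtemp_bytes);      /* allocation: +(sizertemp*num_thread + 1)*dword */

    /* ... triangular solves would use rtemp here ... */

    free(rtemp);
    account_bytes(-rtemp_bytes);      /* free: the same expression negated, as in the fix */

    printf("net bytes logged: %lld\n", bytes_logged);   /* 0 when accounting is symmetric */
    return 0;
}
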
diff -pruN 6.1.0+dfsg1-1/SRC/pdgstrs_lsum.c 6.1.1+dfsg1-1/SRC/pdgstrs_lsum.c
--- 6.1.0+dfsg1-1/SRC/pdgstrs_lsum.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/pdgstrs_lsum.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,26 +1,27 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
 
-/*! @file 
+/*! @file
  * \brief Perform local block modifications: lsum[i] -= L_i,k * X[k]
  *
  * <pre>
- * -- Distributed SuperLU routine (version 2.0) --
+ * -- Distributed SuperLU routine (version 6.1) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * March 15, 2003
  *
  * Modified:
  *     February 7, 2001    use MPI_Isend/MPI_Irecv
  *     October 2, 2001     use MPI_Isend/MPI_Irecv with MPI_Test
+ * February 8, 2019  version 6.1.1
  * </pre>
  */
 
@@ -29,7 +30,7 @@ at the top-level directory.
 
 #ifndef CACHELINE
 #define CACHELINE 64  /* bytes, Xeon Phi KNL, Cori Haswell, Edison */
-#endif	
+#endif
 
 #define ISEND_IRECV
 
@@ -39,7 +40,7 @@ at the top-level directory.
 #ifdef _CRAY
 fortran void STRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, double*,
 		   double*, int*, double*, int*);
-fortran void SGEMM(_fcd, _fcd, int*, int*, int*, double*, double*, 
+fortran void SGEMM(_fcd, _fcd, int*, int*, int*, double*, double*,
 		   int*, double*, int*, double*, double*, int*);
 _fcd ftcs1;
 _fcd ftcs2;
@@ -91,11 +92,11 @@ void dlsum_fmod
 #if ( PROFlevel>=1 )
 	double t1, t2;
 	float msg_vol = 0, msg_cnt = 0;
-#endif 
+#endif
 #if ( PROFlevel>=1 )
 	TIC(t1);
-#endif	
-	
+#endif
+
     iam = grid->iam;
     myrow = MYROW( iam, grid );
     lk = LBj( k, grid ); /* Local block number, column-wise. */
@@ -120,7 +121,7 @@ void dlsum_fmod
 	       &knsupc, &beta, rtemp, &nbrow );
 #endif
 	stat->ops[SOLVE] += 2 * nbrow * nrhs * knsupc + nbrow * nrhs;
-   
+
 	lk = LBi( ik, grid ); /* Local block number, row-wise. */
 	iknsupc = SuperSize( ik );
 	il = LSUM_BLK( lk );
@@ -137,8 +138,8 @@ void dlsum_fmod
 #if ( PROFlevel>=1 )
 		TOC(t2, t1);
 		stat->utime[SOL_GEMM] += t2;
-#endif		
-	
+#endif
+
 	if ( (--fmod[lk])==0 ) { /* Local accumulation done. */
 	    ikcol = PCOL( ik, grid );
 	    p = PNUM( myrow, ikcol, grid );
@@ -173,27 +174,27 @@ void dlsum_fmod
 		    nsupr1 = lsub1[1];
 #if ( PROFlevel>=1 )
 			TIC(t1);
-#endif				
+#endif
 #ifdef _CRAY
 		    STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &iknsupc, &nrhs, &alpha,
 			  lusup1, &nsupr1, &x[ii], &iknsupc);
 #elif defined (USE_VENDOR_BLAS)
-		    dtrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, 
+		    dtrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha,
 			   lusup1, &nsupr1, &x[ii], &iknsupc, 1, 1, 1, 1);
 #else
-		    dtrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, 
+		    dtrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha,
 			   lusup1, &nsupr1, &x[ii], &iknsupc);
 #endif
 #if ( PROFlevel>=1 )
 			TOC(t2, t1);
 			stat->utime[SOL_TRSM] += t2;
-#endif	
+#endif
 
 		    stat->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs;
 #if ( DEBUGlevel>=2 )
 		    printf("(%2d) Solve X[%2d]\n", iam, ik);
 #endif
-		
+
 		    /*
 		     * Send Xk to process column Pc[k].
 		     */
@@ -347,10 +348,10 @@ void dlsum_bmod
 		    STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &iknsupc, &nrhs, &alpha,
 			  lusup, &nsupr, &x[ii], &iknsupc);
 #elif defined (USE_VENDOR_BLAS)
-		    dtrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha, 
+		    dtrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha,
 			   lusup, &nsupr, &x[ii], &iknsupc, 1, 1, 1, 1);
 #else
-		    dtrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha, 
+		    dtrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha,
 			   lusup, &nsupr, &x[ii], &iknsupc);
 #endif
 		    stat->ops[SOLVE] += iknsupc * (iknsupc + 1) * nrhs;
@@ -430,13 +431,13 @@ void dlsum_fmod_inv
  int_t recurlevel,
  int_t maxsuper,
  int thread_id,
- int num_thread 
+ int num_thread
 )
 {
     double alpha = 1.0, beta = 0.0,malpha=-1.0;
     double *lusup, *lusup1;
     double *dest;
-	double *Linv;/* Inverse of diagonal block */    	
+	double *Linv;/* Inverse of diagonal block */
 	int    iam, iknsupc, myrow, krow, nbrow, nbrow1, nbrow_ref, nsupr, nsupr1, p, pi, idx_r,m;
 	int_t  i, ii,jj, ik, il, ikcol, irow, j, lb, lk, rel, lib,lready;
 	int_t  *lsub, *lsub1, nlb1, lptr1, luptr1,*lloc;
@@ -445,7 +446,7 @@ void dlsum_fmod_inv
     int_t  **fsendx_plist = Llu->fsendx_plist;
 	int_t  luptr_tmp,luptr_tmp1,lptr1_tmp,maxrecvsz, idx_i, idx_v,idx_n,  idx_l, fmod_tmp, lbstart,lbend,nn,Nchunk,nlb_loc,remainder;
 	int thread_id1;
-	flops_t ops_loc=0.0;    	
+	flops_t ops_loc=0.0;
     MPI_Status status;
     int test_flag;
 	yes_no_t done;
@@ -457,22 +458,22 @@ void dlsum_fmod_inv
 	int_t nleaf_send_tmp;
 	int_t lptr;      /* Starting position in lsub[*].                      */
 	int_t luptr;     /* Starting position in lusup[*].                     */
-	int_t iword = sizeof(int_t);	
-	int_t dword = sizeof (double);		
+	int_t iword = sizeof(int_t);
+	int_t dword = sizeof (double);
 	int_t aln_d,aln_i;
 	aln_d = ceil(CACHELINE/(double)dword);
 	aln_i = ceil(CACHELINE/(double)iword);
 	int   knsupc;    /* Size of supernode k.                               */
 	int_t nlb;       /* Number of L blocks.                                */
-	
-	
+
+
 	knsupc = SuperSize( k );
-	
+
 	lk = LBj( k, grid ); /* Local block number, column-wise. */
 	lsub = Llu->Lrowind_bc_ptr[lk];
 	nlb = lsub[0] - 1;
-	
-	
+
+
 	ldalsum=Llu->ldalsum;
 
 	rtemp_loc = &rtemp[sizertemp* thread_id];
@@ -480,7 +481,7 @@ void dlsum_fmod_inv
 	// #if ( PROFlevel>=1 )
 	double t1, t2, t3, t4;
 	float msg_vol = 0, msg_cnt = 0;
-	// #endif 
+	// #endif
 
 	if(nlb>0){
 
@@ -511,9 +512,9 @@ void dlsum_fmod_inv
 		}
 
 		assert(m>0);
-				
-		if(m>8*maxsuper){ 
-		// if(0){ 
+
+		if(m>8*maxsuper){
+		// if(0){
 
 			// Nchunk=floor(num_thread/2.0)+1;
 			Nchunk=SUPERLU_MIN(num_thread,nlb);
@@ -522,15 +523,15 @@ void dlsum_fmod_inv
 			remainder = nlb % Nchunk;
 
 #ifdef _OPENMP
-#pragma	omp	taskloop private (lptr1,luptr1,nlb1,thread_id1,lsub1,lusup1,nsupr1,Linv,nn,lbstart,lbend,luptr_tmp1,nbrow,lb,lptr1_tmp,rtemp_loc,nbrow_ref,lptr,nbrow1,ik,rel,lk,iknsupc,il,i,irow,fmod_tmp,ikcol,p,ii,jj,t1,t2,j,nleaf_send_tmp) untied nogroup	
-#endif	
+#pragma	omp	taskloop private (lptr1,luptr1,nlb1,thread_id1,lsub1,lusup1,nsupr1,Linv,nn,lbstart,lbend,luptr_tmp1,nbrow,lb,lptr1_tmp,rtemp_loc,nbrow_ref,lptr,nbrow1,ik,rel,lk,iknsupc,il,i,irow,fmod_tmp,ikcol,p,ii,jj,t1,t2,j,nleaf_send_tmp) untied nogroup
+#endif
 			for (nn=0;nn<Nchunk;++nn){
 
-#ifdef _OPENMP				 
+#ifdef _OPENMP
 				thread_id1 = omp_get_thread_num ();
 #else
 				thread_id1 = 0;
-#endif		
+#endif
 				rtemp_loc = &rtemp[sizertemp* thread_id1];
 
 				if(nn<remainder){
@@ -545,14 +546,14 @@ void dlsum_fmod_inv
 
 #if ( PROFlevel>=1 )
 					TIC(t1);
-#endif				
+#endif
 					luptr_tmp1 = lloc[lbstart+idx_v];
 					nbrow=0;
-					for (lb = lbstart; lb < lbend; ++lb){ 		
-						lptr1_tmp = lloc[lb+idx_i];		
+					for (lb = lbstart; lb < lbend; ++lb){
+						lptr1_tmp = lloc[lb+idx_i];
 						nbrow += lsub[lptr1_tmp+1];
 					}
-					
+
 				#ifdef _CRAY
 					SGEMM( ftcs2, ftcs2, &nbrow, &nrhs, &knsupc,
 						  &alpha, &lusup[luptr_tmp1], &nsupr, xk,
@@ -568,22 +569,22 @@ void dlsum_fmod_inv
 				#endif
 
 					nbrow_ref=0;
-					for (lb = lbstart; lb < lbend; ++lb){ 		
-						lptr1_tmp = lloc[lb+idx_i];	
-						lptr= lptr1_tmp+2;	
+					for (lb = lbstart; lb < lbend; ++lb){
+						lptr1_tmp = lloc[lb+idx_i];
+						lptr= lptr1_tmp+2;
 						nbrow1 = lsub[lptr1_tmp+1];
 						ik = lsub[lptr1_tmp]; /* Global block number, row-wise. */
 						rel = xsup[ik]; /* Global row index of block ik. */
-	
-						lk = LBi( ik, grid ); /* Local block number, row-wise. */	
+
+						lk = LBi( ik, grid ); /* Local block number, row-wise. */
 
 						iknsupc = SuperSize( ik );
 						il = LSUM_BLK( lk );
 
 						RHS_ITERATE(j)
 							#ifdef _OPENMP
-							#pragma omp simd							
-							#endif						
+							#pragma omp simd
+							#endif
 							for (i = 0; i < nbrow1; ++i) {
 								irow = lsub[lptr+i] - rel; /* Relative row. */
 								lsum[il+irow + j*iknsupc+sizelsum*thread_id1] -= rtemp_loc[nbrow_ref+i + j*nbrow];
@@ -594,7 +595,7 @@ void dlsum_fmod_inv
 #if ( PROFlevel>=1 )
 					TOC(t2, t1);
 					stat[thread_id1]->utime[SOL_GEMM] += t2;
-#endif	
+#endif
 
 					for (lb=lbstart;lb<lbend;lb++){
 						lk = lloc[lb+idx_n];
@@ -605,10 +606,10 @@ void dlsum_fmod_inv
 
 						if ( fmod_tmp==0 ) { /* Local accumulation done. */
 
-							lptr1_tmp = lloc[lb+idx_i];	
+							lptr1_tmp = lloc[lb+idx_i];
 
 							ik = lsub[lptr1_tmp]; /* Global block number, row-wise. */
-							lk = LBi( ik, grid ); /* Local block number, row-wise. */	
+							lk = LBi( ik, grid ); /* Local block number, row-wise. */
 
 							iknsupc = SuperSize( ik );
 							il = LSUM_BLK( lk );
@@ -618,26 +619,26 @@ void dlsum_fmod_inv
 							if ( iam != p ) {
 								for (ii=1;ii<num_thread;ii++)
 									#ifdef _OPENMP
-									#pragma omp simd							
+									#pragma omp simd
 									#endif
 									for (jj=0;jj<iknsupc*nrhs;jj++)
 										lsum[il + jj ] += lsum[il + jj + ii*sizelsum];
-								
+
 #ifdef _OPENMP
 #pragma omp atomic capture
 #endif
 								nleaf_send_tmp = ++nleaf_send[0];
-								leaf_send[(nleaf_send_tmp-1)*aln_i] = -lk-1;	
+								leaf_send[(nleaf_send_tmp-1)*aln_i] = -lk-1;
 								// RdTree_forwardMessageSimple(LRtree_ptr[lk],&lsum[il - LSUM_H ],'d');
 
 							} else { /* Diagonal process: X[i] += lsum[i]. */
 
 #if ( PROFlevel>=1 )
 								TIC(t1);
-#endif		
+#endif
 								for (ii=1;ii<num_thread;ii++)
 									#ifdef _OPENMP
-									#pragma omp simd							
+									#pragma omp simd
 									#endif
 									for (jj=0;jj<iknsupc*nrhs;jj++)
 										lsum[il + jj ] += lsum[il + jj + ii*sizelsum];
@@ -645,11 +646,11 @@ void dlsum_fmod_inv
 								ii = X_BLK( lk );
 								RHS_ITERATE(j)
 									#ifdef _OPENMP
-									#pragma omp simd							
-									#endif								
-									for (i = 0; i < iknsupc; ++i)	
+									#pragma omp simd
+									#endif
+									for (i = 0; i < iknsupc; ++i)
 										x[i + ii + j*iknsupc] += lsum[i + il + j*iknsupc ];
-										
+
 
 								// fmod[lk] = -1; /* Do not solve X[k] in the future. */
 								lk = LBj( ik, grid );/* Local block number, column-wise. */
@@ -659,8 +660,8 @@ void dlsum_fmod_inv
 
 								if(Llu->inv == 1){
 									Linv = Llu->Linv_bc_ptr[lk];
-									
-									
+
+
 #ifdef _CRAY
 									SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc,
 											&alpha, Linv, &iknsupc, &x[ii],
@@ -673,25 +674,25 @@ void dlsum_fmod_inv
 									dgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc,
 											&alpha, Linv, &iknsupc, &x[ii],
 											&iknsupc, &beta, rtemp_loc, &iknsupc );
-#endif 
+#endif
 									#ifdef _OPENMP
-									#pragma omp simd							
-									#endif 
+									#pragma omp simd
+									#endif
 									for (i=0 ; i<iknsupc*nrhs ; i++){
 										x[ii+i] = rtemp_loc[i];
 									}
-									
+
 								}else{
 #ifdef _CRAY
 									STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &iknsupc, &nrhs, &alpha,
 											lusup1, &nsupr1, &x[ii], &iknsupc);
 #elif defined (USE_VENDOR_BLAS)
-									dtrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, 
-											lusup1, &nsupr1, &x[ii], &iknsupc, 1, 1, 1, 1);		   
+									dtrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha,
+											lusup1, &nsupr1, &x[ii], &iknsupc, 1, 1, 1, 1);
 #else
-									dtrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, 
+									dtrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha,
 											lusup1, &nsupr1, &x[ii], &iknsupc);
-	  
+
 #endif
 								}
 								// for (i=0 ; i<iknsupc*nrhs ; i++){
@@ -703,13 +704,13 @@ void dlsum_fmod_inv
 								TOC(t2, t1);
 								stat[thread_id1]->utime[SOL_TRSM] += t2;
 
-#endif	
-								
+#endif
+
 								stat[thread_id1]->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs;
-								
+
 #if ( DEBUGlevel>=2 )
 								printf("(%2d) Solve X[%2d]\n", iam, ik);
-													
+
 #endif
 
 								/*
@@ -729,28 +730,28 @@ void dlsum_fmod_inv
 								 */
 
 								// #ifdef _OPENMP
-								// #pragma	omp	task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) 	
+								// #pragma	omp	task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1)
 								// #endif
 								{
-					
+
 									dlsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, ik,
 											fmod, xsup,
 											grid, Llu, stat, leaf_send, nleaf_send ,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id1,num_thread);
-								}		   
+								}
 
 								// } /* if frecv[lk] == 0 */
 						} /* if iam == p */
-					} /* if fmod[lk] == 0 */				
+					} /* if fmod[lk] == 0 */
 				}
 
 			}
 		}
 
-		}else{ 
+		}else{
 
 #if ( PROFlevel>=1 )
 			TIC(t1);
-#endif	
+#endif
 
 #ifdef _CRAY
 			SGEMM( ftcs2, ftcs2, &m, &nrhs, &knsupc,
@@ -764,44 +765,44 @@ void dlsum_fmod_inv
 			dgemm_( "N", "N", &m, &nrhs, &knsupc,
 					&alpha, &lusup[luptr_tmp], &nsupr, xk,
 					&knsupc, &beta, rtemp_loc, &m );
-#endif   	
-			
+#endif
+
 			nbrow=0;
-			for (lb = 0; lb < nlb; ++lb){ 		
-				lptr1_tmp = lloc[lb+idx_i];		
+			for (lb = 0; lb < nlb; ++lb){
+				lptr1_tmp = lloc[lb+idx_i];
 				nbrow += lsub[lptr1_tmp+1];
-			}			
+			}
 			nbrow_ref=0;
-			for (lb = 0; lb < nlb; ++lb){ 		
-				lptr1_tmp = lloc[lb+idx_i];	
-				lptr= lptr1_tmp+2;	
+			for (lb = 0; lb < nlb; ++lb){
+				lptr1_tmp = lloc[lb+idx_i];
+				lptr= lptr1_tmp+2;
 				nbrow1 = lsub[lptr1_tmp+1];
 				ik = lsub[lptr1_tmp]; /* Global block number, row-wise. */
 				rel = xsup[ik]; /* Global row index of block ik. */
 
-				lk = LBi( ik, grid ); /* Local block number, row-wise. */	
+				lk = LBi( ik, grid ); /* Local block number, row-wise. */
 
 				iknsupc = SuperSize( ik );
 				il = LSUM_BLK( lk );
 
 				RHS_ITERATE(j)
 					#ifdef _OPENMP
-					#pragma omp simd							
-					#endif					
+					#pragma omp simd
+					#endif
 					for (i = 0; i < nbrow1; ++i) {
 						irow = lsub[lptr+i] - rel; /* Relative row. */
 
 								lsum[il+irow + j*iknsupc+sizelsum*thread_id] -= rtemp_loc[nbrow_ref+i + j*nbrow];
 					}
 				nbrow_ref+=nbrow1;
-			}			
-			
+			}
+
 			// TOC(t3, t1);
 
 #if ( PROFlevel>=1 )
 			TOC(t2, t1);
 			stat[thread_id]->utime[SOL_GEMM] += t2;
-#endif		
+#endif
 
 			for (lb=0;lb<nlb;lb++){
 				lk = lloc[lb+idx_n];
@@ -814,10 +815,10 @@ void dlsum_fmod_inv
 
 				if ( fmod_tmp==0 ) { /* Local accumulation done. */
 
-					lptr1_tmp = lloc[lb+idx_i];	
+					lptr1_tmp = lloc[lb+idx_i];
 
 					ik = lsub[lptr1_tmp]; /* Global block number, row-wise. */
-					lk = LBi( ik, grid ); /* Local block number, row-wise. */	
+					lk = LBi( ik, grid ); /* Local block number, row-wise. */
 
 					iknsupc = SuperSize( ik );
 					il = LSUM_BLK( lk );
@@ -826,37 +827,37 @@ void dlsum_fmod_inv
 					if ( iam != p ) {
 						for (ii=1;ii<num_thread;ii++)
 							#ifdef _OPENMP
-							#pragma omp simd							
+							#pragma omp simd
 							#endif
 							for (jj=0;jj<iknsupc*nrhs;jj++)
 								lsum[il + jj ] += lsum[il + jj + ii*sizelsum];
-								
+
 #ifdef _OPENMP
 #pragma omp atomic capture
 #endif
 						nleaf_send_tmp = ++nleaf_send[0];
-						leaf_send[(nleaf_send_tmp-1)*aln_i] = -lk-1;						
+						leaf_send[(nleaf_send_tmp-1)*aln_i] = -lk-1;
 
 					} else { /* Diagonal process: X[i] += lsum[i]. */
 
 #if ( PROFlevel>=1 )
 						TIC(t1);
-#endif		
+#endif
 						for (ii=1;ii<num_thread;ii++)
 							#ifdef _OPENMP
-							#pragma omp simd							
+							#pragma omp simd
 							#endif
 							for (jj=0;jj<iknsupc*nrhs;jj++)
 								lsum[il + jj ] += lsum[il + jj + ii*sizelsum];
-					
+
 						ii = X_BLK( lk );
 						RHS_ITERATE(j)
 							#ifdef _OPENMP
-							#pragma omp simd							
-							#endif	
-							for (i = 0; i < iknsupc; ++i)	
+							#pragma omp simd
+							#endif
+							for (i = 0; i < iknsupc; ++i)
 								x[i + ii + j*iknsupc] += lsum[i + il + j*iknsupc ];
-								
+
 
 						lk = LBj( ik, grid );/* Local block number, column-wise. */
 						lsub1 = Llu->Lrowind_bc_ptr[lk];
@@ -879,34 +880,34 @@ void dlsum_fmod_inv
 									&iknsupc, &beta, rtemp_loc, &iknsupc );
 #endif
 							#ifdef _OPENMP
-							#pragma omp simd							
-							#endif   
+							#pragma omp simd
+							#endif
 							for (i=0 ; i<iknsupc*nrhs ; i++){
 								x[ii+i] = rtemp_loc[i];
-							}		
+							}
 						}else{
 #ifdef _CRAY
 							STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &iknsupc, &nrhs, &alpha,
 									lusup1, &nsupr1, &x[ii], &iknsupc);
 #elif defined (USE_VENDOR_BLAS)
-							dtrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, 
-									lusup1, &nsupr1, &x[ii], &iknsupc, 1, 1, 1, 1);		   
+							dtrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha,
+									lusup1, &nsupr1, &x[ii], &iknsupc, 1, 1, 1, 1);
 #else
-							dtrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, 
+							dtrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha,
 									lusup1, &nsupr1, &x[ii], &iknsupc);
 #endif
 						}
-						
+
 							// for (i=0 ; i<iknsupc*nrhs ; i++){
 							// printf("x_lsum: %f\n",x[ii+i]);
 							// fflush(stdout);
 							// }
-						
+
 
 #if ( PROFlevel>=1 )
 						TOC(t2, t1);
 						stat[thread_id]->utime[SOL_TRSM] += t2;
-#endif	
+#endif
 
 						stat[thread_id]->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs;
 
@@ -934,25 +935,25 @@ void dlsum_fmod_inv
 						 */
 
 						// #ifdef _OPENMP
-						// #pragma	omp	task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,send_req,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1) untied priority(1) 	
+						// #pragma	omp	task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1) untied priority(1)
 						// #endif
 
 						{
 							dlsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, ik,
 									fmod, xsup,
 									grid, Llu, stat, leaf_send, nleaf_send ,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id,num_thread);
-						}		   
+						}
 
 						// } /* if frecv[lk] == 0 */
 				} /* if iam == p */
-			} /* if fmod[lk] == 0 */				
+			} /* if fmod[lk] == 0 */
 		}
 		// }
 }
 
 	stat[thread_id]->ops[SOLVE] += 2 * m * nrhs * knsupc;
 
-	
+
 
 } /* if nlb>0*/
 } /* dLSUM_FMOD_INV */
@@ -993,7 +994,7 @@ void dlsum_fmod_inv_master
     double alpha = 1.0, beta = 0.0,malpha=-1.0;
     double *lusup, *lusup1;
     double *dest;
-	double *Linv;/* Inverse of diagonal block */    	
+	double *Linv;/* Inverse of diagonal block */
 	int    iam, iknsupc, myrow, krow, nbrow, nbrow1, nbrow_ref, nsupr, nsupr1, p, pi, idx_r;
 	int_t  i, ii,jj, ik, il, ikcol, irow, j, lb, lk, rel, lib,lready;
 	int_t  *lsub, *lsub1, nlb1, lptr1, luptr1,*lloc;
@@ -1002,8 +1003,8 @@ void dlsum_fmod_inv_master
     int_t  **fsendx_plist = Llu->fsendx_plist;
 	int_t  luptr_tmp,luptr_tmp1,lptr1_tmp,maxrecvsz, idx_i, idx_v,idx_n,  idx_l, fmod_tmp, lbstart,lbend,nn,Nchunk,nlb_loc,remainder;
 	int thread_id1;
-	int m;	
-	flops_t ops_loc=0.0;    	
+	int m;
+	flops_t ops_loc=0.0;
     MPI_Status status;
     int test_flag;
 	yes_no_t done;
@@ -1011,12 +1012,12 @@ void dlsum_fmod_inv_master
 	RdTree  *LRtree_ptr = Llu->LRtree_ptr;
 	int_t* idx_lsum,idx_lsum1;
 	double *rtemp_loc;
-	int_t ldalsum;	
+	int_t ldalsum;
 	int_t nleaf_send_tmp;
 	int_t lptr;      /* Starting position in lsub[*].                      */
 	int_t luptr;     /* Starting position in lusup[*].                     */
-	int_t iword = sizeof(int_t);	
-	int_t dword = sizeof (double);		
+	int_t iword = sizeof(int_t);
+	int_t dword = sizeof (double);
 	int_t aln_d,aln_i;
 	aln_d = ceil(CACHELINE/(double)dword);
 	aln_i = ceil(CACHELINE/(double)iword);
@@ -1028,7 +1029,7 @@ void dlsum_fmod_inv_master
 	// #if ( PROFlevel>=1 )
 	double t1, t2, t3, t4;
 	float msg_vol = 0, msg_cnt = 0;
-	// #endif 
+	// #endif
 
 	if(nlb>0){
 
@@ -1037,12 +1038,12 @@ void dlsum_fmod_inv_master
 		lk = LBj( k, grid ); /* Local block number, column-wise. */
 
 		// printf("ya1 %5d k %5d lk %5d\n",thread_id,k,lk);
-		// fflush(stdout);	
+		// fflush(stdout);
 
 		lsub = Llu->Lrowind_bc_ptr[lk];
 
 		// printf("ya2 %5d k %5d lk %5d\n",thread_id,k,lk);
-		// fflush(stdout);	
+		// fflush(stdout);
 
 		lusup = Llu->Lnzval_bc_ptr[lk];
 		lloc = Llu->Lindval_loc_bc_ptr[lk];
@@ -1069,8 +1070,8 @@ void dlsum_fmod_inv_master
 		}
 
 		assert(m>0);
-				
-		if(m>4*maxsuper || nrhs>10){ 
+
+		if(m>4*maxsuper || nrhs>10){
 			// if(m<1){
 			// TIC(t1);
 			Nchunk=num_thread;
@@ -1079,14 +1080,14 @@ void dlsum_fmod_inv_master
 
 #ifdef _OPENMP
 #pragma	omp	taskloop private (lptr1,luptr1,nlb1,thread_id1,lsub1,lusup1,nsupr1,Linv,nn,lbstart,lbend,luptr_tmp1,nbrow,lb,lptr1_tmp,rtemp_loc,nbrow_ref,lptr,nbrow1,ik,rel,lk,iknsupc,il,i,irow,fmod_tmp,ikcol,p,ii,jj,t1,t2,j) untied
-#endif	
+#endif
 			for (nn=0;nn<Nchunk;++nn){
 
-#ifdef _OPENMP				 
+#ifdef _OPENMP
 				thread_id1 = omp_get_thread_num ();
 #else
 				thread_id1 = 0;
-#endif		
+#endif
 				rtemp_loc = &rtemp[sizertemp* thread_id1];
 
 				if(nn<remainder){
@@ -1101,14 +1102,14 @@ void dlsum_fmod_inv_master
 
 #if ( PROFlevel>=1 )
 					TIC(t1);
-#endif				
+#endif
 					luptr_tmp1 = lloc[lbstart+idx_v];
 					nbrow=0;
-					for (lb = lbstart; lb < lbend; ++lb){ 		
-						lptr1_tmp = lloc[lb+idx_i];		
+					for (lb = lbstart; lb < lbend; ++lb){
+						lptr1_tmp = lloc[lb+idx_i];
 						nbrow += lsub[lptr1_tmp+1];
 					}
-					
+
 				#ifdef _CRAY
 					SGEMM( ftcs2, ftcs2, &nbrow, &nrhs, &knsupc,
 						  &alpha, &lusup[luptr_tmp1], &nsupr, xk,
@@ -1124,22 +1125,22 @@ void dlsum_fmod_inv_master
 				#endif
 
 					nbrow_ref=0;
-					for (lb = lbstart; lb < lbend; ++lb){ 		
-						lptr1_tmp = lloc[lb+idx_i];	
-						lptr= lptr1_tmp+2;	
+					for (lb = lbstart; lb < lbend; ++lb){
+						lptr1_tmp = lloc[lb+idx_i];
+						lptr= lptr1_tmp+2;
 						nbrow1 = lsub[lptr1_tmp+1];
 						ik = lsub[lptr1_tmp]; /* Global block number, row-wise. */
 						rel = xsup[ik]; /* Global row index of block ik. */
-	
-						lk = LBi( ik, grid ); /* Local block number, row-wise. */	
+
+						lk = LBi( ik, grid ); /* Local block number, row-wise. */
 
 						iknsupc = SuperSize( ik );
 						il = LSUM_BLK( lk );
 
-						RHS_ITERATE(j)	
-							#ifdef _OPENMP	
+						RHS_ITERATE(j)
+							#ifdef _OPENMP
 								#pragma omp simd lastprivate(irow)
-							#endif							
+							#endif
 							for (i = 0; i < nbrow1; ++i) {
 								irow = lsub[lptr+i] - rel; /* Relative row. */
 								lsum[il+irow + j*iknsupc] -= rtemp_loc[nbrow_ref+i + j*nbrow];
@@ -1150,15 +1151,15 @@ void dlsum_fmod_inv_master
 #if ( PROFlevel>=1 )
 					TOC(t2, t1);
 					stat[thread_id1]->utime[SOL_GEMM] += t2;
-#endif	
+#endif
 			}
 		}
 
-		}else{ 
+		}else{
 
 #if ( PROFlevel>=1 )
 			TIC(t1);
-#endif	
+#endif
 
 #ifdef _CRAY
 			SGEMM( ftcs2, ftcs2, &m, &nrhs, &knsupc,
@@ -1172,42 +1173,42 @@ void dlsum_fmod_inv_master
 			dgemm_( "N", "N", &m, &nrhs, &knsupc,
 					&alpha, &lusup[luptr_tmp], &nsupr, xk,
 					&knsupc, &beta, rtemp_loc, &m );
-#endif   	
-			
+#endif
+
 			nbrow=0;
-			for (lb = 0; lb < nlb; ++lb){ 		
-				lptr1_tmp = lloc[lb+idx_i];		
+			for (lb = 0; lb < nlb; ++lb){
+				lptr1_tmp = lloc[lb+idx_i];
 				nbrow += lsub[lptr1_tmp+1];
-			}			
+			}
 			nbrow_ref=0;
-			for (lb = 0; lb < nlb; ++lb){ 		
-				lptr1_tmp = lloc[lb+idx_i];	
-				lptr= lptr1_tmp+2;	
+			for (lb = 0; lb < nlb; ++lb){
+				lptr1_tmp = lloc[lb+idx_i];
+				lptr= lptr1_tmp+2;
 				nbrow1 = lsub[lptr1_tmp+1];
 				ik = lsub[lptr1_tmp]; /* Global block number, row-wise. */
 				rel = xsup[ik]; /* Global row index of block ik. */
 
-				lk = LBi( ik, grid ); /* Local block number, row-wise. */	
+				lk = LBi( ik, grid ); /* Local block number, row-wise. */
 
 				iknsupc = SuperSize( ik );
 				il = LSUM_BLK( lk );
 
 				RHS_ITERATE(j)
-					#ifdef _OPENMP	
+					#ifdef _OPENMP
 						#pragma omp simd lastprivate(irow)
-					#endif					
+					#endif
 					for (i = 0; i < nbrow1; ++i) {
 						irow = lsub[lptr+i] - rel; /* Relative row. */
 
 								lsum[il+irow + j*iknsupc+sizelsum*thread_id] -= rtemp_loc[nbrow_ref+i + j*nbrow];
 					}
 				nbrow_ref+=nbrow1;
-			}			
+			}
 #if ( PROFlevel>=1 )
 			TOC(t2, t1);
 			stat[thread_id]->utime[SOL_GEMM] += t2;
-#endif	
-		}	
+#endif
+		}
 			// TOC(t3, t1);
 		rtemp_loc = &rtemp[sizertemp* thread_id];
 
@@ -1224,11 +1225,11 @@ void dlsum_fmod_inv_master
 				// --fmod[lk];
 
 
-				lptr1_tmp = lloc[lb+idx_i];	
+				lptr1_tmp = lloc[lb+idx_i];
 				// luptr_tmp = lloc[lb+idx_v];
 
 				ik = lsub[lptr1_tmp]; /* Global block number, row-wise. */
-				lk = LBi( ik, grid ); /* Local block number, row-wise. */	
+				lk = LBi( ik, grid ); /* Local block number, row-wise. */
 
 				iknsupc = SuperSize( ik );
 				il = LSUM_BLK( lk );
@@ -1243,9 +1244,9 @@ void dlsum_fmod_inv_master
 
 					for (ii=1;ii<num_thread;ii++)
 						// if(ii!=thread_id)
-						#ifdef _OPENMP	
+						#ifdef _OPENMP
 							#pragma omp simd
-						#endif							
+						#endif
 						for (jj=0;jj<iknsupc*nrhs;jj++)
 							lsum[il + jj ] += lsum[il + jj + ii*sizelsum];
 
@@ -1261,22 +1262,22 @@ void dlsum_fmod_inv_master
 
 #if ( PROFlevel>=1 )
 					TIC(t1);
-#endif		
+#endif
 					for (ii=1;ii<num_thread;ii++)
 						// if(ii!=thread_id)
-						#ifdef _OPENMP	
+						#ifdef _OPENMP
 							#pragma omp simd
-						#endif						
+						#endif
 						for (jj=0;jj<iknsupc*nrhs;jj++)
 							lsum[il + jj ] += lsum[il + jj + ii*sizelsum];
 
 					ii = X_BLK( lk );
 					// for (jj=0;jj<num_thread;jj++)
 					RHS_ITERATE(j)
-						#ifdef _OPENMP	
-							#pragma omp simd 
-						#endif						
-						for (i = 0; i < iknsupc; ++i)	
+						#ifdef _OPENMP
+							#pragma omp simd
+						#endif
+						for (i = 0; i < iknsupc; ++i)
 							x[i + ii + j*iknsupc] += lsum[i + il + j*iknsupc ];
 
 					// fmod[lk] = -1; /* Do not solve X[k] in the future. */
@@ -1300,21 +1301,21 @@ void dlsum_fmod_inv_master
 								&alpha, Linv, &iknsupc, &x[ii],
 								&iknsupc, &beta, rtemp_loc, &iknsupc );
 #endif
-						#ifdef _OPENMP	
-							#pragma omp simd 
-						#endif	   
+						#ifdef _OPENMP
+							#pragma omp simd
+						#endif
 						for (i=0 ; i<iknsupc*nrhs ; i++){
 										x[ii+i] = rtemp_loc[i];
-						}		
+						}
 					}else{
 #ifdef _CRAY
 						STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &iknsupc, &nrhs, &alpha,
 								lusup1, &nsupr1, &x[ii], &iknsupc);
 #elif defined (USE_VENDOR_BLAS)
-						dtrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, 
-								lusup1, &nsupr1, &x[ii], &iknsupc, 1, 1, 1, 1);		   
+						dtrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha,
+								lusup1, &nsupr1, &x[ii], &iknsupc, 1, 1, 1, 1);
 #else
-						dtrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, 
+						dtrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha,
 								lusup1, &nsupr1, &x[ii], &iknsupc);
 #endif
 					}
@@ -1327,10 +1328,10 @@ void dlsum_fmod_inv_master
 					TOC(t2, t1);
 					stat[thread_id]->utime[SOL_TRSM] += t2;
 
-#endif	
+#endif
 
 					stat[thread_id]->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs;
-					
+
 #if ( DEBUGlevel>=2 )
 					printf("(%2d) Solve X[%2d]\n", iam, ik);
 #endif
@@ -1347,7 +1348,7 @@ void dlsum_fmod_inv_master
 					 */
 
 					// #ifdef _OPENMP
-					// #pragma	omp	task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,send_req,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) 	
+					// #pragma	omp	task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1)
 					// #endif
 					{
 						nlb1 = lsub1[0] - 1;
@@ -1356,11 +1357,11 @@ void dlsum_fmod_inv_master
 						dlsum_fmod_inv_master(lsum, x, &x[ii], rtemp, nrhs, iknsupc, ik,
 								fmod, nlb1, xsup,
 								grid, Llu, stat,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id,num_thread);
-					}		   
+					}
 
 					// } /* if frecv[lk] == 0 */
 				} /* if iam == p */
-			} /* if fmod[lk] == 0 */				
+			} /* if fmod[lk] == 0 */
 		}
 		// }
 		stat[thread_id]->ops[SOLVE] += 2 * m * nrhs * knsupc;
@@ -1381,16 +1382,14 @@ void dlsum_bmod_inv
  int_t  k,            /* The k-th component of X.                       */
  int_t  *bmod,        /* Modification count for L-solve.                */
  int_t  *Urbs,        /* Number of row blocks in each block column of U.*/
- int_t  *Urbs2,
  Ucb_indptr_t **Ucb_indptr,/* Vertical linked list pointing to Uindex[].*/
  int_t  **Ucb_valptr, /* Vertical linked list pointing to Unzval[].     */
  int_t  *xsup,
  gridinfo_t *grid,
  LocalLU_t *Llu,
- MPI_Request send_req[], /* input/output */
  SuperLUStat_t **stat,
- int_t* root_send, 
- int_t* nroot_send, 
+ int_t* root_send,
+ int_t* nroot_send,
  int_t sizelsum,
  int_t sizertemp,
  int thread_id,
@@ -1414,34 +1413,34 @@ void dlsum_bmod_inv
 	int_t  *brecv = Llu->brecv;
 	int_t  **bsendx_plist = Llu->bsendx_plist;
 	BcTree  *UBtree_ptr = Llu->UBtree_ptr;
-	RdTree  *URtree_ptr = Llu->URtree_ptr;	
+	RdTree  *URtree_ptr = Llu->URtree_ptr;
 	MPI_Status status;
 	int test_flag;
 	int_t bmod_tmp;
 	int thread_id1;
 	double *rtemp_loc;
-	int_t nroot_send_tmp;	
-	double *Uinv;/* Inverse of diagonal block */    
+	int_t nroot_send_tmp;
+	double *Uinv;/* Inverse of diagonal block */
 	double temp;
 	double t1, t2;
 	float msg_vol = 0, msg_cnt = 0;
-	int_t Nchunk, nub_loc,remainder,nn,lbstart,lbend;  
-	int_t iword = sizeof(int_t);	
-	int_t dword = sizeof (double);		
+	int_t Nchunk, nub_loc,remainder,nn,lbstart,lbend;
+	int_t iword = sizeof(int_t);
+	int_t dword = sizeof (double);
 	int_t aln_d,aln_i;
 	aln_d = ceil(CACHELINE/(double)dword);
-	aln_i = ceil(CACHELINE/(double)iword);	
+	aln_i = ceil(CACHELINE/(double)iword);
+
 
-	
 	iam = grid->iam;
 	myrow = MYROW( iam, grid );
 	knsupc = SuperSize( k );
 	lk = LBj( k, grid ); /* Local block number, column-wise. */
-	nub = Urbs[lk];      /* Number of U blocks in block column lk */	
-	
+	nub = Urbs[lk];      /* Number of U blocks in block column lk */
+
 	if(Llu->Unnz[lk]>knsupc*64 || nub>16){
 	// if(nub>num_thread){
-	// if(nub>16){ 
+	// if(nub>16){
 	// // // // if(Urbs2[lk]>num_thread){
 	// if(Urbs2[lk]>0){
 		Nchunk=SUPERLU_MIN(num_thread,nub);
@@ -1449,15 +1448,15 @@ void dlsum_bmod_inv
 		remainder = nub % Nchunk;
 		// printf("Unnz: %5d nub: %5d knsupc: %5d\n",Llu->Unnz[lk],nub,knsupc);
 #ifdef _OPENMP
-#pragma	omp	taskloop firstprivate (send_req,stat) private (thread_id1,Uinv,nn,lbstart,lbend,ub,temp,rtemp_loc,ik,lk1,gik,gikcol,usub,uval,lsub,lusup,iknsupc,il,i,irow,bmod_tmp,p,ii,jj,t1,t2,j,ikfrow,iklrow,dest,y,uptr,fnz,nsupr) untied nogroup	
-#endif	
+#pragma	omp	taskloop firstprivate (stat) private (thread_id1,Uinv,nn,lbstart,lbend,ub,temp,rtemp_loc,ik,lk1,gik,gikcol,usub,uval,lsub,lusup,iknsupc,il,i,irow,bmod_tmp,p,ii,jj,t1,t2,j,ikfrow,iklrow,dest,y,uptr,fnz,nsupr) untied nogroup
+#endif
 		for (nn=0;nn<Nchunk;++nn){
 
-#ifdef _OPENMP				 
+#ifdef _OPENMP
 			thread_id1 = omp_get_thread_num ();
 #else
 			thread_id1 = 0;
-#endif		
+#endif
 			rtemp_loc = &rtemp[sizertemp* thread_id1];
 
 			if(nn<remainder){
@@ -1466,7 +1465,7 @@ void dlsum_bmod_inv
 			}else{
 				lbstart = remainder+nn*nub_loc;
 				lbend = remainder + (nn+1)*nub_loc;
-			}			
+			}
 			for (ub = lbstart; ub < lbend; ++ub){
 				ik = Ucb_indptr[lk][ub].lbnum; /* Local block number, row-wise. */
 				usub = Llu->Ufstnz_br_ptr[ik];
@@ -1481,8 +1480,8 @@ void dlsum_bmod_inv
 
 #if ( PROFlevel>=1 )
 				TIC(t1);
-#endif					
-				
+#endif
+
 				RHS_ITERATE(j) {
 					dest = &lsum[il + j*iknsupc+sizelsum*thread_id1];
 					y = &xk[j*knsupc];
@@ -1492,27 +1491,27 @@ void dlsum_bmod_inv
 						if ( fnz < iklrow ) { /* Nonzero segment. */
 							/* AXPY */
 							#ifdef _OPENMP
-							#pragma omp simd							
-							#endif	
+							#pragma omp simd
+							#endif
 							for (irow = fnz; irow < iklrow; ++irow)
 								dest[irow - ikfrow] -= uval[uptr++] * y[jj];
 								stat[thread_id1]->ops[SOLVE] += 2 * (iklrow - fnz);
-							
+
 						}
 					} /* for jj ... */
 				}
-				
+
 #if ( PROFlevel>=1 )
 				TOC(t2, t1);
 				stat[thread_id1]->utime[SOL_GEMM] += t2;
-#endif					
-				
+#endif
+
 
 		#ifdef _OPENMP
 		#pragma omp atomic capture
-		#endif		
+		#endif
 				bmod_tmp=--bmod[ik*aln_i];
-				
+
 				if ( bmod_tmp == 0 ) { /* Local accumulation done. */
 					gikcol = PCOL( gik, grid );
 					p = PNUM( myrow, gikcol, grid );
@@ -1520,16 +1519,16 @@ void dlsum_bmod_inv
 						for (ii=1;ii<num_thread;ii++)
 							// if(ii!=thread_id1)
 							#ifdef _OPENMP
-							#pragma omp simd							
-							#endif								
+							#pragma omp simd
+							#endif
 							for (jj=0;jj<iknsupc*nrhs;jj++)
 								lsum[il + jj ] += lsum[il + jj + ii*sizelsum];
-								
+
 #ifdef _OPENMP
 #pragma omp atomic capture
 #endif
 						nroot_send_tmp = ++nroot_send[0];
-						root_send[(nroot_send_tmp-1)*aln_i] = -ik-1;						
+						root_send[(nroot_send_tmp-1)*aln_i] = -ik-1;
 						// RdTree_forwardMessageSimple(URtree_ptr[ik],&lsum[il - LSUM_H ],'d');
 
 		#if ( DEBUGlevel>=2 )
@@ -1537,29 +1536,29 @@ void dlsum_bmod_inv
 								iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p);
 		#endif
 					} else { /* Diagonal process: X[i] += lsum[i]. */
-						
+
 #if ( PROFlevel>=1 )
 						TIC(t1);
-#endif								
-						
+#endif
+
 						for (ii=1;ii<num_thread;ii++)
 							// if(ii!=thread_id1)
 							#ifdef _OPENMP
-							#pragma omp simd							
-							#endif								
+							#pragma omp simd
+							#endif
 							for (jj=0;jj<iknsupc*nrhs;jj++)
 								lsum[il + jj ] += lsum[il + jj + ii*sizelsum];
 
 						ii = X_BLK( ik );
 						dest = &x[ii];
-								
+
 						RHS_ITERATE(j)
 							#ifdef _OPENMP
-							#pragma omp simd							
-							#endif							
+							#pragma omp simd
+							#endif
 							for (i = 0; i < iknsupc; ++i)
 								dest[i + j*iknsupc] += lsum[i + il + j*iknsupc];
-								
+
 						// if ( !brecv[ik] ) { /* Becomes a leaf node. */
 							// bmod[ik] = -1; /* Do not solve X[k] in the future. */
 							lk1 = LBj( gik, grid ); /* Local block number. */
@@ -1568,7 +1567,7 @@ void dlsum_bmod_inv
 							nsupr = lsub[1];
 
 							if(Llu->inv == 1){
-								Uinv = Llu->Uinv_bc_ptr[lk1];  
+								Uinv = Llu->Uinv_bc_ptr[lk1];
 		#ifdef _CRAY
 								SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc,
 										&alpha, Uinv, &iknsupc, &x[ii],
@@ -1583,20 +1582,20 @@ void dlsum_bmod_inv
 										&iknsupc, &beta, rtemp_loc, &iknsupc );
 		#endif
 								#ifdef _OPENMP
-								#pragma omp simd							
-								#endif			
+								#pragma omp simd
+								#endif
 								for (i=0 ; i<iknsupc*nrhs ; i++){
 									x[ii+i] = rtemp_loc[i];
-								}		
+								}
 							}else{
 		#ifdef _CRAY
 								STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &iknsupc, &nrhs, &alpha,
 										lusup, &nsupr, &x[ii], &iknsupc);
 		#elif defined (USE_VENDOR_BLAS)
-								dtrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha, 
-										lusup, &nsupr, &x[ii], &iknsupc, 1, 1, 1, 1);	
+								dtrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha,
+										lusup, &nsupr, &x[ii], &iknsupc, 1, 1, 1, 1);
 		#else
-								dtrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha, 
+								dtrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha,
 										lusup, &nsupr, &x[ii], &iknsupc);
 		#endif
 							}
@@ -1604,13 +1603,13 @@ void dlsum_bmod_inv
 								// printf("x_usum: %f\n",x[ii+i]);
 								// fflush(stdout);
 								// }
-					
+
 		#if ( PROFlevel>=1 )
 							TOC(t2, t1);
 							stat[thread_id1]->utime[SOL_TRSM] += t2;
-		#endif		
+		#endif
 							stat[thread_id1]->ops[SOLVE] += iknsupc * (iknsupc + 1) * nrhs;
-							
+
 		#if ( DEBUGlevel>=2 )
 							printf("(%2d) Solve X[%2d]\n", iam, gik);
 		#endif
@@ -1623,35 +1622,35 @@ void dlsum_bmod_inv
 								// printf("xre: %f\n",x[ii+i]);
 								// fflush(stdout);
 							// }
-							if(UBtree_ptr[lk1]!=NULL){							
+							if(UBtree_ptr[lk1]!=NULL){
 #ifdef _OPENMP
 #pragma omp atomic capture
 #endif
 							nroot_send_tmp = ++nroot_send[0];
-							root_send[(nroot_send_tmp-1)*aln_i] = lk1;						
-							// BcTree_forwardMessageSimple(UBtree_ptr[lk1],&x[ii - XK_H],'d'); 
-							} 
+							root_send[(nroot_send_tmp-1)*aln_i] = lk1;
+							// BcTree_forwardMessageSimple(UBtree_ptr[lk1],&x[ii - XK_H],'d');
+							}
 
 							/*
 							 * Perform local block modifications.
 							 */
 							if ( Urbs[lk1] ){
 								// #ifdef _OPENMP
-								// #pragma	omp	task firstprivate (Ucb_indptr,Ucb_valptr,Llu,sizelsum,ii,gik,x,rtemp,bmod,Urbs,Urbs2,lsum,stat,nrhs,grid,xsup) untied 
+								// #pragma	omp	task firstprivate (Ucb_indptr,Ucb_valptr,Llu,sizelsum,ii,gik,x,rtemp,bmod,Urbs,lsum,stat,nrhs,grid,xsup) untied
 								// #endif
 								{
-								dlsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs,Urbs2,
+								dlsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs,
 										Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
-										send_req, stat, root_send, nroot_send, sizelsum,sizertemp,thread_id1,num_thread);
+										stat, root_send, nroot_send, sizelsum,sizertemp,thread_id1,num_thread);
 								}
 							}
 						// } /* if brecv[ik] == 0 */
 					}
-				} /* if bmod[ik] == 0 */				
-			}				
+				} /* if bmod[ik] == 0 */
+			}
 		}
 
-	} else { 
+	} else {
 
 		rtemp_loc = &rtemp[sizertemp* thread_id];
 
@@ -1669,7 +1668,7 @@ void dlsum_bmod_inv
 
 #if ( PROFlevel>=1 )
 		TIC(t1);
-#endif					
+#endif
 			RHS_ITERATE(j) {
 				dest = &lsum[il + j*iknsupc+sizelsum*thread_id];
 				y = &xk[j*knsupc];
@@ -1679,10 +1678,10 @@ void dlsum_bmod_inv
 					if ( fnz < iklrow ) { /* Nonzero segment. */
 						/* AXPY */
 						#ifdef _OPENMP
-						#pragma omp simd							
-						#endif							
+						#pragma omp simd
+						#endif
 						for (irow = fnz; irow < iklrow; ++irow)
-						
+
 								dest[irow - ikfrow] -= uval[uptr++] * y[jj];
 								stat[thread_id]->ops[SOLVE] += 2 * (iklrow - fnz);
 					}
@@ -1692,11 +1691,11 @@ void dlsum_bmod_inv
 #if ( PROFlevel>=1 )
 		TOC(t2, t1);
 		stat[thread_id]->utime[SOL_GEMM] += t2;
-#endif				
-			
+#endif
+
 	#ifdef _OPENMP
 	#pragma omp atomic capture
-	#endif		
+	#endif
 			bmod_tmp=--bmod[ik*aln_i];
 
 			if ( bmod_tmp == 0 ) { /* Local accumulation done. */
@@ -1706,15 +1705,15 @@ void dlsum_bmod_inv
 					for (ii=1;ii<num_thread;ii++)
 						// if(ii!=thread_id)
 						#ifdef _OPENMP
-						#pragma omp simd							
-						#endif						
-						for (jj=0;jj<iknsupc*nrhs;jj++)		
+						#pragma omp simd
+						#endif
+						for (jj=0;jj<iknsupc*nrhs;jj++)
 							lsum[il + jj ] += lsum[il + jj + ii*sizelsum];
 #ifdef _OPENMP
 #pragma omp atomic capture
 #endif
 					nroot_send_tmp = ++nroot_send[0];
-					root_send[(nroot_send_tmp-1)*aln_i] = -ik-1;					
+					root_send[(nroot_send_tmp-1)*aln_i] = -ik-1;
 					// RdTree_forwardMessageSimple(URtree_ptr[ik],&lsum[il - LSUM_H ],'d');
 
 	#if ( DEBUGlevel>=2 )
@@ -1722,29 +1721,29 @@ void dlsum_bmod_inv
 							iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p);
 	#endif
 				} else { /* Diagonal process: X[i] += lsum[i]. */
-					
+
 #if ( PROFlevel>=1 )
 					TIC(t1);
-#endif							
-					
+#endif
+
 					for (ii=1;ii<num_thread;ii++)
 						// if(ii!=thread_id)
 						#ifdef _OPENMP
-						#pragma omp simd							
-						#endif						
+						#pragma omp simd
+						#endif
 						for (jj=0;jj<iknsupc*nrhs;jj++)
 								lsum[il + jj ] += lsum[il + jj + ii*sizelsum];
 
 					ii = X_BLK( ik );
 					dest = &x[ii];
-							
+
 					RHS_ITERATE(j)
 						#ifdef _OPENMP
-						#pragma omp simd							
-						#endif					
+						#pragma omp simd
+						#endif
 						for (i = 0; i < iknsupc; ++i)
 							dest[i + j*iknsupc] += lsum[i + il + j*iknsupc];
-					
+
 					// if ( !brecv[ik] ) { /* Becomes a leaf node. */
 						// bmod[ik] = -1; /* Do not solve X[k] in the future. */
 						lk1 = LBj( gik, grid ); /* Local block number. */
@@ -1753,7 +1752,7 @@ void dlsum_bmod_inv
 						nsupr = lsub[1];
 
 						if(Llu->inv == 1){
-							Uinv = Llu->Uinv_bc_ptr[lk1];  
+							Uinv = Llu->Uinv_bc_ptr[lk1];
 	#ifdef _CRAY
 							SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc,
 									&alpha, Uinv, &iknsupc, &x[ii],
@@ -1766,30 +1765,30 @@ void dlsum_bmod_inv
 							dgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc,
 									&alpha, Uinv, &iknsupc, &x[ii],
 									&iknsupc, &beta, rtemp_loc, &iknsupc );
-	#endif	
+	#endif
 							#ifdef _OPENMP
-							#pragma omp simd							
-							#endif	
+							#pragma omp simd
+							#endif
 							for (i=0 ; i<iknsupc*nrhs ; i++){
 								x[ii+i] = rtemp_loc[i];
-							}		
+							}
 						}else{
 	#ifdef _CRAY
 							STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &iknsupc, &nrhs, &alpha,
 									lusup, &nsupr, &x[ii], &iknsupc);
 	#elif defined (USE_VENDOR_BLAS)
-							dtrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha, 
-									lusup, &nsupr, &x[ii], &iknsupc, 1, 1, 1, 1);	
+							dtrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha,
+									lusup, &nsupr, &x[ii], &iknsupc, 1, 1, 1, 1);
 	#else
-							dtrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha, 
+							dtrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha,
 									lusup, &nsupr, &x[ii], &iknsupc);
 	#endif
 						}
-				
+
 	#if ( PROFlevel>=1 )
 						TOC(t2, t1);
 						stat[thread_id]->utime[SOL_TRSM] += t2;
-	#endif	
+	#endif
 						stat[thread_id]->ops[SOLVE] += iknsupc * (iknsupc + 1) * nrhs;
 	#if ( DEBUGlevel>=2 )
 						printf("(%2d) Solve X[%2d]\n", iam, gik);
@@ -1808,28 +1807,28 @@ void dlsum_bmod_inv
 #pragma omp atomic capture
 #endif
 						nroot_send_tmp = ++nroot_send[0];
-						root_send[(nroot_send_tmp-1)*aln_i] = lk1;						
-						// BcTree_forwardMessageSimple(UBtree_ptr[lk1],&x[ii - XK_H],'d'); 
-						} 
+						root_send[(nroot_send_tmp-1)*aln_i] = lk1;
+						// BcTree_forwardMessageSimple(UBtree_ptr[lk1],&x[ii - XK_H],'d');
+						}
 
 						/*
 						 * Perform local block modifications.
 						 */
 						if ( Urbs[lk1] )
-						
+
 							// if(Urbs[lk1]>16){
 							// #ifdef _OPENMP
-							// #pragma	omp	task firstprivate (Ucb_indptr,Ucb_valptr,Llu,sizelsum,ii,gik,x,rtemp,bmod,Urbs,Urbs2,lsum,stat,nrhs,grid,xsup) untied 
-							// #endif						
-							// 	dlsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs,Urbs2,
+							// #pragma	omp	task firstprivate (Ucb_indptr,Ucb_valptr,Llu,sizelsum,ii,gik,x,rtemp,bmod,Urbs,lsum,stat,nrhs,grid,xsup) untied
+							// #endif
+							// 	dlsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs,
 									//	Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
-									//	send_req, stat, root_send, nroot_send, sizelsum,sizertemp);
+									//	stat, root_send, nroot_send, sizelsum,sizertemp);
 							//}else{
-								dlsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs,Urbs2,
+								dlsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs,
 										Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
-										send_req, stat, root_send, nroot_send, sizelsum,sizertemp,thread_id,num_thread);					
-							//}		
-									
+										stat, root_send, nroot_send, sizelsum,sizertemp,thread_id,num_thread);
+							//}
+
 					// } /* if brecv[ik] == 0 */
 				}
 			} /* if bmod[ik] == 0 */
@@ -1853,13 +1852,11 @@ void dlsum_bmod_inv_master
  int_t  k,            /* The k-th component of X.                       */
  int_t  *bmod,        /* Modification count for L-solve.                */
  int_t  *Urbs,        /* Number of row blocks in each block column of U.*/
- int_t  *Urbs2,
  Ucb_indptr_t **Ucb_indptr,/* Vertical linked list pointing to Uindex[].*/
  int_t  **Ucb_valptr, /* Vertical linked list pointing to Unzval[].     */
  int_t  *xsup,
  gridinfo_t *grid,
  LocalLU_t *Llu,
- MPI_Request send_req[], /* input/output */
  SuperLUStat_t **stat,
  int_t sizelsum,
  int_t sizertemp,
@@ -1884,39 +1881,37 @@ void dlsum_bmod_inv_master
 	int_t  *brecv = Llu->brecv;
 	int_t  **bsendx_plist = Llu->bsendx_plist;
 	BcTree  *UBtree_ptr = Llu->UBtree_ptr;
-	RdTree  *URtree_ptr = Llu->URtree_ptr;	
+	RdTree  *URtree_ptr = Llu->URtree_ptr;
 	MPI_Status status;
 	int test_flag;
 	int_t bmod_tmp;
 	int thread_id1;
 	double *rtemp_loc;
-	double temp;	
-	double *Uinv;/* Inverse of diagonal block */    
+	double temp;
+	double *Uinv;/* Inverse of diagonal block */
 
 	double t1, t2;
 	float msg_vol = 0, msg_cnt = 0;
-	int_t Nchunk, nub_loc,remainder,nn,lbstart,lbend; 
-	int_t iword = sizeof(int_t);	
-	int_t dword = sizeof (double);		
+	int_t Nchunk, nub_loc,remainder,nn,lbstart,lbend;
+	int_t iword = sizeof(int_t);
+	int_t dword = sizeof (double);
 	int_t aln_d,aln_i;
 	aln_d = ceil(CACHELINE/(double)dword);
 	aln_i = ceil(CACHELINE/(double)iword);
-		
-	
+
+
 	rtemp_loc = &rtemp[sizertemp* thread_id];
-	
-	
+
+
 	iam = grid->iam;
 	myrow = MYROW( iam, grid );
 	knsupc = SuperSize( k );
 	lk = LBj( k, grid ); /* Local block number, column-wise. */
-	nub = Urbs[lk];      /* Number of U blocks in block column lk */	
+	nub = Urbs[lk];      /* Number of U blocks in block column lk */
 
-	
-	 
 	// printf("Urbs2[lk] %5d lk %5d nub %5d\n",Urbs2[lk],lk,nub);
 	// fflush(stdout);
-	
+
 	if(nub>num_thread){
 	// if(nub>0){
 		Nchunk=num_thread;
@@ -1924,28 +1919,28 @@ void dlsum_bmod_inv_master
 		remainder = nub % Nchunk;
 
 //#ifdef _OPENMP
-//#pragma	omp	taskloop firstprivate (send_req,stat) private (thread_id1,nn,lbstart,lbend,ub,temp,rtemp_loc,ik,gik,usub,uval,iknsupc,il,i,irow,jj,t1,t2,j,ikfrow,iklrow,dest,y,uptr,fnz) untied	
-//#endif	
+//#pragma	omp	taskloop firstprivate (stat) private (thread_id1,nn,lbstart,lbend,ub,temp,rtemp_loc,ik,gik,usub,uval,iknsupc,il,i,irow,jj,t1,t2,j,ikfrow,iklrow,dest,y,uptr,fnz) untied
+//#endif
 		for (nn=0;nn<Nchunk;++nn){
 
-#ifdef _OPENMP				 
+#ifdef _OPENMP
 			thread_id1 = omp_get_thread_num ();
 #else
 			thread_id1 = 0;
-#endif		
+#endif
 			rtemp_loc = &rtemp[sizertemp* thread_id1];
 
 #if ( PROFlevel>=1 )
 			TIC(t1);
-#endif				
-			
+#endif
+
 			if(nn<remainder){
 				lbstart = nn*(nub_loc+1);
 				lbend = (nn+1)*(nub_loc+1);
 			}else{
 				lbstart = remainder+nn*nub_loc;
 				lbend = remainder + (nn+1)*nub_loc;
-			}			
+			}
 			for (ub = lbstart; ub < lbend; ++ub){
 				ik = Ucb_indptr[lk][ub].lbnum; /* Local block number, row-wise. */
 				usub = Llu->Ufstnz_br_ptr[ik];
@@ -1956,8 +1951,8 @@ void dlsum_bmod_inv_master
 				gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */
 				iknsupc = SuperSize( gik );
 				ikfrow = FstBlockC( gik );
-				iklrow = FstBlockC( gik+1 );				
-				
+				iklrow = FstBlockC( gik+1 );
+
 				RHS_ITERATE(j) {
 					dest = &lsum[il + j*iknsupc+sizelsum*thread_id1];
 					y = &xk[j*knsupc];
@@ -1967,12 +1962,12 @@ void dlsum_bmod_inv_master
 						if ( fnz < iklrow ) { /* Nonzero segment. */
 							/* AXPY */
 							#ifdef _OPENMP
-							#pragma omp simd							
-							#endif							
+							#pragma omp simd
+							#endif
 							for (irow = fnz; irow < iklrow; ++irow)
 								dest[irow - ikfrow] -= uval[uptr++] * y[jj];
 							stat[thread_id1]->ops[SOLVE] += 2 * (iklrow - fnz);
-							
+
 						}
 					} /* for jj ... */
 				}
@@ -1980,14 +1975,14 @@ void dlsum_bmod_inv_master
 #if ( PROFlevel>=1 )
 			TOC(t2, t1);
 			stat[thread_id1]->utime[SOL_GEMM] += t2;
-#endif	
+#endif
 		}
-				
-	}else{	
+
+	}else{
 		rtemp_loc = &rtemp[sizertemp* thread_id];
 #if ( PROFlevel>=1 )
 		TIC(t1);
-#endif	
+#endif
 		for (ub = 0; ub < nub; ++ub) {
 			ik = Ucb_indptr[lk][ub].lbnum; /* Local block number, row-wise. */
 			usub = Llu->Ufstnz_br_ptr[ik];
@@ -1999,7 +1994,7 @@ void dlsum_bmod_inv_master
 			iknsupc = SuperSize( gik );
 			ikfrow = FstBlockC( gik );
 			iklrow = FstBlockC( gik+1 );
-				
+
 			RHS_ITERATE(j) {
 				dest = &lsum[il + j*iknsupc+sizelsum*thread_id];
 				y = &xk[j*knsupc];
@@ -2009,24 +2004,24 @@ void dlsum_bmod_inv_master
 					if ( fnz < iklrow ) { /* Nonzero segment. */
 						/* AXPY */
 						#ifdef _OPENMP
-						#pragma omp simd							
-						#endif						
+						#pragma omp simd
+						#endif
 						for (irow = fnz; irow < iklrow; ++irow)
 							dest[irow - ikfrow] -= uval[uptr++] * y[jj];
 						stat[thread_id]->ops[SOLVE] += 2 * (iklrow - fnz);
-						
+
 					}
 				} /* for jj ... */
-			}			
-		}	
+			}
+		}
 #if ( PROFlevel>=1 )
 		TOC(t2, t1);
 		stat[thread_id]->utime[SOL_GEMM] += t2;
-#endif				
+#endif
 	}
 
-	
-	rtemp_loc = &rtemp[sizertemp* thread_id];	
+
+	rtemp_loc = &rtemp[sizertemp* thread_id];
 	for (ub = 0; ub < nub; ++ub){
 		ik = Ucb_indptr[lk][ub].lbnum; /* Local block number, row-wise. */
 		il = LSUM_BLK( ik );
@@ -2035,9 +2030,9 @@ void dlsum_bmod_inv_master
 
 	// #ifdef _OPENMP
 	// #pragma omp atomic capture
-	// #endif		
+	// #endif
 		bmod_tmp=--bmod[ik*aln_i];
-		
+
 		if ( bmod_tmp == 0 ) { /* Local accumulation done. */
 			gikcol = PCOL( gik, grid );
 			p = PNUM( myrow, gikcol, grid );
@@ -2045,8 +2040,8 @@ void dlsum_bmod_inv_master
 				for (ii=1;ii<num_thread;ii++)
 					// if(ii!=thread_id)
 					#ifdef _OPENMP
-					#pragma omp simd							
-					#endif					
+					#pragma omp simd
+					#endif
 					for (jj=0;jj<iknsupc*nrhs;jj++)
 						lsum[il + jj ] += lsum[il + jj + ii*sizelsum];
 				RdTree_forwardMessageSimple(URtree_ptr[ik],&lsum[il - LSUM_H ],RdTree_GetMsgSize(URtree_ptr[ik],'d')*nrhs+LSUM_H,'d');
@@ -2056,28 +2051,28 @@ void dlsum_bmod_inv_master
 						iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p);
 #endif
 			} else { /* Diagonal process: X[i] += lsum[i]. */
-				
+
 #if ( PROFlevel>=1 )
 				TIC(t1);
-#endif								
+#endif
 				for (ii=1;ii<num_thread;ii++)
 					// if(ii!=thread_id)
 					#ifdef _OPENMP
-					#pragma omp simd							
-					#endif							
+					#pragma omp simd
+					#endif
 					for (jj=0;jj<iknsupc*nrhs;jj++)
 						lsum[il + jj ] += lsum[il + jj + ii*sizelsum];
 
 				ii = X_BLK( ik );
 				dest = &x[ii];
-						
+
 				RHS_ITERATE(j)
 					#ifdef _OPENMP
-					#pragma omp simd							
-					#endif						
+					#pragma omp simd
+					#endif
 					for (i = 0; i < iknsupc; ++i)
 						dest[i + j*iknsupc] += lsum[i + il + j*iknsupc];
-						
+
 				// if ( !brecv[ik] ) { /* Becomes a leaf node. */
 					// bmod[ik] = -1; /* Do not solve X[k] in the future. */
 					lk1 = LBj( gik, grid ); /* Local block number. */
@@ -2086,7 +2081,7 @@ void dlsum_bmod_inv_master
 					nsupr = lsub[1];
 
 					if(Llu->inv == 1){
-						Uinv = Llu->Uinv_bc_ptr[lk1];  
+						Uinv = Llu->Uinv_bc_ptr[lk1];
 #ifdef _CRAY
 						SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc,
 								&alpha, Uinv, &iknsupc, &x[ii],
@@ -2099,30 +2094,30 @@ void dlsum_bmod_inv_master
 						dgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc,
 								&alpha, Uinv, &iknsupc, &x[ii],
 								&iknsupc, &beta, rtemp_loc, &iknsupc );
-#endif	
+#endif
 						#ifdef _OPENMP
-						#pragma omp simd							
-						#endif		   
+						#pragma omp simd
+						#endif
 						for (i=0 ; i<iknsupc*nrhs ; i++){
 							x[ii+i] = rtemp_loc[i];
-						}		
+						}
 					}else{
 #ifdef _CRAY
 						STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &iknsupc, &nrhs, &alpha,
 								lusup, &nsupr, &x[ii], &iknsupc);
 #elif defined (USE_VENDOR_BLAS)
-						dtrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha, 
-								lusup, &nsupr, &x[ii], &iknsupc, 1, 1, 1, 1);	
+						dtrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha,
+								lusup, &nsupr, &x[ii], &iknsupc, 1, 1, 1, 1);
 #else
-						dtrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha, 
+						dtrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha,
 								lusup, &nsupr, &x[ii], &iknsupc);
 #endif
 					}
-			
+
 #if ( PROFlevel>=1 )
 					TOC(t2, t1);
 					stat[thread_id]->utime[SOL_TRSM] += t2;
-#endif					
+#endif
 					stat[thread_id]->ops[SOLVE] += iknsupc * (iknsupc + 1) * nrhs;
 #if ( DEBUGlevel>=2 )
 					printf("(%2d) Solve X[%2d]\n", iam, gik);
@@ -2137,25 +2132,25 @@ void dlsum_bmod_inv_master
 						// fflush(stdout);
 					// }
 					if(UBtree_ptr[lk1]!=NULL){
-					BcTree_forwardMessageSimple(UBtree_ptr[lk1],&x[ii - XK_H],BcTree_GetMsgSize(UBtree_ptr[lk1],'d')*nrhs+XK_H,'d'); 
-					} 
+					BcTree_forwardMessageSimple(UBtree_ptr[lk1],&x[ii - XK_H],BcTree_GetMsgSize(UBtree_ptr[lk1],'d')*nrhs+XK_H,'d');
+					}
 
 					/*
 					 * Perform local block modifications.
 					 */
 					if ( Urbs[lk1] ){
 						// #ifdef _OPENMP
-						// #pragma	omp	task firstprivate (Ucb_indptr,Ucb_valptr,Llu,sizelsum,ii,gik,x,rtemp,bmod,Urbs,Urbs2,lsum,stat,nrhs,grid,xsup) untied 
+						// #pragma	omp	task firstprivate (Ucb_indptr,Ucb_valptr,Llu,sizelsum,ii,gik,x,rtemp,bmod,Urbs,lsum,stat,nrhs,grid,xsup) untied
 						// #endif
 						{
-						dlsum_bmod_inv_master(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs,Urbs2,
+						dlsum_bmod_inv_master(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs,
 								Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
-								send_req, stat, sizelsum,sizertemp,thread_id,num_thread);
+								stat, sizelsum,sizertemp,thread_id,num_thread);
 						}
 					}
 				// } /* if brecv[ik] == 0 */
 			}
-		} /* if bmod[ik] == 0 */		
-	}	
-	
+		} /* if bmod[ik] == 0 */
+	}
+
 } /* dlsum_bmod_inv_master */
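
Throughout the dlsum_* kernels above the surrounding logic is unchanged (the hunks strip trailing whitespace and drop the Urbs2 and send_req arguments); the recurring pattern is that, once a block's local accumulation finishes, the owning thread reserves a slot in a shared send list (leaf_send, root_send) with an OpenMP atomic capture and writes the encoded block number into that slot, strided by aln_i so neighbouring entries sit on separate cache lines. A minimal, self-contained sketch of that reservation pattern follows; the list name, stride and loop bounds are illustrative, not SuperLU_DIST's.

#include <stdio.h>
#include <stdlib.h>

#define ALN 8   /* illustrative cache-line stride, standing in for aln_i */

int main(void)
{
    long counter = 0;                              /* shared slot counter, like nleaf_send[0] */
    long *list = calloc(100 * ALN, sizeof(long));  /* shared list, like leaf_send[]           */

#ifdef _OPENMP
#pragma omp parallel for
#endif
    for (int lk = 0; lk < 100; ++lk) {
        long tmp;
#ifdef _OPENMP
#pragma omp atomic capture
#endif
        tmp = ++counter;                  /* atomically reserve the next free slot       */
        list[(tmp - 1) * ALN] = -lk - 1;  /* encode the block number, as the solver does */
    }

    printf("appended %ld entries\n", counter);
    free(list);
    return 0;
}
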
diff -pruN 6.1.0+dfsg1-1/SRC/pdlangs.c 6.1.1+dfsg1-1/SRC/pdlangs.c
--- 6.1.0+dfsg1-1/SRC/pdlangs.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/pdlangs.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,16 +1,16 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
 
-/*! @file 
+/*! @file
  * \brief Returns the value of the one norm, or the Frobenius norm, or the infinity norm, or the element of largest value
  *
  * <pre>
@@ -23,47 +23,47 @@ at the top-level directory.
 
 /*! \brief
 
-<pre> 
-    Purpose   
-    =======   
-
-    PDLANGS returns the value of the one norm, or the Frobenius norm, or 
-    the infinity norm, or the element of largest absolute value of a 
-    real matrix A.   
-
-    Description   
-    ===========   
-
-    PDLANGE returns the value   
-
-       PDLANGE = ( max(abs(A(i,j))), NORM = 'M' or 'm'   
-                 (   
-                 ( norm1(A),         NORM = '1', 'O' or 'o'   
-                 (   
-                 ( normI(A),         NORM = 'I' or 'i'   
-                 (   
-                 ( normF(A),         NORM = 'F', 'f', 'E' or 'e'   
-
-    where  norm1  denotes the  one norm of a matrix (maximum column sum), 
-    normI  denotes the  infinity norm  of a matrix  (maximum row sum) and 
-    normF  denotes the  Frobenius norm of a matrix (square root of sum of 
-    squares).  Note that  max(abs(A(i,j)))  is not a  matrix norm.   
+<pre>
+    Purpose
+    =======
+
+    PDLANGS returns the value of the one norm, or the Frobenius norm, or
+    the infinity norm, or the element of largest absolute value of a
+    real matrix A.
+
+    Description
+    ===========
+
+    PDLANGE returns the value
+
+       PDLANGE = ( max(abs(A(i,j))), NORM = 'M' or 'm'
+                 (
+                 ( norm1(A),         NORM = '1', 'O' or 'o'
+                 (
+                 ( normI(A),         NORM = 'I' or 'i'
+                 (
+                 ( normF(A),         NORM = 'F', 'f', 'E' or 'e'
+
+    where  norm1  denotes the  one norm of a matrix (maximum column sum),
+    normI  denotes the  infinity norm  of a matrix  (maximum row sum) and
+    normF  denotes the  Frobenius norm of a matrix (square root of sum of
+    squares).  Note that  max(abs(A(i,j)))  is not a  matrix norm.
 
-    Arguments   
-    =========   
+    Arguments
+    =========
 
-    NORM    (input) CHARACTER*1   
-            Specifies the value to be returned in DLANGE as described above.   
+    NORM    (input) CHARACTER*1
+            Specifies the value to be returned in DLANGE as described above.
     A       (input) SuperMatrix*
-            The M by N sparse matrix A. 
+            The M by N sparse matrix A.
     GRID    (input) gridinfo_t*
             The 2D process mesh.
-   ===================================================================== 
+   =====================================================================
 </pre>
 */
 
 double pdlangs(char *norm, SuperMatrix *A, gridinfo_t *grid)
-{   
+{
     /* Local variables */
     NRformat_loc *Astore;
     int_t    m_loc;
@@ -77,7 +77,7 @@ double pdlangs(char *norm, SuperMatrix *
     Astore = (NRformat_loc *) A->Store;
     m_loc = Astore->m_loc;
     Aval   = (double *) Astore->nzval;
-    
+
     if ( SUPERLU_MIN(A->nrow, A->ncol) == 0) {
 	value = 0.;
     } else if ( strncmp(norm, "M", 1)==0 ) {
@@ -97,7 +97,7 @@ double pdlangs(char *norm, SuperMatrix *
 #if 0
 	for (j = 0; j < A->ncol; ++j) {
 	    sum = 0.;
-	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) 
+	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++)
 		sum += fabs(Aval[i]);
 	    value = SUPERLU_MAX(value,sum);
 	}
@@ -120,7 +120,7 @@ double pdlangs(char *norm, SuperMatrix *
 	}
 	SUPERLU_FREE (temprwork);
 	SUPERLU_FREE (rwork);
-#endif	
+#endif
     } else if ( strncmp(norm, "I", 1)==0 ) {
 	/* Find normI(A). */
 	value = 0.;
@@ -139,7 +139,7 @@ double pdlangs(char *norm, SuperMatrix *
     } else {
 	ABORT("Illegal norm specified.");
     }
-    
+
     return (value);
 
 } /* pdlangs */
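
The pdlangs header above defines the four norms the routine can return for the distributed matrix. One way to realize the NORM = 'M' case it describes is embarrassingly parallel: each rank takes the maximum absolute value over the nonzeros it owns, and a single max-reduction combines the results across the process grid. The sketch below illustrates that idea only; the local-slice arguments are hypothetical stand-ins, not the NRformat_loc fields, and it is not pdlangs itself.

#include <math.h>
#include <stdio.h>
#include <mpi.h>

/* Grid-wide max |a_ij| over locally owned nonzeros (illustrative). */
static double max_abs_norm_dist(const double *aval, long nnz_loc, MPI_Comm comm)
{
    double value = 0.0, value_glob = 0.0;
    for (long i = 0; i < nnz_loc; ++i)
        value = fmax(value, fabs(aval[i]));
    MPI_Allreduce(&value, &value_glob, 1, MPI_DOUBLE, MPI_MAX, comm);
    return value_glob;
}

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* Each rank pretends to own a different slice of nonzeros. */
    double local[3] = { 1.0 + rank, -2.5 * rank, 0.5 };
    double norm = max_abs_norm_dist(local, 3, MPI_COMM_WORLD);

    if (rank == 0) printf("max |a_ij| across the grid: %g\n", norm);
    MPI_Finalize();
    return 0;
}

The one-norm and infinity-norm cases would follow the same shape, except that the per-column (or per-row) partial sums have to be added across ranks before the maximum is taken.
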
diff -pruN 6.1.0+dfsg1-1/SRC/pdlaqgs.c 6.1.1+dfsg1-1/SRC/pdlaqgs.c
--- 6.1.0+dfsg1-1/SRC/pdlaqgs.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/pdlaqgs.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,16 +1,16 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
 
-/*! @file 
+/*! @file
  * \brief Equilibrates a general sparse M by N matrix
  *
  * <pre>
@@ -24,70 +24,70 @@ at the top-level directory.
 /*! \brief
 
 <pre>
-    Purpose   
-    =======   
+    Purpose
+    =======
 
     PDLAQGS equilibrates a general sparse M by N matrix A using the row
-    and column scaling factors in the vectors R and C.   
+    and column scaling factors in the vectors R and C.
 
     See supermatrix.h for the definition of 'SuperMatrix' structure.
 
-    Arguments   
-    =========   
+    Arguments
+    =========
 
     A       (input/output) SuperMatrix*
-            On exit, the equilibrated matrix.  See EQUED for the form of 
+            On exit, the equilibrated matrix.  See EQUED for the form of
             the equilibrated matrix. The type of A can be:
 	    Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE.
-	    
+
     R       (input) double*, dimension (A->nrow)
             The row scale factors for A.
-	    
+
     C       (input) double*, dimension (A->ncol)
             The column scale factors for A.
-	    
+
     ROWCND  (input) double
             Ratio of the smallest R(i) to the largest R(i).
-	    
+
     COLCND  (input) double
             Ratio of the smallest C(i) to the largest C(i).
-	    
+
     AMAX    (input) double
             Absolute value of largest matrix entry.
-	    
+
     EQUED   (output) char*
-            Specifies the form of equilibration that was done.   
-            = 'N':  No equilibration   
-            = 'R':  Row equilibration, i.e., A has been premultiplied by  
-                    diag(R).   
-            = 'C':  Column equilibration, i.e., A has been postmultiplied  
-                    by diag(C).   
+            Specifies the form of equilibration that was done.
+            = 'N':  No equilibration
+            = 'R':  Row equilibration, i.e., A has been premultiplied by
+                    diag(R).
+            = 'C':  Column equilibration, i.e., A has been postmultiplied
+                    by diag(C).
             = 'B':  Both row and column equilibration, i.e., A has been
-                    replaced by diag(R) * A * diag(C).   
+                    replaced by diag(R) * A * diag(C).
 
-    Internal Parameters   
-    ===================   
+    Internal Parameters
+    ===================
 
-    THRESH is a threshold value used to decide if row or column scaling   
-    should be done based on the ratio of the row or column scaling   
-    factors.  If ROWCND < THRESH, row scaling is done, and if   
-    COLCND < THRESH, column scaling is done.   
+    THRESH is a threshold value used to decide if row or column scaling
+    should be done based on the ratio of the row or column scaling
+    factors.  If ROWCND < THRESH, row scaling is done, and if
+    COLCND < THRESH, column scaling is done.
 
-    LARGE and SMALL are threshold values used to decide if row scaling   
-    should be done based on the absolute size of the largest matrix   
-    element.  If AMAX > LARGE or AMAX < SMALL, row scaling is done.   
+    LARGE and SMALL are threshold values used to decide if row scaling
+    should be done based on the absolute size of the largest matrix
+    element.  If AMAX > LARGE or AMAX < SMALL, row scaling is done.
 
-    ===================================================================== 
+    =====================================================================
 </pre>
 */
 
 void
-pdlaqgs(SuperMatrix *A, double *r, double *c, 
+pdlaqgs(SuperMatrix *A, double *r, double *c,
        double rowcnd, double colcnd, double amax, char *equed)
 {
 
 #define THRESH    (0.1)
-    
+
     /* Local variables */
     NRformat_loc *Astore;
     double *Aval;
@@ -103,7 +103,7 @@ pdlaqgs(SuperMatrix *A, double *r, doubl
     Astore = A->Store;
     Aval = Astore->nzval;
     m_loc = Astore->m_loc;
-    
+
     /* Initialize LARGE and SMALL. */
     small = dmach_dist("Safe minimum") / dmach_dist("Precision");
     large = 1. / small;
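
The pdlaqgs header spells out when scaling is applied: rows are scaled when ROWCND < THRESH or when AMAX falls outside [SMALL, LARGE], columns when COLCND < THRESH, and EQUED records which of the four outcomes occurred. A compact sketch of just that decision, as the header describes it, is given below; choose_equed is an illustrative helper, not part of the library, and the actual rescaling of the local nzval entries (A := diag(R) * A * diag(C)) is omitted.

#include <stdio.h>

#define THRESH 0.1   /* same threshold value the routine uses */

static char choose_equed(double rowcnd, double colcnd, double amax,
                         double small, double large)
{
    int scale_rows = (rowcnd < THRESH) || (amax < small) || (amax > large);
    int scale_cols = (colcnd < THRESH);

    if (scale_rows && scale_cols) return 'B';  /* diag(R) * A * diag(C) */
    if (scale_rows)               return 'R';  /* diag(R) * A           */
    if (scale_cols)               return 'C';  /* A * diag(C)           */
    return 'N';                                /* no equilibration      */
}

int main(void)
{
    /* Badly scaled rows, acceptable columns: expect 'R'. */
    printf("EQUED = %c\n", choose_equed(0.01, 0.5, 1.0, 1e-300, 1e300));
    return 0;
}
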
diff -pruN 6.1.0+dfsg1-1/SRC/pdsymbfact_distdata.c 6.1.1+dfsg1-1/SRC/pdsymbfact_distdata.c
--- 6.1.0+dfsg1-1/SRC/pdsymbfact_distdata.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/pdsymbfact_distdata.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,21 +1,21 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
 
-/*! @file 
+/*! @file
  * \brief Redistribute the symbolic structure of L and U from the distribution
  *
  * <pre>
  * -- Parallel symbolic factorization auxiliary routine (version 2.3) --
- * -- Distributes the data from parallel symbolic factorization 
+ * -- Distributes the data from parallel symbolic factorization
  * -- to numeric factorization
  * INRIA France -  July 1, 2004
  * Laura Grigori
@@ -38,12 +38,12 @@ at the top-level directory.
  * <pre>
  * Purpose
  * =======
- * 
+ *
  * Redistribute the symbolic structure of L and U from the distribution
  * used in the parallel symbolic factorization step to the distribution
  * used in the parallel numeric factorization step.  On exit, the L and U
  * structure for the 2D distribution used in the numeric factorization step is
- * stored in p_xlsub, p_lsub, p_xusub, p_usub.  The global supernodal 
+ * stored in p_xlsub, p_lsub, p_xusub, p_usub.  The global supernodal
  * information is also computed and it is stored in Glu_persist->supno
  * and Glu_persist->xsup.
  *
@@ -52,11 +52,11 @@ at the top-level directory.
  * p_xlsub, p_lsub, p_xusub, p_usub,
  * Glu_persist->supno,  Glu_persist->xsup.
  *
- * This routine also deallocates memory allocated during symbolic 
+ * This routine also deallocates memory allocated during symbolic
  * factorization routine.  That is, the following arrays are freed:
- * Pslu_freeable->xlsub,  Pslu_freeable->lsub, 
- * Pslu_freeable->xusub, Pslu_freeable->usub, 
- * Pslu_freeable->globToLoc, Pslu_freeable->supno_loc, 
+ * Pslu_freeable->xlsub,  Pslu_freeable->lsub,
+ * Pslu_freeable->xusub, Pslu_freeable->usub,
+ * Pslu_freeable->globToLoc, Pslu_freeable->supno_loc,
  * Pslu_freeable->xsup_beg_loc, Pslu_freeable->xsup_end_loc.
  *
  * Arguments
@@ -65,28 +65,28 @@ at the top-level directory.
  * n      (Input) int_t
  *        Order of the input matrix
  * Pslu_freeable  (Input) Pslu_freeable_t *
- *        Local L and U structure, 
+ *        Local L and U structure,
  *        global to local indexing information.
- * 
+ *
  * Glu_persist (Output) Glu_persist_t *
  *        Stores on output the information on supernodes mapping.
- * 
+ *
  * p_xlsub (Output) int_t **
- *         Pointer to structure of L distributed on a 2D grid 
+ *         Pointer to structure of L distributed on a 2D grid
  *         of processors, stored by columns.
- * 
+ *
  * p_lsub  (Output) int_t **
- *         Structure of L distributed on a 2D grid of processors, 
+ *         Structure of L distributed on a 2D grid of processors,
  *         stored by columns.
  *
  * p_xusub (Output) int_t **
- *         Pointer to structure of U distributed on a 2D grid 
+ *         Pointer to structure of U distributed on a 2D grid
  *         of processors, stored by rows.
- * 
+ *
  * p_usub  (Output) int_t **
- *         Structure of U distributed on a 2D grid of processors, 
+ *         Structure of U distributed on a 2D grid of processors,
  *         stored by rows.
- * 
+ *
  * grid   (Input) gridinfo_t*
  *        The 2D process mesh.
  *
@@ -99,14 +99,14 @@ at the top-level directory.
  */
 
 static float
-dist_symbLU (int_t n, Pslu_freeable_t *Pslu_freeable, 
-	     Glu_persist_t *Glu_persist, 
+dist_symbLU (int_t n, Pslu_freeable_t *Pslu_freeable,
+	     Glu_persist_t *Glu_persist,
 	     int_t **p_xlsub, int_t **p_lsub, int_t **p_xusub, int_t **p_usub,
 	     gridinfo_t *grid
 	     )
 {
   int   iam, nprocs, pc, pr, p, np, p_diag;
-  int_t *nnzToSend, *nnzToRecv, *nnzToSend_l, *nnzToSend_u, 
+  int_t *nnzToSend, *nnzToRecv, *nnzToSend_l, *nnzToSend_u,
     *tmp_ptrToSend, *mem;
   int_t *nnzToRecv_l, *nnzToRecv_u;
   int_t *send_1, *send_2, nsend_1, nsend_2;
@@ -126,7 +126,7 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
   float memAux;  /* Memory used during this routine and freed on return */
   float memRet; /* Memory allocated and not freed on return */
   int_t iword, dword;
-  
+
   /* ------------------------------------------------------------
      INITIALIZATION.
      ------------------------------------------------------------*/
@@ -147,7 +147,7 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
   iword = sizeof(int_t);
   dword = sizeof(double);
   memAux = 0.; memRet = 0.;
-  
+
   mem           = intCalloc_dist(12 * nprocs);
   if (!mem)
     return (ERROR_RET);
@@ -161,7 +161,7 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
   tmp_ptrToSend = send_2 + nprocs;
   nnzToRecv_l   = tmp_ptrToSend + nprocs;
   nnzToRecv_u   = nnzToRecv_l + nprocs;
-  
+
   ptrToSend = nnzToSend;
   ptrToRecv = nnzToSend + nprocs;
 
@@ -173,7 +173,7 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
   memAux += 5 * nprocs * sizeof(int);
 
   maxszsn   = sp_ienv_dist(3);
-  
+
   /* Allocate space for storing Glu_persist_n. */
   if ( !(supno_n = intMalloc_dist(n+1)) ) {
     fprintf (stderr, "Malloc fails for supno_n[].");
@@ -184,7 +184,7 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
   /* ------------------------------------------------------------
      DETERMINE SUPERNODES FOR NUMERICAL FACTORIZATION
      ------------------------------------------------------------*/
-  
+
   if (nvtcs_loc > INT_MAX)
     ABORT("ERROR in dist_symbLU nvtcs_loc > INT_MAX\n");
   intNvtcs_loc = (int) nvtcs_loc;
@@ -199,7 +199,7 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
       k += nvtcs[p];
     }
   }
-  
+
   if (nprocs > 1) {
     temp = NULL;
     if (!iam ) {
@@ -218,7 +218,7 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
 #else  /* Default */
     intBuf1 = ptrToRecv;
 #endif
-    MPI_Gatherv (supno_s, (int) nvtcs_loc, mpi_int_t, 
+    MPI_Gatherv (supno_s, (int) nvtcs_loc, mpi_int_t,
 		 temp, nvtcs, intBuf1, mpi_int_t, 0, grid->comm);
   }
   else
@@ -255,7 +255,7 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
   /* reset to 0 nnzToSend */
   for (p = 0; p < 2 *nprocs; p++)
     nnzToSend[p] = 0;
-  
+
   MPI_Bcast (supno_n, n+1, mpi_int_t, 0, grid->comm);
   nsupers = supno_n[n];
   /* Allocate space for storing Glu_persist_n. */
@@ -263,7 +263,7 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
     fprintf (stderr, "Malloc fails for xsup_n[].");
     return (memAux + memRet);
   }
-  memRet += (float) (nsupers+1) * iword;  
+  memRet += (float) (nsupers+1) * iword;
 
   /* ------------------------------------------------------------
      COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS,
@@ -279,7 +279,7 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
     }
   }
   xsup_n[nsupers] = n;
-  
+
   for (p = 0; p < nprocs; p++) {
     send_1[p] = FALSE;
     send_2[p] = FALSE;
@@ -290,7 +290,7 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
       pc = PCOL( gb_n, grid );
       pr = PROW( gb_n, grid );
       p_diag = PNUM( pr, pc, grid);
-      
+
       i_loc = LOCAL_IND( globToLoc[i] );
       gb_s  = supno_s[i_loc];
       fst_s = xsup_beg_s[gb_s];
@@ -310,17 +310,17 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
 	if (k >= i + xsup_n[gb_n+1] - xsup_n[gb_n]) {
 	  gb = supno_n[k];
 	  p = PNUM( pr, PCOL(gb, grid), grid);
-	  nnzToSend[2*p+1] ++;	
+	  nnzToSend[2*p+1] ++;
 	  send_2[p] = TRUE;
 	}
       }
-      
+
       nsend_2 = 0;
       for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) {
 	nnzToSend[2*p+1] += 2;
-	if (send_2[p])  nsend_2 ++;	  
+	if (send_2[p])  nsend_2 ++;
       }
-      for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) 
+      for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++)
 	if (send_2[p] || p == p_diag) {
 	  if (p == p_diag && !send_2[p])
 	    nnzToSend[2*p+1] += nsend_2;
@@ -333,7 +333,7 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
 	nnzToSend[2*p] += 2;
 	if (send_1[p]) nsend_1 ++;
       }
-      for (p = pc; p < nprocs; p += grid->npcol) 
+      for (p = pc; p < nprocs; p += grid->npcol)
 	if (send_1[p]) {
 	  nnzToSend[2*p] += nsend_1-1;
 	  send_1[p] = FALSE;
@@ -342,28 +342,28 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
 	  nnzToSend[2*p] += nsend_1;
     }
   }
-  
+
   /* All-to-all communication */
   MPI_Alltoall( nnzToSend, 2, mpi_int_t, nnzToRecv, 2, mpi_int_t,
 		grid->comm);
-  
+
   nnz_loc_l = nnz_loc_u = 0;
-  SendCnt_l = SendCnt_u = RecvCnt_l = RecvCnt_u = 0;  
+  SendCnt_l = SendCnt_u = RecvCnt_l = RecvCnt_u = 0;
   for (p = 0; p < nprocs; p++) {
     if ( p != iam ) {
       SendCnt_l += nnzToSend[2*p];   nnzToSend_l[p] = nnzToSend[2*p];
-      SendCnt_u += nnzToSend[2*p+1]; nnzToSend_u[p] = nnzToSend[2*p+1]; 
+      SendCnt_u += nnzToSend[2*p+1]; nnzToSend_u[p] = nnzToSend[2*p+1];
       RecvCnt_l += nnzToRecv[2*p];   nnzToRecv_l[p] = nnzToRecv[2*p];
       RecvCnt_u += nnzToRecv[2*p+1]; nnzToRecv_u[p] = nnzToRecv[2*p+1];
     } else {
       nnz_loc_l += nnzToRecv[2*p];
       nnz_loc_u += nnzToRecv[2*p+1];
       nnzToSend_l[p] = 0; nnzToSend_u[p] = 0;
-      nnzToRecv_l[p] = nnzToRecv[2*p]; 
+      nnzToRecv_l[p] = nnzToRecv[2*p];
       nnzToRecv_u[p] = nnzToRecv[2*p+1];
     }
   }
-  
+
   /* Allocate space for storing the symbolic structure after redistribution. */
   nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
   nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */
@@ -377,16 +377,16 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
     fprintf (stderr, "Malloc fails for xusub_n[].");
     return (memAux + memRet);
   }
-  memRet += (float) (nsupers_i+1) * iword;  
+  memRet += (float) (nsupers_i+1) * iword;
 
   /* Allocate temp storage for sending/receiving the L/U symbolic structure. */
   if ( (RecvCnt_l + nnz_loc_l) || (RecvCnt_u + nnz_loc_u) ) {
-    if (!(rcv_luind = 
+    if (!(rcv_luind =
 	  intMalloc_dist(SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u))) ) {
       fprintf (stderr, "Malloc fails for rcv_luind[].");
       return (memAux + memRet);
     }
-    memAux += (float) SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u) 
+    memAux += (float) SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u)
       * iword;
   }
   if ( nprocs > 1 && (SendCnt_l || SendCnt_u) ) {
@@ -395,8 +395,8 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
       return (memAux + memRet);
     }
     memAux += (float) SUPERLU_MAX(SendCnt_l, SendCnt_u) * iword;
-  } 
-  
+  }
+
   /* ------------------------------------------------------------------
      LOAD THE SYMBOLIC STRUCTURE OF L AND U INTO THE STRUCTURES TO SEND.
      THIS ACCOUNTS FOR THE SECOND PASS OF L and U.
@@ -419,16 +419,16 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
       ptrToRecv[p] = j;  j += nnzToRecv[p];
     }
     nnzToRecv[iam] = 0;
-    
+
     ind_loc = ptrToRecv[iam];
     for (gb_n = 0; gb_n < nsupers; gb_n++) {
-      nsend_2 = 0;    
+      nsend_2 = 0;
       i = xsup_n[gb_n];
       if (iam == OWNER( globToLoc[i] )) {
 	pc = PCOL( gb_n, grid );
 	pr = PROW( gb_n, grid );
 	p_diag = PNUM( pr, pc, grid );
-	
+
 	i_loc = LOCAL_IND( globToLoc[i] );
 	gb_s  = supno_s[i_loc];
 	fst_s = xsup_beg_s[gb_s];
@@ -436,7 +436,7 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
 	fst_s_l = LOCAL_IND( globToLoc[fst_s] );
 
 	if (sendL) {
-	  p = pc;                np = grid->nprow;	  
+	  p = pc;                np = grid->nprow;
 	} else {
 	  p = pr * grid->npcol;  np = grid->npcol;
 	}
@@ -445,13 +445,13 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
 	    rcv_luind[ind_loc] = gb_n;
 	    rcv_luind[ind_loc+1] = 0;
 	    tmp_ptrToSend[p] = ind_loc + 1;
-	    ind_loc += 2;	 
+	    ind_loc += 2;
 	  }
 	  else {
 	    snd_luind[ptrToSend[p]] = gb_n;
 	    snd_luind[ptrToSend[p]+1] = 0;
 	    tmp_ptrToSend[p] = ptrToSend[p] + 1;
-	    ptrToSend[p] += 2;	 
+	    ptrToSend[p] += 2;
 	  }
 	  if (sendL) p += grid->npcol;
 	  if (sendU) p++;
@@ -462,7 +462,7 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
 	    gb = supno_n[k];
 	    if (sendL)
 	      p = PNUM( PROW(gb, grid), pc, grid );
-	    else 
+	    else
 	      p = PNUM( pr, PCOL(gb, grid), grid);
 	    if (send_1[p] == FALSE) {
 	      send_1[p] = TRUE;
@@ -497,10 +497,10 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
 		}
 	      }
 	      send_1[p] = FALSE;
-	  }  
+	  }
 	if (sendU)
 	  for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) {
-	    if (send_1[p] || p == p_diag) {	      
+	    if (send_1[p] || p == p_diag) {
 	      for (k = 0; k < nsend_2; k++) {
 		gb = supno_n[send_2[k]];
 		if(PNUM( pr, PCOL(gb, grid), grid) != p) {
@@ -511,15 +511,15 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
 		  else {
 		    snd_luind[ptrToSend[p]] = send_2[k];
 		    ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++;
-		  }	     
+		  }
 		}
-	      } 
+	      }
 	      send_1[p] = FALSE;
 	    }
 	  }
       }
     }
-    
+
     /* reset ptrToSnd to point to the beginning of the data for
        each processor (structure needed in MPI_Alltoallv) */
     for (i = 0, p = 0; p < nprocs; p++) {
@@ -547,24 +547,24 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
       intBuf3 = nnzToRecv;  intBuf4 = ptrToRecv;
 #endif
 
-      MPI_Alltoallv (snd_luind, intBuf1, intBuf2, mpi_int_t, 
+      MPI_Alltoallv (snd_luind, intBuf1, intBuf2, mpi_int_t,
 		     rcv_luind, intBuf3, intBuf4, mpi_int_t,
 		     grid->comm);
     }
     if (sendL)
       nnzToRecv[iam] = nnz_loc_l;
-    else 
+    else
       nnzToRecv[iam] = nnz_loc_u;
-    
+
     /* ------------------------------------------------------------
        DEALLOCATE TEMPORARY STORAGE.
        -------------------------------------------------------------*/
-    if (sendU) 
+    if (sendU)
       if ( nprocs > 1 && (SendCnt_l || SendCnt_u) ) {
 	SUPERLU_FREE (snd_luind);
 	memAux -= (float) SUPERLU_MAX(SendCnt_l, SendCnt_u) * iword;
       }
-    
+
     /* ------------------------------------------------------------
        CONVERT THE FORMAT.
        ------------------------------------------------------------*/
@@ -588,9 +588,9 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
 
     if (sendL) j = nsupers_j;
     else j = nsupers_i;
-    k = 0; 
+    k = 0;
     isize = xsub_n[0];
-    xsub_n[0] = 0; 
+    xsub_n[0] = 0;
     for (gb_l = 1; gb_l < j; gb_l++) {
       k += isize;
       isize = xsub_n[gb_l];
@@ -620,7 +620,7 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
       }
       sub_n = usub_n;
     }
-    
+
     /* Copy the data into the L column / U row oriented storage */
     k = 0;
     for (p = 0; p < nprocs; p++) {
@@ -636,7 +636,7 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
 	for (j = xsub_n[gb_l]; j < xsub_n[gb_l+1]; i++, j++) {
 	  sub_n[j] = rcv_luind[i];
 	}
-      }      
+      }
       k += nnzToRecv[p];
     }
     if (sendL) {
@@ -651,23 +651,23 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
     SUPERLU_FREE (rcv_luind);
     memAux -= (float) SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u) * iword;
   }
-  SUPERLU_FREE (mem);  
+  SUPERLU_FREE (mem);
   memAux -= (float) (12 * nprocs * iword);
   SUPERLU_FREE(nvtcs);
   memAux -= (float) (5 * nprocs * sizeof(int));
-  
+
   if (xlsub_s != NULL) {
     SUPERLU_FREE (xlsub_s); SUPERLU_FREE (lsub_s);
   }
   if (xusub_s != NULL) {
     SUPERLU_FREE (xusub_s); SUPERLU_FREE (usub_s);
   }
-  SUPERLU_FREE (globToLoc); 
+  SUPERLU_FREE (globToLoc);
   if (supno_s != NULL) {
     SUPERLU_FREE (xsup_beg_s); SUPERLU_FREE (xsup_end_s);
     SUPERLU_FREE (supno_s);
   }
-  
+
   Glu_persist->supno = supno_n;  Glu_persist->xsup  = xsup_n;
   *p_xlsub = xlsub_n; *p_lsub = lsub_n;
   *p_xusub = xusub_n; *p_usub = usub_n;
@@ -675,10 +675,10 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
 #if ( DEBUGlevel>=1 )
   CHECK_MALLOC(iam, "Exit dist_symbLU()");
 #endif
-  
+
   return (-memRet);
 }
- 
+
 /*! \brief
  *
  * <pre>
@@ -687,10 +687,10 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
  *   Re-distribute A on the 2D process mesh.  The lower part is
  *   stored using a column format and the upper part
  *   is stored using a row format.
- * 
+ *
  * Arguments
  * =========
- * 
+ *
  * A      (Input) SuperMatrix*
  *	  The distributed input matrix A of dimension (A->nrow, A->ncol).
  *        The type of A can be: Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE.
@@ -701,40 +701,40 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
  *
  * Glu_persist  (Input) Glu_persist_t *
  *        Information on supernodes mapping.
- * 
+ *
  * grid   (Input) gridinfo_t*
  *        The 2D process mesh.
  *
  * p_ainf_colptr (Output) int_t**
- *         Pointer to the lower part of A distributed on a 2D grid 
+ *         Pointer to the lower part of A distributed on a 2D grid
  *         of processors, stored by columns.
  *
  * p_ainf_rowind (Output) int_t**
- *         Structure of of the lower part of A distributed on a 
+ *         Structure of the lower part of A distributed on a
  *         2D grid of processors, stored by columns.
  *
  * p_ainf_val    (Output) double**
- *         Numerical values of the lower part of A, distributed on a 
+ *         Numerical values of the lower part of A, distributed on a
  *         2D grid of processors, stored by columns.
  *
  * p_asup_rowptr (Output) int_t**
- *         Pointer to the upper part of A distributed on a 2D grid 
+ *         Pointer to the upper part of A distributed on a 2D grid
  *         of processors, stored by rows.
  *
  * p_asup_colind (Output) int_t**
- *         Structure of of the upper part of A distributed on a 
+ *         Structure of the upper part of A distributed on a
  *         2D grid of processors, stored by rows.
  *
  * p_asup_val    (Output) double**
- *         Numerical values of the upper part of A, distributed on a 
+ *         Numerical values of the upper part of A, distributed on a
  *         2D grid of processors, stored by rows.
  *
  * ilsum_i  (Input) int_t *
- *       Starting position of each supernode in 
+ *       Starting position of each supernode in
  *       the full array (local, block row wise).
  *
  * ilsum_j  (Input) int_t *
- *       Starting position of each supernode in 
+ *       Starting position of each supernode in
  *       the full array (local, block column wise).
  *
  * Return value
@@ -744,10 +744,10 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
  *        (an approximation).
  * </pre>
  */
- 
+
 static float
 ddist_A(SuperMatrix *A, ScalePermstruct_t *ScalePermstruct,
-	Glu_persist_t *Glu_persist, gridinfo_t *grid, 
+	Glu_persist_t *Glu_persist, gridinfo_t *grid,
 	int_t **p_ainf_colptr, int_t **p_ainf_rowind, double **p_ainf_val,
 	int_t **p_asup_rowptr, int_t **p_asup_colind, double **p_asup_val,
 	int_t *ilsum_i, int_t *ilsum_j
@@ -772,7 +772,7 @@ ddist_A(SuperMatrix *A, ScalePermstruct_
   MPI_Request *send_req;
   MPI_Status  status;
   int_t *xsup = Glu_persist->xsup;    /* supernode and column mapping */
-  int_t *supno = Glu_persist->supno;   
+  int_t *supno = Glu_persist->supno;
   float memAux;  /* Memory used during this routine and freed on return */
   float memRet; /* Memory allocated and not freed on return */
   int_t iword, dword, szbuf;
@@ -786,7 +786,7 @@ ddist_A(SuperMatrix *A, ScalePermstruct_
 #endif
   iword = sizeof(int_t);
   dword = sizeof(double);
-  
+
   perm_r = ScalePermstruct->perm_r;
   perm_c = ScalePermstruct->perm_c;
   procs = grid->nprow * grid->npcol;
@@ -801,7 +801,7 @@ ddist_A(SuperMatrix *A, ScalePermstruct_
   memAux = (float) (2 * procs * iword);
   memRet = 0.;
   nnzToSend = nnzToRecv + procs;
-  nsupers  = supno[n-1] + 1;  
+  nsupers  = supno[n-1] + 1;
 
   /* ------------------------------------------------------------
      COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS,
@@ -815,17 +815,17 @@ ddist_A(SuperMatrix *A, ScalePermstruct_
       gbi = BlockNum( irow );
       gbj = BlockNum( jcol );
       p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid );
-      ++nnzToSend[p]; 
+      ++nnzToSend[p];
     }
   }
-  
+
   /* All-to-all communication */
   MPI_Alltoall( nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t,
 		grid->comm);
-  
+
   maxnnzToRecv = 0;
   nnz_loc = SendCnt = RecvCnt = 0;
-  
+
   for (p = 0; p < procs; ++p) {
     if ( p != iam ) {
       SendCnt += nnzToSend[p];
@@ -851,7 +851,7 @@ ddist_A(SuperMatrix *A, ScalePermstruct_
     return (memAux);
   }
   memAux += (float) (k*dword);
-  
+
   /* Allocate temporary storage for sending/receiving the A triplets. */
   if ( procs > 1 ) {
     if ( !(send_req = (MPI_Request *)
@@ -869,7 +869,7 @@ ddist_A(SuperMatrix *A, ScalePermstruct_
       fprintf(stderr, "Malloc fails for aij_send[].");
       return (memAux);
     }
-    memAux += (float) (procs*sizeof(double*));    
+    memAux += (float) (procs*sizeof(double*));
     if ( !(index = intMalloc_dist(2*SendCnt)) ) {
       fprintf(stderr, "Malloc fails for index[].");
       return (memAux);
@@ -895,7 +895,7 @@ ddist_A(SuperMatrix *A, ScalePermstruct_
       return (memAux);
     }
     memAux += (float) (maxnnzToRecv * dword);
-    
+
     for (i = 0, j = 0, p = 0; p < procs; ++p) {
       if ( p != iam ) {
 	ia_send[p] = &index[i];
@@ -905,7 +905,7 @@ ddist_A(SuperMatrix *A, ScalePermstruct_
       }
     }
   } /* if procs > 1 */
-  
+
   nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
   nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */
   if ( !(ainf_colptr = intCalloc_dist(ilsum_j[nsupers_j] + 1)) ) {
@@ -918,7 +918,7 @@ ddist_A(SuperMatrix *A, ScalePermstruct_
     return (memAux+memRet);
   }
   memRet += (float) (ilsum_i[nsupers_i] + 1) * iword;
-  
+
   /* ------------------------------------------------------------
      LOAD THE ENTRIES OF A INTO THE (IA,JA,AIJ) STRUCTURES TO SEND.
      THIS ACCOUNTS FOR THE SECOND PASS OF A.
@@ -933,13 +933,13 @@ ddist_A(SuperMatrix *A, ScalePermstruct_
       gbi = BlockNum( irow );
       gbj = BlockNum( jcol );
       p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid );
-      
+
       if ( p != iam ) { /* remote */
 	k = ptr_to_send[p];
 	ia_send[p][k] = irow;
 	ia_send[p][k + nnzToSend[p]] = jcol;
 	aij_send[p][k] = nzval_a[j];
-	++ptr_to_send[p]; 
+	++ptr_to_send[p];
       } else {          /* local */
 	ia[nnz_loc] = irow;
 	ja[nnz_loc] = jcol;
@@ -969,14 +969,14 @@ ddist_A(SuperMatrix *A, ScalePermstruct_
 		 p, iam, grid->comm, &send_req[p] );
       it = nnzToSend[p];
       MPI_Isend( aij_send[p], it, MPI_DOUBLE,
-		 p, iam+procs, grid->comm, &send_req[procs+p] ); 
+		 p, iam+procs, grid->comm, &send_req[procs+p] );
     }
   }
-  
+
   for (p = 0; p < procs; ++p) {
     if ( p != iam ) {
       it = 2*nnzToRecv[p];
-      MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status ); 
+      MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status );
       it = nnzToRecv[p];
       MPI_Recv( dtemp, it, MPI_DOUBLE, p, p+procs,
 		grid->comm, &status );
@@ -988,7 +988,7 @@ ddist_A(SuperMatrix *A, ScalePermstruct_
 	ja[nnz_loc] = jcol;
 	aij[nnz_loc] = dtemp[i];
 	++nnz_loc;
-	
+
 	gbi = BlockNum( irow );
 	gbj = BlockNum( jcol );
 	/* Count nonzeros in each column of L / row of U */
@@ -1003,18 +1003,18 @@ ddist_A(SuperMatrix *A, ScalePermstruct_
       }
     }
   }
-  
+
   for (p = 0; p < procs; ++p) {
     if ( p != iam ) {
       MPI_Wait( &send_req[p], &status);
       MPI_Wait( &send_req[procs+p], &status);
     }
   }
-  
+
   /* ------------------------------------------------------------
      DEALLOCATE TEMPORARY STORAGE
      ------------------------------------------------------------*/
-  
+
   SUPERLU_FREE(nnzToRecv);
   memAux -= 2 * procs * iword;
   if ( procs > 1 ) {
@@ -1031,7 +1031,7 @@ ddist_A(SuperMatrix *A, ScalePermstruct_
       SendCnt* dword + procs*iword +
       2*maxnnzToRecv*iword + maxnnzToRecv*dword;
   }
-  
+
   /* ------------------------------------------------------------
      CONVERT THE TRIPLET FORMAT.
      ------------------------------------------------------------*/
@@ -1069,11 +1069,11 @@ ddist_A(SuperMatrix *A, ScalePermstruct_
   }
 
   /* Initialize the array of column pointers */
-  k = 0; 
-  jsize = ainf_colptr[0];  ainf_colptr[0] = 0; 
+  k = 0;
+  jsize = ainf_colptr[0];  ainf_colptr[0] = 0;
   for (j = 1; j < ilsum_j[nsupers_j]; j++) {
-    k += jsize;              
-    jsize = ainf_colptr[j];  
+    k += jsize;
+    jsize = ainf_colptr[j];
     ainf_colptr[j] = k;
   }
   ainf_colptr[ilsum_j[nsupers_j]] = k + jsize;
@@ -1081,7 +1081,7 @@ ddist_A(SuperMatrix *A, ScalePermstruct_
   isize = asup_rowptr[0];  asup_rowptr[0] = 0;
   for (j = 1; j < ilsum_i[nsupers_i]; j++) {
     i += isize;
-    isize = asup_rowptr[j];  
+    isize = asup_rowptr[j];
     asup_rowptr[j] = i;
   }
   asup_rowptr[ilsum_i[nsupers_i]] = i + isize;
@@ -1110,19 +1110,19 @@ ddist_A(SuperMatrix *A, ScalePermstruct_
   }
 
   /* Reset the column pointers to the beginning of each column */
-  for (j = ilsum_j[nsupers_j]; j > 0; j--) 
+  for (j = ilsum_j[nsupers_j]; j > 0; j--)
     ainf_colptr[j] = ainf_colptr[j-1];
-  for (j = ilsum_i[nsupers_i]; j > 0; j--) 
+  for (j = ilsum_i[nsupers_i]; j > 0; j--)
     asup_rowptr[j] = asup_rowptr[j-1];
   ainf_colptr[0] = 0;
   asup_rowptr[0] = 0;
-  
+
   SUPERLU_FREE(ia);
   SUPERLU_FREE(aij);
   memAux -= 2*szbuf*iword + szbuf*dword;
-  
+
   *p_ainf_colptr = ainf_colptr;
-  *p_ainf_rowind = ainf_rowind; 
+  *p_ainf_rowind = ainf_rowind;
   *p_ainf_val    = ainf_val;
   *p_asup_rowptr = asup_rowptr;
   *p_asup_colind = asup_colind;
@@ -1142,10 +1142,10 @@ ddist_A(SuperMatrix *A, ScalePermstruct_
  * Purpose
  * =======
  *   Distribute the input matrix onto the 2D process mesh.
- * 
+ *
  * Arguments
  * =========
- * 
+ *
  * fact (input) fact_t
  *        Specifies whether or not the L and U structures will be re-used.
  *        = SamePattern_SameRowPerm: L and U structures are input, and
@@ -1168,7 +1168,7 @@ ddist_A(SuperMatrix *A, ScalePermstruct_
  *
  * Glu_freeable (Input) *Glu_freeable_t
  *        The global structure describing the graph of L and U.
- * 
+ *
  * LUstruct (Input) LUstruct_t*
  *        Data structures for L and U factors.
  *
@@ -1187,22 +1187,22 @@ ddist_A(SuperMatrix *A, ScalePermstruct_
 float
 ddist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A,
 		ScalePermstruct_t *ScalePermstruct,
-		Pslu_freeable_t *Pslu_freeable, 
+		Pslu_freeable_t *Pslu_freeable,
 		LUstruct_t *LUstruct, gridinfo_t *grid)
 {
   Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
   Glu_freeable_t Glu_freeable_n;
   LocalLU_t *Llu = LUstruct->Llu;
-  int_t bnnz, fsupc, i, irow, istart, j, jb, ib, jj, k, k1, 
+  int_t bnnz, fsupc, i, irow, istart, j, jb, ib, jj, k, k1,
     len, len1, nsupc, nsupc_gb, ii, nprocs;
   int_t lib;  /* local block row number */
-  int_t nlb;  /* local block rows*/    
+  int_t nlb;  /* local block rows*/
   int_t ljb;  /* local block column number */
   int_t nrbl; /* number of L blocks in current block column */
   int_t nrbu; /* number of U blocks in current block column */
   int_t gb;   /* global block number; 0 < gb <= nsuper */
   int_t lb;   /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */
-  int_t ub,gik,iklrow,fnz;   
+  int_t ub,gik,iklrow,fnz;
   int iam, jbrow, jbcol, jcol, kcol, krow, mycol, myrow, pc, pr, ljb_i, ljb_j, p;
   int_t mybufmax[NBUFFERS];
   NRformat_loc *Astore;
@@ -1222,45 +1222,45 @@ ddist_psymbtonum(fact_t fact, int_t n, S
   int *ptrToRecv, *nnzToRecv, *ptrToSend, *nnzToSend;
   double **Lnzval_bc_ptr;  /* size ceil(NSUPERS/Pc) */
   double **Linv_bc_ptr;  /* size ceil(NSUPERS/Pc) */
-  double **Uinv_bc_ptr;  /* size ceil(NSUPERS/Pc) */  
+  double **Uinv_bc_ptr;  /* size ceil(NSUPERS/Pc) */
   int_t  **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */
-  int_t   **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc)                 */	 
-  int_t *index_srt;         /* indices consist of headers and row subscripts */	
-  double *lusup_srt; /* nonzero values in L and U */    
+  int_t   **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc)                 */
+  int_t *index_srt;         /* indices consist of headers and row subscripts */
+  double *lusup_srt; /* nonzero values in L and U */
   double **Unzval_br_ptr;  /* size ceil(NSUPERS/Pr) */
   int_t  **Ufstnz_br_ptr;  /* size ceil(NSUPERS/Pr) */
   int_t  *Unnz;  /* size ceil(NSUPERS/Pc) */
-  
+
   BcTree  *LBtree_ptr;       /* size ceil(NSUPERS/Pc)                */
   RdTree  *LRtree_ptr;		  /* size ceil(NSUPERS/Pr)                */
   BcTree  *UBtree_ptr;       /* size ceil(NSUPERS/Pc)                */
-  RdTree  *URtree_ptr;		  /* size ceil(NSUPERS/Pr)                */	
+  RdTree  *URtree_ptr;		  /* size ceil(NSUPERS/Pr)                */
   int msgsize;
 
   int_t  *Urbs,*Urbs1; /* Number of row blocks in each block column of U. */
   Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
-  int_t  **Ucb_valptr;      /* Vertical linked list pointing to Unzval[] */  
- 
- 
+  int_t  **Ucb_valptr;      /* Vertical linked list pointing to Unzval[] */
+
+
   /*-- Counts to be used in factorization. --*/
   int  *ToRecv, *ToSendD, **ToSendR;
-  
+
   /*-- Counts to be used in lower triangular solve. --*/
   int_t  *fmod;          /* Modification count for L-solve.        */
   int_t  **fsendx_plist; /* Column process list to send down Xk.   */
   int_t  nfrecvx = 0;    /* Number of Xk I will receive.           */
   int_t  nfsendx = 0;    /* Number of Xk I will send               */
   int_t  kseen;
-  
+
   /*-- Counts to be used in upper triangular solve. --*/
   int_t  *bmod;          /* Modification count for U-solve.        */
   int_t  **bsendx_plist; /* Column process list to send down Xk.   */
   int_t  nbrecvx = 0;    /* Number of Xk I will receive.           */
-  int_t  nbsendx = 0;    /* Number of Xk I will send               */  
-  int_t  *ilsum;         /* starting position of each supernode in 
-			    the full array (local)                 */  
-  int_t  *ilsum_j, ldaspa_j; /* starting position of each supernode in 
-				the full array (local, block column wise) */  
+  int_t  nbsendx = 0;    /* Number of Xk I will send               */
+  int_t  *ilsum;         /* starting position of each supernode in
+			    the full array (local)                 */
+  int_t  *ilsum_j, ldaspa_j; /* starting position of each supernode in
+				the full array (local, block column wise) */
   /*-- Auxiliary arrays; freed on return --*/
   int_t *Urb_marker;  /* block hit marker; size ceil(NSUPERS/Pr)           */
   int_t *LUb_length; /* L,U block length; size nsupers_ij */
@@ -1282,31 +1282,31 @@ double *dense, *dense_col; /* SPA */
   int_t iword, dword;
   float mem_use = 0.0;
   int_t *mod_bit;
-  int_t *frecv, *brecv, *lloc; 
-  double *SeedSTD_BC,*SeedSTD_RD;				 
+  int_t *frecv, *brecv, *lloc;
+  double *SeedSTD_BC,*SeedSTD_RD;
   int_t idx_indx,idx_lusup;
   int_t nbrow;
   int_t  ik, il, lk, rel, knsupc, idx_r;
-  int_t  lptr1_tmp, idx_i, idx_v,m, uu;	
+  int_t  lptr1_tmp, idx_i, idx_v,m, uu;
   int_t	nub;
 
   float memStrLU, memA,
         memDist = 0.; /* memory used for redistributing the data, which does
 		         not include the memory for the numerical values
                          of L and U (positive number)*/
-  float  memNLU = 0.; /* memory allocated for storing the numerical values of 
+  float  memNLU = 0.; /* memory allocated for storing the numerical values of
 		         L and U, that will be used in the numeric
                          factorization (positive number) */
-  float  memTRS = 0.; /* memory allocated for storing the meta-data for triangular solve (positive number)*/		
-  
+  float  memTRS = 0.; /* memory allocated for storing the meta-data for triangular solve (positive number)*/
+
 #if ( PRNTlevel>=1 )
   int_t nLblocks = 0, nUblocks = 0;
 #endif
-#if ( PROFlevel>=1 ) 
+#if ( PROFlevel>=1 )
 	double t, t_u, t_l;
 	int_t u_blks;
 #endif
-  
+
   /* Initialization. */
   iam = grid->iam;
 #if ( DEBUGlevel>=1 )
@@ -1317,27 +1317,27 @@ double *dense, *dense_col; /* SPA */
   nprocs = grid->npcol * grid->nprow;
   for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0;
   Astore   = (NRformat_loc *) A->Store;
-  
+
   iword = sizeof(int_t);
   dword = sizeof(double);
 
   if (fact == SamePattern_SameRowPerm) {
-    ABORT ("ERROR: call of dist_psymbtonum with fact equals SamePattern_SameRowPerm.");  
+    ABORT ("ERROR: call of dist_psymbtonum with fact equal to SamePattern_SameRowPerm.");
   }
 
-  if ((memStrLU = 
-       dist_symbLU (n, Pslu_freeable, 
+  if ((memStrLU =
+       dist_symbLU (n, Pslu_freeable,
 		    Glu_persist, &xlsub, &lsub, &xusub, &usub,	grid)) > 0)
     return (memStrLU);
   memDist += (-memStrLU);
   xsup  = Glu_persist->xsup;    /* supernode and column mapping */
-  supno = Glu_persist->supno;   
+  supno = Glu_persist->supno;
   nsupers  = supno[n-1] + 1;
   nsupers_i = CEILING( nsupers, grid->nprow );/* No of local row blocks */
   nsupers_j = CEILING( nsupers, grid->npcol );/* No of local column blocks */
   nsupers_ij = SUPERLU_MAX(nsupers_i, nsupers_j);
   if ( !(ilsum = intMalloc_dist(nsupers_i+1)) ) {
-    fprintf (stderr, "Malloc fails for ilsum[].");  
+    fprintf (stderr, "Malloc fails for ilsum[].");
     return (memDist + memNLU + memTRS);
   }
   memNLU += (nsupers_i+1) * iword;
@@ -1350,7 +1350,7 @@ double *dense, *dense_col; /* SPA */
   /* Compute ldaspa and ilsum[], ldaspa_j and ilsum_j[]. */
   ilsum[0] = 0;
   ldaspa = 0;
-  for (gb = 0; gb < nsupers; gb++) 
+  for (gb = 0; gb < nsupers; gb++)
     if ( myrow == PROW( gb, grid ) ) {
       i = SuperSize( gb );
       ldaspa += i;
@@ -1359,8 +1359,8 @@ double *dense, *dense_col; /* SPA */
     }
   ilsum[nsupers_i] = ldaspa;
 
-  ldaspa_j = 0; ilsum_j[0] = 0;  
-  for (gb = 0; gb < nsupers; gb++) 
+  ldaspa_j = 0; ilsum_j[0] = 0;
+  for (gb = 0; gb < nsupers; gb++)
     if (mycol == PCOL( gb, grid )) {
       i = SuperSize( gb );
       ldaspa_j += i;
@@ -1368,7 +1368,7 @@ double *dense, *dense_col; /* SPA */
       ilsum_j[lb + 1] = ilsum_j[lb] + i;
     }
   ilsum_j[nsupers_j] = ldaspa_j;
-  
+
   if ((memA = ddist_A(A, ScalePermstruct, Glu_persist,
 		      grid, &ainf_colptr, &ainf_rowind, &ainf_val,
 		      &asup_rowptr, &asup_colind, &asup_val,
@@ -1379,7 +1379,7 @@ double *dense, *dense_col; /* SPA */
   /* ------------------------------------------------------------
      FIRST TIME CREATING THE L AND U DATA STRUCTURES.
      ------------------------------------------------------------*/
-  
+
   /* We first need to set up the L and U data structures and then
    * propagate the values of A into them.
    */
@@ -1389,7 +1389,7 @@ double *dense, *dense_col; /* SPA */
   }
   for (i = 0; i < nsupers; ++i) ToRecv[i] = 0;
   memNLU += nsupers * iword;
-  
+
   k = CEILING( nsupers, grid->npcol ); /* Number of local column blocks */
   if ( !(ToSendR = (int **) SUPERLU_MALLOC(k*sizeof(int*))) ) {
     fprintf(stderr, "Malloc fails for ToSendR[].");
@@ -1402,10 +1402,10 @@ double *dense, *dense_col; /* SPA */
     return (memDist + memNLU + memTRS);
   }
   memNLU += j*iword;
-  
+
   for (i = 0; i < j; ++i) index1[i] = EMPTY;
   for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index1[j];
-  
+
   /* Auxiliary arrays used to set up L and U block data structures.
      They are freed on return. */
   if ( !(LUb_length = intCalloc_dist(nsupers_ij)) ) {
@@ -1419,16 +1419,16 @@ double *dense, *dense_col; /* SPA */
   if ( !(LUb_number = intCalloc_dist(nsupers_ij)) ) {
     fprintf(stderr, "Calloc fails for LUb_number[].");
     return (memDist + memNLU + memTRS);
-  }    
+  }
   if ( !(LUb_valptr = intCalloc_dist(nsupers_ij)) ) {
     fprintf(stderr, "Calloc fails for LUb_valptr[].");
     return (memDist + memNLU + memTRS);
   }
   memDist += 4 * nsupers_ij * iword;
-  
-  k = CEILING( nsupers, grid->nprow ); 
+
+  k = CEILING( nsupers, grid->nprow );
   /* Pointers to the beginning of each block row of U. */
-  if ( !(Unzval_br_ptr = 
+  if ( !(Unzval_br_ptr =
 	 (double**)SUPERLU_MALLOC(nsupers_i * sizeof(double*))) ) {
     fprintf(stderr, "Malloc fails for Unzval_br_ptr[].");
     return (memDist + memNLU + memTRS);
@@ -1447,7 +1447,7 @@ double *dense, *dense_col; /* SPA */
   }
   for (i = 0; i < nsupers_i; ++i) ToSendD[i] = NO;
 
-  memNLU += nsupers_i*iword;  
+  memNLU += nsupers_i*iword;
   if ( !(Urb_marker = intCalloc_dist(nsupers_j))) {
     fprintf(stderr, "Calloc fails for rb_marker[].");
     return (memDist + memNLU + memTRS);
@@ -1457,11 +1457,11 @@ double *dense, *dense_col; /* SPA */
     return (memDist + memNLU + memTRS);
   }
   memDist += (nsupers_i + nsupers_j)*iword;
-  
+
   /* Auxiliary arrays used to set up L, U block data structures.
      They are freed on return.
      k is the number of local row blocks.   */
-  if ( !(dense = doubleCalloc_dist(SUPERLU_MAX(ldaspa, ldaspa_j) 
+  if ( !(dense = doubleCalloc_dist(SUPERLU_MAX(ldaspa, ldaspa_j)
 				   * sp_ienv_dist(3))) ) {
     fprintf(stderr, "Calloc fails for SPA dense[].");
     return (memDist + memNLU + memTRS);
@@ -1476,11 +1476,11 @@ double *dense, *dense_col; /* SPA */
     return (memDist + memNLU + memTRS);
   }
   /* ------------------------------------------------ */
-  memNLU += 2*nsupers_i*iword + 
-    SUPERLU_MAX(ldaspa, ldaspa_j)*sp_ienv_dist(3)*dword; 
-  
+  memNLU += 2*nsupers_i*iword +
+    SUPERLU_MAX(ldaspa, ldaspa_j)*sp_ienv_dist(3)*dword;
+
   /* Pointers to the beginning of each block column of L. */
-  if ( !(Lnzval_bc_ptr = 
+  if ( !(Lnzval_bc_ptr =
 	 (double**)SUPERLU_MALLOC(nsupers_j * sizeof(double*))) ) {
     fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[].");
     return (memDist + memNLU + memTRS);
@@ -1489,35 +1489,35 @@ double *dense, *dense_col; /* SPA */
     fprintf(stderr, "Malloc fails for Lrowind_bc_ptr[].");
     return (memDist + memNLU + memTRS);
   }
- 
-  if ( !(Linv_bc_ptr = 
+
+  if ( !(Linv_bc_ptr =
 			(double**)SUPERLU_MALLOC(nsupers_j * sizeof(double*))) ) {
 	fprintf(stderr, "Malloc fails for Linv_bc_ptr[].");
 	return (memDist + memNLU + memTRS);
-  }  
-  if ( !(Uinv_bc_ptr = 
+  }
+  if ( !(Uinv_bc_ptr =
 			(double**)SUPERLU_MALLOC(nsupers_j * sizeof(double*))) ) {
 	fprintf(stderr, "Malloc fails for Uinv_bc_ptr[].");
 	return (memDist + memNLU + memTRS);
-  }   
+  }
   if ( !(Lindval_loc_bc_ptr = (int_t**)SUPERLU_MALLOC(nsupers_j * sizeof(int_t*))) ){
     fprintf(stderr, "Malloc fails for Lindval_loc_bc_ptr[].");
     return (memDist + memNLU + memTRS);
-  }  
-  
+  }
+
   if ( !(Unnz = (int_t*)SUPERLU_MALLOC(nsupers_j * sizeof(int_t))) ){
     fprintf(stderr, "Malloc fails for Unnz[].");
     return (memDist + memNLU + memTRS);
-  }    
-  memTRS += nsupers_j*sizeof(int_t*) + 2.0*nsupers_j*sizeof(double*) + nsupers_j*iword;  //acount for Lindval_loc_bc_ptr, Unnz, Linv_bc_ptr,Uinv_bc_ptr    
-  
+  }
+  memTRS += nsupers_j*sizeof(int_t*) + 2.0*nsupers_j*sizeof(double*) + nsupers_j*iword;  //account for Lindval_loc_bc_ptr, Unnz, Linv_bc_ptr,Uinv_bc_ptr
+
   memNLU += nsupers_j * sizeof(double*) + nsupers_j * sizeof(int_t*)+ nsupers_j * sizeof(int_t*);
   Lnzval_bc_ptr[nsupers_j-1] = NULL;
   Lrowind_bc_ptr[nsupers_j-1] = NULL;
   Linv_bc_ptr[nsupers_j-1] = NULL;
   Uinv_bc_ptr[nsupers_j-1] = NULL;
-  Lindval_loc_bc_ptr[nsupers_j-1] = NULL;  
-  
+  Lindval_loc_bc_ptr[nsupers_j-1] = NULL;
+
   /* These lists of processes will be used for triangular solves. */
   if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(nsupers_j*sizeof(int_t*))) ) {
     fprintf(stderr, "Malloc fails for fsendx_plist[].");
@@ -1544,7 +1544,7 @@ double *dense, *dense_col; /* SPA */
     bsendx_plist[i] = &index[j];
   /* -------------------------------------------------------------- */
   memNLU += 2*nsupers_j*sizeof(int_t*) + 2*len*iword;
-  
+
   /*------------------------------------------------------------
     PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS.
     THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U.
@@ -1556,12 +1556,12 @@ double *dense, *dense_col; /* SPA */
     ljb_i = LBi( jb, grid);  /* Local block number row wise */
     fsupc = FstBlockC( jb );
     nsupc = SuperSize( jb );
-    
+
     if ( myrow == jbrow ) { /* Block row jb in my process row */
       /* Scatter A into SPA. */
       for (j = ilsum[ljb_i], dense_col = dense; j < ilsum[ljb_i]+nsupc; j++) {
 	for (i = asup_rowptr[j]; i < asup_rowptr[j+1]; i++) {
-	  if (i >= asup_rowptr[ilsum[nsupers_i]]) 
+	  if (i >= asup_rowptr[ilsum[nsupers_i]])
 	    printf ("ERR7\n");
 	  jcol = asup_colind[i];
 	  if (jcol >= n)
@@ -1578,7 +1578,7 @@ double *dense, *dense_col; /* SPA */
 	}
 	dense_col += ldaspa_j;
       }
-      
+
       /*------------------------------------------------
        * SET UP U BLOCKS.
        *------------------------------------------------*/
@@ -1590,18 +1590,18 @@ double *dense, *dense_col; /* SPA */
 	if (i >= xusub[nsupers_i]) printf ("ERR10\n");
 	jcol = usub[i];
 	gb = BlockNum( jcol ); /* Global block number */
-	
+
 	/*if (fsupc <= 146445 && 146445 < fsupc + nsupc && jcol == 397986)
 	  printf ("Pe[%d] [%d %d] elt [%d] jbcol %d pc %d\n",
 	  iam, jb, gb, jcol, jbcol, pc); */
-	
+
 	lb = LBj( gb, grid );  /* Local block number */
 	pc = PCOL( gb, grid ); /* Process col owning this block */
 	if (mycol == jbcol) ToSendR[ljb_j][pc] = YES;
 	/* if (mycol == jbcol && mycol != pc) ToSendR[ljb_j][pc] = YES; */
 	pr = PROW( gb, grid );
 	if ( pr != jbrow  && mycol == pc)
-	  bsendx_plist[lb][jbrow] = YES; 
+	  bsendx_plist[lb][jbrow] = YES;
 	if (mycol == pc) {
 	  len += nsupc;
 	  LUb_length[lb] += nsupc;
@@ -1623,8 +1623,8 @@ double *dense, *dense_col; /* SPA */
 	  }
 	}
       } /* for i ... */
-      
-      if ( nrbu ) { 
+
+      if ( nrbu ) {
 	/* Sort the blocks of U in increasing block column index.
 	   SuperLU_DIST assumes this is true */
 	/* simple insert sort algorithm */
@@ -1635,8 +1635,8 @@ double *dense, *dense_col; /* SPA */
 	    LUb_number[i+1] = LUb_number[i];
 	  }
 	  LUb_number[i+1] = k;
-	} 
-	
+	}
+
 	/* Set up the initial pointers for each block in
 	   index[] and nzval[]. */
 	/* Add room for descriptors */
@@ -1679,17 +1679,17 @@ double *dense, *dense_col; /* SPA */
 	for (i = xusub[ljb_i]; i < xusub[ljb_i+1]; i++) {
 	  jcol = usub[i];
 	  gb = BlockNum( jcol );
-	  
+
 	  if ( mycol == PCOL( gb, grid ) ) {
 	    lb = LBj( gb, grid );
 	    k = LUb_indptr[lb]; /* Start fstnz in index */
 	    index[k + jcol - FstBlockC( gb )] = FstBlockC( jb );
 	  }
 	}  /* for i ... */
-	
+
 	for (i = 0; i < nrbu; i++) {
 	  gb = LUb_number[i];
-	  lb = LBj( gb, grid );   
+	  lb = LBj( gb, grid );
 	  next_ind = LUb_indptr[lb];
 	  k = FstBlockC( jb + 1);
 	  jcol = ilsum_j[lb];
@@ -1699,16 +1699,16 @@ double *dense, *dense_col; /* SPA */
 	    for (ii = j; ii < k; ii++) {
 	      uval[LUb_valptr[lb]++] = dense_col[jcol];
 	      dense_col[jcol] = zero;
-	      dense_col += ldaspa_j;	      
+	      dense_col += ldaspa_j;
 	    }
 	  }
 	}
       } else {
 	Ufstnz_br_ptr[ljb_i] = NULL;
 	Unzval_br_ptr[ljb_i] = NULL;
-      } /* if nrbu ... */	
+      } /* if nrbu ... */
     } /* if myrow == jbrow */
-    
+
       /*------------------------------------------------
        * SET UP L BLOCKS.
        *------------------------------------------------*/
@@ -1728,8 +1728,8 @@ double *dense, *dense_col; /* SPA */
 	  }
 	}
 	dense_col += ldaspa;
-      }      
-      
+      }
+
       /* sort the indices of the diagonal block at the beginning of xlsub */
       if (myrow == jbrow) {
 	k = xlsub[ljb_j];
@@ -1742,14 +1742,14 @@ double *dense, *dense_col; /* SPA */
 	  }
 	}
       }
-      
+
       /* Count number of blocks and length of each block. */
       nrbl = 0;
       len = 0; /* Number of row subscripts I own. */
       kseen = 0;
       for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) {
 	irow = lsub[i];
-	gb = BlockNum( irow ); /* Global block number */	  
+	gb = BlockNum( irow ); /* Global block number */
 	pr = PROW( gb, grid ); /* Process row owning this block */
 	if ( pr != jbrow && fsendx_plist[ljb_j][pr] == EMPTY &&
 	     myrow == jbrow) {
@@ -1771,14 +1771,14 @@ double *dense, *dense_col; /* SPA */
 #if ( PRNTlevel>=1 )
 	    ++nLblocks;
 #endif
-	  } else 
-	    ++LUb_length[lb];	    
+	  } else
+	    ++LUb_length[lb];
 	  ++len;
 	}
       } /* for i ... */
-      
+
       if ( nrbl ) { /* Do not ensure the blocks are sorted! */
-	/* Set up the initial pointers for each block in 
+	/* Set up the initial pointers for each block in
 	   index[] and nzval[]. */
 	/* If I am the owner of the diagonal block, order it first in LUb_number.
 	   Necessary for SuperLU_DIST routines */
@@ -1791,7 +1791,7 @@ double *dense, *dense_col; /* SPA */
 	  LUb_number[kseen] = LUb_number[0];
 	  LUb_number[0] = jb;
 	}
-	
+
 	/* Add room for descriptors */
 	len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
 	if ( !(index = intMalloc_dist(len1)) ) {
@@ -1799,23 +1799,23 @@ double *dense, *dense_col; /* SPA */
 	  return (memDist + memNLU + memTRS);
 	}
 	Lrowind_bc_ptr[ljb_j] = index;
-	if (!(Lnzval_bc_ptr[ljb_j] = 
+	if (!(Lnzval_bc_ptr[ljb_j] =
 	      doubleMalloc_dist(len*nsupc))) {
 	  fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[*][] col block " IFMT, jb);
 	  return (memDist + memNLU + memTRS);
 	}
-	
+
 	if (!(Linv_bc_ptr[ljb_j] = (double*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(double))))
 		ABORT("Malloc fails for Linv_bc_ptr[ljb_j][]");
 	if (!(Uinv_bc_ptr[ljb_j] = (double*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(double))))
-		ABORT("Malloc fails for Uinv_bc_ptr[ljb_j][]");	
-	
+		ABORT("Malloc fails for Uinv_bc_ptr[ljb_j][]");
+
 	memNLU += len1*iword + len*nsupc*dword;
 
-	if ( !(Lindval_loc_bc_ptr[ljb_j] = intCalloc_dist(nrbl*3))) 
+	if ( !(Lindval_loc_bc_ptr[ljb_j] = intCalloc_dist(nrbl*3)))
 		ABORT("Malloc fails for Lindval_loc_bc_ptr[ljb_j][]");
-	memTRS += nrbl*3.0*iword + 2.0*nsupc*nsupc*dword;  //acount for Lindval_loc_bc_ptr[ljb],Linv_bc_ptr[ljb],Uinv_bc_ptr[ljb]	
-	
+	memTRS += nrbl*3.0*iword + 2.0*nsupc*nsupc*dword;  //account for Lindval_loc_bc_ptr[ljb],Linv_bc_ptr[ljb],Uinv_bc_ptr[ljb]
+
 	lusup = Lnzval_bc_ptr[ljb_j];
 	mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 );
 	mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc );
@@ -1828,14 +1828,14 @@ double *dense, *dense_col; /* SPA */
 	  gb = LUb_number[k];
 	  lb = LBi( gb, grid );
 	  len = LUb_length[lb];
-	  
+
 	  Lindval_loc_bc_ptr[ljb_j][k] = lb;
 	  Lindval_loc_bc_ptr[ljb_j][k+nrbl] = next_ind;
-	  Lindval_loc_bc_ptr[ljb_j][k+nrbl*2] = next_val;			  
-	 	  
+	  Lindval_loc_bc_ptr[ljb_j][k+nrbl*2] = next_val;
+
 	  LUb_length[lb] = 0;
 	  index[next_ind++] = gb; /* Descriptor */
-	  index[next_ind++] = len; 
+	  index[next_ind++] = len;
 	  LUb_indptr[lb] = next_ind;
 	    LUb_valptr[lb] = next_val;
 	    next_ind += len;
@@ -1861,8 +1861,8 @@ double *dense, *dense_col; /* SPA */
 	      }
 	    }
 	  } /* for i ... */
-	  
-	  
+
+
 
 		/* sort Lindval_loc_bc_ptr[ljb_j], Lrowind_bc_ptr[ljb_j] and Lnzval_bc_ptr[ljb_j] here*/
 		if(nrbl>1){
@@ -1871,18 +1871,18 @@ double *dense, *dense_col; /* SPA */
 				uu=nrbl-2;
 				lloc = &Lindval_loc_bc_ptr[ljb_j][1];
 			}else{
-				uu=nrbl-1;	
+				uu=nrbl-1;
 				lloc = Lindval_loc_bc_ptr[ljb_j];
-			}	
-			quickSortM(lloc,0,uu,nrbl,0,3);	
+			}
+			quickSortM(lloc,0,uu,nrbl,0,3);
 		}
 
 
-		if ( !(index_srt = intMalloc_dist(len1)) ) 
-			ABORT("Malloc fails for index_srt[]");				
+		if ( !(index_srt = intMalloc_dist(len1)) )
+			ABORT("Malloc fails for index_srt[]");
 		if (!(lusup_srt = (double*)SUPERLU_MALLOC(len*nsupc * sizeof(double))))
 			ABORT("Malloc fails for lusup_srt[]");
-				
+
 		idx_indx = BC_HEADER;
 		idx_lusup = 0;
 		for (jj=0;jj<BC_HEADER;jj++)
@@ -1894,33 +1894,33 @@ double *dense, *dense_col; /* SPA */
 				index_srt[idx_indx++] = index[Lindval_loc_bc_ptr[ljb_j][i+nrbl]+jj];
 			}
 
-			Lindval_loc_bc_ptr[ljb_j][i+nrbl] = idx_indx - LB_DESCRIPTOR - nbrow; 
+			Lindval_loc_bc_ptr[ljb_j][i+nrbl] = idx_indx - LB_DESCRIPTOR - nbrow;
 
 			for (jj=0;jj<nbrow;jj++){
 				k=idx_lusup;
 				k1=Lindval_loc_bc_ptr[ljb_j][i+nrbl*2]+jj;
-				for (j = 0; j < nsupc; ++j) {				
+				for (j = 0; j < nsupc; ++j) {
 					lusup_srt[k] = lusup[k1];
 					k += len;
 					k1 += len;
-				}	
+				}
 				idx_lusup++;
-			}				
-			Lindval_loc_bc_ptr[ljb_j][i+nrbl*2] = idx_lusup - nbrow;	
+			}
+			Lindval_loc_bc_ptr[ljb_j][i+nrbl*2] = idx_lusup - nbrow;
 		}
 
 		SUPERLU_FREE(lusup);
 		SUPERLU_FREE(index);
 
 		Lrowind_bc_ptr[ljb_j] = index_srt;
-		Lnzval_bc_ptr[ljb_j] = lusup_srt; 			
+		Lnzval_bc_ptr[ljb_j] = lusup_srt;
 	} else {
 	  Lrowind_bc_ptr[ljb_j] = NULL;
 	  Lnzval_bc_ptr[ljb_j] = NULL;
 	  Linv_bc_ptr[ljb_j] = NULL;
 	  Uinv_bc_ptr[ljb_j] = NULL;
-	  Lindval_loc_bc_ptr[ljb_j] = NULL;	  
-	} /* if nrbl ... */		  
+	  Lindval_loc_bc_ptr[ljb_j] = NULL;
+	} /* if nrbl ... */
       } /* if mycol == pc */
   } /* for jb ... */
 
@@ -1932,7 +1932,7 @@ double *dense, *dense_col; /* SPA */
   SUPERLU_FREE(LUb_valptr);
   SUPERLU_FREE(Lrb_marker);
   SUPERLU_FREE(dense);
-  
+
   /* Free the memory used for storing A */
   SUPERLU_FREE(ainf_colptr);
   if (ainf_rowind != NULL) {
@@ -1941,10 +1941,10 @@ double *dense, *dense_col; /* SPA */
   }
   SUPERLU_FREE(asup_rowptr);
   if (asup_colind != NULL) {
-    SUPERLU_FREE(asup_colind);	
-    SUPERLU_FREE(asup_val);	
+    SUPERLU_FREE(asup_colind);
+    SUPERLU_FREE(asup_val);
   }
-  
+
   /* exchange information about bsendx_plist in between column of processors */
   k = SUPERLU_MAX( grid->nprow, grid->npcol);
   if ( !(recvBuf = (int_t *) SUPERLU_MALLOC(nsupers*k*iword)) ) {
@@ -1967,19 +1967,19 @@ double *dense, *dense_col; /* SPA */
     fprintf (stderr, "Malloc fails for ptrToRecv[].");
     return (memDist + memNLU + memTRS);
   }
-  
+
   if (memDist < (nsupers*k*iword +4*nprocs * sizeof(int)))
     memDist = nsupers*k*iword +4*nprocs * sizeof(int);
-  
+
   for (p = 0; p < nprocs; p++)
     nnzToRecv[p] = 0;
-  
+
   for (jb = 0; jb < nsupers; jb++) {
     jbcol = PCOL( jb, grid );
     jbrow = PROW( jb, grid );
     p = PNUM(jbrow, jbcol, grid);
     nnzToRecv[p] += grid->npcol;
-  }    
+  }
   i = 0;
   for (p = 0; p < nprocs; p++) {
     ptrToRecv[p] = i;
@@ -1997,21 +1997,21 @@ double *dense, *dense_col; /* SPA */
     jbrow = PROW( jb, grid );
     p = PNUM(jbrow, jbcol, grid);
     if (p == iam) {
-      ljb_j = LBj( jb, grid ); /* Local block number column wise */	
+      ljb_j = LBj( jb, grid ); /* Local block number column wise */
       for (j = 0; j < grid->npcol; j++, i++)
 	recvBuf[i] = ToSendR[ljb_j][j];
     }
-  }   
-  
+  }
+
   MPI_Alltoallv (&(recvBuf[ptrToRecv[iam]]), nnzToSend, ptrToSend, mpi_int_t,
 		 recvBuf, nnzToRecv, ptrToRecv, mpi_int_t, grid->comm);
-  
+
   for (jb = 0; jb < nsupers; jb++) {
     jbcol = PCOL( jb, grid );
     jbrow = PROW( jb, grid );
     p = PNUM(jbrow, jbcol, grid);
-    ljb_j = LBj( jb, grid ); /* Local block number column wise */	
-    ljb_i = LBi( jb, grid ); /* Local block number row wise */	
+    ljb_j = LBj( jb, grid ); /* Local block number column wise */
+    ljb_i = LBi( jb, grid ); /* Local block number row wise */
     /* (myrow == jbrow) {
        if (ToSendD[ljb_i] == YES)
        ToRecv[jb] = 1;
@@ -2027,22 +2027,22 @@ double *dense, *dense_col; /* SPA */
 	ToRecv[jb] = 2;
     }
     if (mycol == jbcol) {
-      for (i = 0, j = ptrToRecv[p]; i < grid->npcol; i++, j++) 
-	ToSendR[ljb_j][i] = recvBuf[j];  
+      for (i = 0, j = ptrToRecv[p]; i < grid->npcol; i++, j++)
+	ToSendR[ljb_j][i] = recvBuf[j];
       ToSendR[ljb_j][mycol] = EMPTY;
     }
     ptrToRecv[p] += grid->npcol;
-  }   
-  
+  }
+
   /* exchange information about bsendx_plist in between column of processors */
   MPI_Allreduce ((*bsendx_plist), recvBuf, nsupers_j * grid->nprow, mpi_int_t,
 		 MPI_MAX, grid->cscp.comm);
-  
+
   for (jb = 0; jb < nsupers; jb ++) {
     jbcol = PCOL( jb, grid);
     jbrow = PROW( jb, grid);
     if (mycol == jbcol) {
-      ljb_j = LBj( jb, grid ); /* Local block number column wise */	
+      ljb_j = LBj( jb, grid ); /* Local block number column wise */
       if (myrow == jbrow ) {
 	for (k = ljb_j * grid->nprow; k < (ljb_j+1) * grid->nprow; k++) {
 	  (*bsendx_plist)[k] = recvBuf[k];
@@ -2051,14 +2051,14 @@ double *dense, *dense_col; /* SPA */
 	}
       }
       else {
-	for (k = ljb_j * grid->nprow; k < (ljb_j+1) * grid->nprow; k++) 
+	for (k = ljb_j * grid->nprow; k < (ljb_j+1) * grid->nprow; k++)
 	  (*bsendx_plist)[k] = EMPTY;
       }
     }
   }
 
 		/////////////////////////////////////////////////////////////////
-		
+
 		/* Set up additional pointers for the index and value arrays of U.
 		   nub is the number of local block columns. */
 		nub = CEILING( nsupers, grid->npcol); /* Number of local block columns. */
@@ -2072,7 +2072,7 @@ double *dense, *dense_col; /* SPA */
 			ABORT("Malloc fails for Ucb_valptr[]");
 		nlb = CEILING( nsupers, grid->nprow ); /* Number of local block rows. */
 
-		/* Count number of row blocks in a block column. 
+		/* Count number of row blocks in a block column.
 		   One pass of the skeleton graph of U. */
 		for (lk = 0; lk < nlb; ++lk) {
 			usub1 = Ufstnz_br_ptr[lk];
@@ -2111,21 +2111,21 @@ double *dense, *dense_col; /* SPA */
 
 					Ucb_indptr[ljb][Urbs1[ljb]].indpos = i;
 					Ucb_valptr[ljb][Urbs1[ljb]] = j;
-					
+
 					++Urbs1[ljb];
 					j += usub1[i+1];
 					i += UB_DESCRIPTOR + SuperSize( k );
 				}
 			}
-		}			
-		
-		
+		}
+
 
-/* Count the nnzs per block column */	
+
+/* Count the nnzs per block column */
 	for (lb = 0; lb < nub; ++lb) {
 		Unnz[lb] = 0;
 		k = lb * grid->npcol + mycol;/* Global block number, column-wise. */
-		knsupc = SuperSize( k );	
+		knsupc = SuperSize( k );
 		for (ub = 0; ub < Urbs[lb]; ++ub) {
 			ik = Ucb_indptr[lb][ub].lbnum; /* Local block number, row-wise. */
 			i = Ucb_indptr[lb][ub].indpos; /* Start of the block in usub[]. */
@@ -2139,47 +2139,47 @@ double *dense, *dense_col; /* SPA */
 				}
 			} /* for jj ... */
 		}
-	}						
-		
+	}
+
 		/////////////////////////////////////////////////////////////////
 
 		// if(LSUM<nsupers)ABORT("Need increase LSUM."); /* temporary*/
 
 #if ( PROFlevel>=1 )
 			t = SuperLU_timer_();
-#endif				
+#endif
 		/* construct the Bcast tree for L ... */
 
 		k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
 		if ( !(LBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) )
 			ABORT("Malloc fails for LBtree_ptr[].");
 		if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) )
-			ABORT("Calloc fails for ActiveFlag[].");	
+			ABORT("Calloc fails for ActiveFlag[].");
 		if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) )
-			ABORT("Malloc fails for ranks[].");	
+			ABORT("Malloc fails for ranks[].");
 		if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) )
-			ABORT("Malloc fails for SeedSTD_BC[].");	
+			ABORT("Malloc fails for SeedSTD_BC[].");
 
 		for (i=0;i<k;i++){
-			SeedSTD_BC[i]=rand();		
+			SeedSTD_BC[i]=rand();
 		}
 
-		MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_BC[0],k,MPI_DOUBLE,MPI_MAX,grid->cscp.comm);					  
+		MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_BC[0],k,MPI_DOUBLE,MPI_MAX,grid->cscp.comm);
 
 		for (ljb = 0; ljb <k ; ++ljb) {
 			LBtree_ptr[ljb]=NULL;
-		}			
-		
+		}
+
 
 		if ( !(ActiveFlagAll = intMalloc_dist(grid->nprow*k)) )
-			ABORT("Calloc fails for ActiveFlag[].");				
-		for (j=0;j<grid->nprow*k;++j)ActiveFlagAll[j]=3*nsupers;	
-		memTRS += k*sizeof(BcTree) + k*dword + grid->nprow*k*iword;  //acount for LBtree_ptr, SeedSTD_BC, ActiveFlagAll		
+			ABORT("Malloc fails for ActiveFlagAll[].");
+		for (j=0;j<grid->nprow*k;++j)ActiveFlagAll[j]=3*nsupers;
+		memTRS += k*sizeof(BcTree) + k*dword + grid->nprow*k*iword;  //account for LBtree_ptr, SeedSTD_BC, ActiveFlagAll
 		for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */
 			jb = mycol+ljb*grid->npcol;  /* not sure */
 			if(jb<nsupers){
 			pc = PCOL( jb, grid );
-			
+
 			istart = xlsub[ljb];
 			for (i = istart; i < xlsub[ljb+1]; ++i) {
 				irow = lsub[i];
@@ -2188,15 +2188,15 @@ double *dense, *dense_col; /* SPA */
 				ActiveFlagAll[pr+ljb*grid->nprow]=SUPERLU_MIN(ActiveFlagAll[pr+ljb*grid->nprow],gb);
 			} /* for j ... */
 			}
-		}			
+		}
+
+
+		MPI_Allreduce(MPI_IN_PLACE,ActiveFlagAll,grid->nprow*k,mpi_int_t,MPI_MIN,grid->cscp.comm);
+
+
 
-		
-		MPI_Allreduce(MPI_IN_PLACE,ActiveFlagAll,grid->nprow*k,mpi_int_t,MPI_MIN,grid->cscp.comm);					  
-		
-		
-		
 		for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */
-			
+
 			jb = mycol+ljb*grid->npcol;  /* not sure */
 			if(jb<nsupers){
 			pc = PCOL( jb, grid );
@@ -2205,19 +2205,19 @@ double *dense, *dense_col; /* SPA */
 			for (j=0;j<grid->nprow;++j)ActiveFlag[j+grid->nprow]=j;
 			for (j=0;j<grid->nprow;++j)ranks[j]=-1;
 
-			Root=-1; 
-			Iactive = 0;				
+			Root=-1;
+			Iactive = 0;
 			for (j=0;j<grid->nprow;++j){
 				if(ActiveFlag[j]!=3*nsupers){
 				gb = ActiveFlag[j];
 				pr = PROW( gb, grid );
 				if(gb==jb)Root=pr;
-				if(myrow==pr)Iactive=1;		
-				}					
+				if(myrow==pr)Iactive=1;
+				}
 			}
-			
 
-			quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,0,2);	
+
+			quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,0,2);
 
 			if(Iactive==1){
 				// printf("jb %5d damn\n",jb);
@@ -2230,7 +2230,7 @@ double *dense, *dense_col; /* SPA */
 						ranks[rank_cnt]=ActiveFlag[j+grid->nprow];
 						++rank_cnt;
 					}
-				}		
+				}
 
 				if(rank_cnt>1){
 
@@ -2240,7 +2240,7 @@ double *dense, *dense_col; /* SPA */
 					// rseed=rand();
 					// rseed=1.0;
 					msgsize = SuperSize( jb );
-					LBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'d');  	
+					LBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'d');
 					BcTree_SetTag(LBtree_ptr[ljb],BC_L,'d');
 
 					// printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt);
@@ -2251,15 +2251,15 @@ double *dense, *dense_col; /* SPA */
 					// fflush(stdout);
 					// }
 
-					// #if ( PRNTlevel>=1 )		
+					// #if ( PRNTlevel>=1 )
 					if(Root==myrow){
 						rank_cnt_ref=1;
 						for (j = 0; j < grid->nprow; ++j) {
-							if ( fsendx_plist[ljb][j] != EMPTY ) {	
-								++rank_cnt_ref;		
+							if ( fsendx_plist[ljb][j] != EMPTY ) {
+								++rank_cnt_ref;
 							}
 						}
-						assert(rank_cnt==rank_cnt_ref);		
+						assert(rank_cnt==rank_cnt_ref);
 
 						// printf("Partial Bcast Procs: col%7d np%4d\n",jb,rank_cnt);
 
@@ -2268,27 +2268,27 @@ double *dense, *dense_col; /* SPA */
 						// // printf("\n");
 					}
 					// #endif
-				}	
+				}
 			}
 			}
 		}
 
-		
+
 		SUPERLU_FREE(ActiveFlag);
 		SUPERLU_FREE(ActiveFlagAll);
 		SUPERLU_FREE(ranks);
 		SUPERLU_FREE(SeedSTD_BC);
-		memTRS -= k*dword + grid->nprow*k*iword;  //acount for SeedSTD_BC, ActiveFlagAll		
-		
+		memTRS -= k*dword + grid->nprow*k*iword;  //account for SeedSTD_BC, ActiveFlagAll
+
 #if ( PROFlevel>=1 )
 	t = SuperLU_timer_() - t;
 	if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t);
-#endif			
+#endif
 
 
 #if ( PROFlevel>=1 )
 			t = SuperLU_timer_();
-#endif			
+#endif
 		/* construct the Reduce tree for L ... */
 		/* the following is used as reference */
 		nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */
@@ -2317,37 +2317,37 @@ double *dense, *dense_col; /* SPA */
 		if ( !(LRtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) )
 			ABORT("Malloc fails for LRtree_ptr[].");
 		if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) )
-			ABORT("Calloc fails for ActiveFlag[].");	
+			ABORT("Calloc fails for ActiveFlag[].");
 		if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) )
-			ABORT("Malloc fails for ranks[].");	
+			ABORT("Malloc fails for ranks[].");
 
 		// if ( !(idxs = intCalloc_dist(nsupers)) )
-			// ABORT("Calloc fails for idxs[].");	
+			// ABORT("Calloc fails for idxs[].");
 
 		// if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) )
 			// ABORT("Malloc fails for nzrows[].");
 
 		if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) )
-			ABORT("Malloc fails for SeedSTD_RD[].");	
+			ABORT("Malloc fails for SeedSTD_RD[].");
 
 		for (i=0;i<k;i++){
-			SeedSTD_RD[i]=rand();		
+			SeedSTD_RD[i]=rand();
 		}
 
-		MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_RD[0],k,MPI_DOUBLE,MPI_MAX,grid->rscp.comm);					  
+		MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_RD[0],k,MPI_DOUBLE,MPI_MAX,grid->rscp.comm);
 
 
 		for (lib = 0; lib <k ; ++lib) {
 			LRtree_ptr[lib]=NULL;
 		}
 
-		
+
 		if ( !(ActiveFlagAll = intMalloc_dist(grid->npcol*k)) )
-			ABORT("Calloc fails for ActiveFlagAll[].");				
-		for (j=0;j<grid->npcol*k;++j)ActiveFlagAll[j]=-3*nsupers;	
-		memTRS += k*sizeof(RdTree) + k*dword + grid->npcol*k*iword;  //acount for LRtree_ptr, SeedSTD_RD, ActiveFlagAll					
-			
-			
+			ABORT("Calloc fails for ActiveFlagAll[].");
+		for (j=0;j<grid->npcol*k;++j)ActiveFlagAll[j]=-3*nsupers;
+		memTRS += k*sizeof(RdTree) + k*dword + grid->npcol*k*iword;  //account for LRtree_ptr, SeedSTD_RD, ActiveFlagAll
+
+
 		for (ljb = 0; ljb < CEILING( nsupers, grid->npcol); ++ljb) { /* for each local block column ... */
 			jb = mycol+ljb*grid->npcol;  /* not sure */
 			if(jb<nsupers){
@@ -2365,7 +2365,7 @@ double *dense, *dense_col; /* SPA */
 		}
 
 		MPI_Allreduce(MPI_IN_PLACE,ActiveFlagAll,grid->npcol*k,mpi_int_t,MPI_MAX,grid->rscp.comm);
-		
+
 		for (lib=0;lib<k;++lib){
 			ib = myrow+lib*grid->nprow;  /* not sure */
 			if(ib<nsupers){
@@ -2373,19 +2373,19 @@ double *dense, *dense_col; /* SPA */
 				for (j=0;j<grid->npcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];;
 				for (j=0;j<grid->npcol;++j)ActiveFlag[j+grid->npcol]=j;
 				for (j=0;j<grid->npcol;++j)ranks[j]=-1;
-				Root=-1; 
-				Iactive = 0;				
+				Root=-1;
+				Iactive = 0;
 
 				for (j=0;j<grid->npcol;++j){
 					if(ActiveFlag[j]!=-3*nsupers){
 					jb = ActiveFlag[j];
 					pc = PCOL( jb, grid );
 					if(jb==ib)Root=pc;
-					if(mycol==pc)Iactive=1;		
-					}					
+					if(mycol==pc)Iactive=1;
+					}
 				}
-			
-			
+
+
 				quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,1,2);
 
 				if(Iactive==1){
@@ -2401,7 +2401,7 @@ double *dense, *dense_col; /* SPA */
 					if(rank_cnt>1){
 
 						for (ii=0;ii<rank_cnt;ii++)   // use global ranks rather than local ranks
-							ranks[ii] = PNUM( pr, ranks[ii], grid );		
+							ranks[ii] = PNUM( pr, ranks[ii], grid );
 
 						// rseed=rand();
 						// rseed=1.0;
@@ -2409,7 +2409,7 @@ double *dense, *dense_col; /* SPA */
 
 						// if(ib==0){
 
-						LRtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'d');  	
+						LRtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'d');
 						RdTree_SetTag(LRtree_ptr[lib], RD_L,'d');
 						// }
 
@@ -2425,10 +2425,10 @@ double *dense, *dense_col; /* SPA */
 						// // for(j=0;j<rank_cnt;++j)printf("%4d",ranks[j]);
 						// printf("\n");
 						}
-						#endif		
+						#endif
 					}
-				}				
-			}	
+				}
+			}
 		}
 
 		SUPERLU_FREE(mod_bit);
@@ -2437,9 +2437,9 @@ double *dense, *dense_col; /* SPA */
 
 		SUPERLU_FREE(ActiveFlag);
 		SUPERLU_FREE(ActiveFlagAll);
-		SUPERLU_FREE(ranks);	
-		// SUPERLU_FREE(idxs);	 
-		SUPERLU_FREE(SeedSTD_RD);	
+		SUPERLU_FREE(ranks);
+		// SUPERLU_FREE(idxs);
+		SUPERLU_FREE(SeedSTD_RD);
 		// for(i=0;i<nsupers;++i){
 			// if(nzrows[i])SUPERLU_FREE(nzrows[i]);
 		// }
@@ -2450,11 +2450,11 @@ double *dense, *dense_col; /* SPA */
 #if ( PROFlevel>=1 )
 	t = SuperLU_timer_() - t;
 	if ( !iam) printf(".. Construct Reduce tree for L: %.2f\t\n", t);
-#endif					
+#endif
 
 #if ( PROFlevel>=1 )
 		t = SuperLU_timer_();
-#endif	
+#endif
 
 		/* construct the Bcast tree for U ... */
 
@@ -2462,35 +2462,35 @@ double *dense, *dense_col; /* SPA */
 		if ( !(UBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) )
 			ABORT("Malloc fails for UBtree_ptr[].");
 		if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) )
-			ABORT("Calloc fails for ActiveFlag[].");	
+			ABORT("Calloc fails for ActiveFlag[].");
 		if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) )
-			ABORT("Malloc fails for ranks[].");	
+			ABORT("Malloc fails for ranks[].");
 		if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) )
-			ABORT("Malloc fails for SeedSTD_BC[].");	
+			ABORT("Malloc fails for SeedSTD_BC[].");
 
 		for (i=0;i<k;i++){
-			SeedSTD_BC[i]=rand();		
+			SeedSTD_BC[i]=rand();
 		}
 
-		MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_BC[0],k,MPI_DOUBLE,MPI_MAX,grid->cscp.comm);					  
+		MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_BC[0],k,MPI_DOUBLE,MPI_MAX,grid->cscp.comm);
 
 
 		for (ljb = 0; ljb <k ; ++ljb) {
 			UBtree_ptr[ljb]=NULL;
-		}	
+		}
 
 		if ( !(ActiveFlagAll = intMalloc_dist(grid->nprow*k)) )
-			ABORT("Calloc fails for ActiveFlagAll[].");				
-		for (j=0;j<grid->nprow*k;++j)ActiveFlagAll[j]=-3*nsupers;	
-		memTRS += k*sizeof(BcTree) + k*dword + grid->nprow*k*iword;  //acount for UBtree_ptr, SeedSTD_BC, ActiveFlagAll				
-		
-		
+			ABORT("Calloc fails for ActiveFlagAll[].");
+		for (j=0;j<grid->nprow*k;++j)ActiveFlagAll[j]=-3*nsupers;
+		memTRS += k*sizeof(BcTree) + k*dword + grid->nprow*k*iword;  //account for UBtree_ptr, SeedSTD_BC, ActiveFlagAll
+
+
 		for (lib = 0; lib < CEILING( nsupers, grid->nprow); ++lib) { /* for each local block row ... */
 			ib = myrow+lib*grid->nprow;  /* not sure */
-			
+
 		// if(ib==0)printf("iam %5d ib %5d\n",iam,ib);
-		// fflush(stdout);				
-			
+		// fflush(stdout);
+
 			if(ib<nsupers){
 				for (i = xusub[lib]; i < xusub[lib+1]; i++) {
 				  jcol = usub[i];
@@ -2498,26 +2498,26 @@ double *dense, *dense_col; /* SPA */
 				  ljb = LBj( jb, grid );    /* local block number */
 				  pc = PCOL( jb, grid );
 				  pr = PROW( ib, grid );
-				  if ( mycol == pc ) { /* Block column ib in my process column */		
-					ActiveFlagAll[pr+ljb*grid->nprow]=SUPERLU_MAX(ActiveFlagAll[pr+ljb*grid->nprow],ib);			  
+				  if ( mycol == pc ) { /* Block column ib in my process column */
+					ActiveFlagAll[pr+ljb*grid->nprow]=SUPERLU_MAX(ActiveFlagAll[pr+ljb*grid->nprow],ib);
 				  }
 				}  /* for i ... */
 				pr = PROW( ib, grid ); // take care of diagonal node stored as L
 				pc = PCOL( ib, grid );
-				if ( mycol == pc ) { /* Block column ib in my process column */					
+				if ( mycol == pc ) { /* Block column ib in my process column */
 					ljb = LBj( ib, grid );    /* local block number */
-					ActiveFlagAll[pr+ljb*grid->nprow]=SUPERLU_MAX(ActiveFlagAll[pr+ljb*grid->nprow],ib);					
+					ActiveFlagAll[pr+ljb*grid->nprow]=SUPERLU_MAX(ActiveFlagAll[pr+ljb*grid->nprow],ib);
 					// if(pr+ljb*grid->nprow==0)printf("iam %5d ib %5d ActiveFlagAll %5d pr %5d ljb %5d\n",iam,ib,ActiveFlagAll[pr+ljb*grid->nprow],pr,ljb);
-					// fflush(stdout);	
-				}					
-			}	
+					// fflush(stdout);
+				}
+			}
 		}
-		
+
 		// printf("iam %5d ActiveFlagAll %5d\n",iam,ActiveFlagAll[0]);
 		// fflush(stdout);
-		
-		MPI_Allreduce(MPI_IN_PLACE,ActiveFlagAll,grid->nprow*k,mpi_int_t,MPI_MAX,grid->cscp.comm);					  
-					
+
+		MPI_Allreduce(MPI_IN_PLACE,ActiveFlagAll,grid->nprow*k,mpi_int_t,MPI_MAX,grid->cscp.comm);
+
 		for (ljb = 0; ljb < k; ++ljb) { /* for each block column ... */
 			jb = mycol+ljb*grid->npcol;  /* not sure */
 			if(jb<nsupers){
@@ -2528,18 +2528,18 @@ double *dense, *dense_col; /* SPA */
 			for (j=0;j<grid->nprow;++j)ActiveFlag[j+grid->nprow]=j;
 			for (j=0;j<grid->nprow;++j)ranks[j]=-1;
 
-			Root=-1; 
-			Iactive = 0;				
+			Root=-1;
+			Iactive = 0;
 			for (j=0;j<grid->nprow;++j){
 				if(ActiveFlag[j]!=-3*nsupers){
 				gb = ActiveFlag[j];
 				pr = PROW( gb, grid );
 				if(gb==jb)Root=pr;
-				if(myrow==pr)Iactive=1;		
+				if(myrow==pr)Iactive=1;
 				}
-			}						
-			
-			quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,1,2);	
+			}
+
+			quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,1,2);
 		// printf("jb: %5d Iactive %5d\n",jb,Iactive);
 		// fflush(stdout);
 			if(Iactive==1){
@@ -2553,7 +2553,7 @@ double *dense, *dense_col; /* SPA */
 						ranks[rank_cnt]=ActiveFlag[j+grid->nprow];
 						++rank_cnt;
 					}
-				}		
+				}
 		// printf("jb: %5d rank_cnt %5d\n",jb,rank_cnt);
 		// fflush(stdout);
 				if(rank_cnt>1){
@@ -2563,43 +2563,43 @@ double *dense, *dense_col; /* SPA */
 					// rseed=rand();
 					// rseed=1.0;
 					msgsize = SuperSize( jb );
-					UBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'d');  	
+					UBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'d');
 					BcTree_SetTag(UBtree_ptr[ljb],BC_U,'d');
 
 					// printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt);
 					// fflush(stdout);
-					
+
 					if(Root==myrow){
 					rank_cnt_ref=1;
 					for (j = 0; j < grid->nprow; ++j) {
 						// printf("ljb %5d j %5d nprow %5d\n",ljb,j,grid->nprow);
 						// fflush(stdout);
-						if ( bsendx_plist[ljb][j] != EMPTY ) {	
-							++rank_cnt_ref;		
+						if ( bsendx_plist[ljb][j] != EMPTY ) {
+							++rank_cnt_ref;
 						}
 					}
 					// printf("ljb %5d rank_cnt %5d rank_cnt_ref %5d\n",ljb,rank_cnt,rank_cnt_ref);
-					// fflush(stdout);								
-					assert(rank_cnt==rank_cnt_ref);		
-					}						
+					// fflush(stdout);
+					assert(rank_cnt==rank_cnt_ref);
+					}
 				}
 			}
 			}
-		}	
+		}
 		SUPERLU_FREE(ActiveFlag);
 		SUPERLU_FREE(ActiveFlagAll);
-		SUPERLU_FREE(ranks);				
-		SUPERLU_FREE(SeedSTD_BC);				
-		memTRS -= k*dword + grid->nprow*k*iword;  //acount for SeedSTD_BC, ActiveFlagAll		
-			
+		SUPERLU_FREE(ranks);
+		SUPERLU_FREE(SeedSTD_BC);
+		memTRS -= k*dword + grid->nprow*k*iword;  //account for SeedSTD_BC, ActiveFlagAll
+
 #if ( PROFlevel>=1 )
 	t = SuperLU_timer_() - t;
 	if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t);
-#endif					
+#endif
 
 #if ( PROFlevel>=1 )
 			t = SuperLU_timer_();
-#endif					
+#endif
 		/* construct the Reduce tree for U ... */
 		/* the following is used as reference */
 		nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */
@@ -2628,35 +2628,35 @@ double *dense, *dense_col; /* SPA */
 		if ( !(URtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) )
 			ABORT("Malloc fails for URtree_ptr[].");
 		if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) )
-			ABORT("Calloc fails for ActiveFlag[].");	
+			ABORT("Calloc fails for ActiveFlag[].");
 		if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) )
-			ABORT("Malloc fails for ranks[].");	
+			ABORT("Malloc fails for ranks[].");
 
 		// if ( !(idxs = intCalloc_dist(nsupers)) )
-			// ABORT("Calloc fails for idxs[].");	
+			// ABORT("Calloc fails for idxs[].");
 
 		// if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) )
 			// ABORT("Malloc fails for nzrows[].");
 
 		if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) )
-			ABORT("Malloc fails for SeedSTD_RD[].");	
+			ABORT("Malloc fails for SeedSTD_RD[].");
 
 		for (i=0;i<k;i++){
-			SeedSTD_RD[i]=rand();		
+			SeedSTD_RD[i]=rand();
 		}
 
-		MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_RD[0],k,MPI_DOUBLE,MPI_MAX,grid->rscp.comm);					  
+		MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_RD[0],k,MPI_DOUBLE,MPI_MAX,grid->rscp.comm);
 
 		for (lib = 0; lib <k ; ++lib) {
 			URtree_ptr[lib]=NULL;
 		}
 
-		
+
 		if ( !(ActiveFlagAll = intMalloc_dist(grid->npcol*k)) )
-			ABORT("Calloc fails for ActiveFlagAll[].");				
-		for (j=0;j<grid->npcol*k;++j)ActiveFlagAll[j]=3*nsupers;	
-		memTRS += k*sizeof(RdTree) + k*dword + grid->npcol*k*iword;  //acount for URtree_ptr, SeedSTD_RD, ActiveFlagAll				
-				
+			ABORT("Calloc fails for ActiveFlagAll[].");
+		for (j=0;j<grid->npcol*k;++j)ActiveFlagAll[j]=3*nsupers;
+		memTRS += k*sizeof(RdTree) + k*dword + grid->npcol*k*iword;  //account for URtree_ptr, SeedSTD_RD, ActiveFlagAll
+
 		for (lib = 0; lib < CEILING( nsupers, grid->nprow); ++lib) { /* for each local block row ... */
 			ib = myrow+lib*grid->nprow;  /* not sure */
 			if(ib<nsupers){
@@ -2664,19 +2664,19 @@ double *dense, *dense_col; /* SPA */
 				  jcol = usub[i];
 				  jb = BlockNum( jcol );
 				  pc = PCOL( jb, grid );
-				  if ( mycol == pc ) { /* Block column ib in my process column */	
-					ActiveFlagAll[pc+lib*grid->npcol]=SUPERLU_MIN(ActiveFlagAll[pc+lib*grid->npcol],jb);			  
-				  }	
+				  if ( mycol == pc ) { /* Block column ib in my process column */
+					ActiveFlagAll[pc+lib*grid->npcol]=SUPERLU_MIN(ActiveFlagAll[pc+lib*grid->npcol],jb);
+				  }
 				}  /* for i ... */
 				pc = PCOL( ib, grid );
-				if ( mycol == pc ) { /* Block column ib in my process column */						
+				if ( mycol == pc ) { /* Block column ib in my process column */
 					ActiveFlagAll[pc+lib*grid->npcol]=SUPERLU_MIN(ActiveFlagAll[pc+lib*grid->npcol],ib);
-				}						
-			}	
+				}
+			}
 		}
-		
-		MPI_Allreduce(MPI_IN_PLACE,ActiveFlagAll,grid->npcol*k,mpi_int_t,MPI_MIN,grid->rscp.comm);	
-		
+
+		MPI_Allreduce(MPI_IN_PLACE,ActiveFlagAll,grid->npcol*k,mpi_int_t,MPI_MIN,grid->rscp.comm);
+
 		for (lib=0;lib<k;++lib){
 			ib = myrow+lib*grid->nprow;  /* not sure */
 			if(ib<nsupers){
@@ -2684,18 +2684,18 @@ double *dense, *dense_col; /* SPA */
 				for (j=0;j<grid->npcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];;
 				for (j=0;j<grid->npcol;++j)ActiveFlag[j+grid->npcol]=j;
 				for (j=0;j<grid->npcol;++j)ranks[j]=-1;
-				Root=-1; 
-				Iactive = 0;				
+				Root=-1;
+				Iactive = 0;
 
 				for (j=0;j<grid->npcol;++j){
 					if(ActiveFlag[j]!=3*nsupers){
 					jb = ActiveFlag[j];
 					pc = PCOL( jb, grid );
 					if(jb==ib)Root=pc;
-					if(mycol==pc)Iactive=1;		
-					}					
+					if(mycol==pc)Iactive=1;
+					}
 				}
-				
+
 				quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,0,2);
 
 				if(Iactive==1){
@@ -2711,7 +2711,7 @@ double *dense, *dense_col; /* SPA */
 					if(rank_cnt>1){
 
 						for (ii=0;ii<rank_cnt;ii++)   // use global ranks rather than local ranks
-							ranks[ii] = PNUM( pr, ranks[ii], grid );		
+							ranks[ii] = PNUM( pr, ranks[ii], grid );
 
 						// rseed=rand();
 						// rseed=1.0;
@@ -2719,7 +2719,7 @@ double *dense, *dense_col; /* SPA */
 
 						// if(ib==0){
 
-						URtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'d');  	
+						URtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'d');
 						RdTree_SetTag(URtree_ptr[lib], RD_U,'d');
 						// }
 
@@ -2733,10 +2733,10 @@ double *dense, *dense_col; /* SPA */
 						// // for(j=0;j<rank_cnt;++j)printf("%4d",ranks[j]);
 						// printf("\n");
 						}
-						// #endif		
+						// #endif
 					}
 				}
-			}						
+			}
 		}
 
 		SUPERLU_FREE(mod_bit);
@@ -2745,44 +2745,44 @@ double *dense, *dense_col; /* SPA */
 
 		SUPERLU_FREE(ActiveFlag);
 		SUPERLU_FREE(ActiveFlagAll);
-		SUPERLU_FREE(ranks);	
-		// SUPERLU_FREE(idxs);	
-		SUPERLU_FREE(SeedSTD_RD);	
+		SUPERLU_FREE(ranks);
+		// SUPERLU_FREE(idxs);
+		SUPERLU_FREE(SeedSTD_RD);
 		// for(i=0;i<nsupers;++i){
 			// if(nzrows[i])SUPERLU_FREE(nzrows[i]);
 		// }
-		// SUPERLU_FREE(nzrows);				
-		memTRS -= k*dword + grid->nprow*k*iword;  //acount for SeedSTD_RD, ActiveFlagAll		
-			
+		// SUPERLU_FREE(nzrows);
+		memTRS -= k*dword + grid->nprow*k*iword;  //account for SeedSTD_RD, ActiveFlagAll
+
 #if ( PROFlevel>=1 )
 	t = SuperLU_timer_() - t;
 	if ( !iam) printf(".. Construct Reduce tree for U: %.2f\t\n", t);
-#endif						
-			
+#endif
+
 	////////////////////////////////////////////////////////
- 
+
   /* Free the memory used for storing L and U */
   SUPERLU_FREE(xlsub); SUPERLU_FREE(xusub);
   if (lsub != NULL)
-    SUPERLU_FREE(lsub);  
+    SUPERLU_FREE(lsub);
   if (usub != NULL)
-    SUPERLU_FREE(usub);  
-  
-  
+    SUPERLU_FREE(usub);
+
+
   SUPERLU_FREE(nnzToRecv);
   SUPERLU_FREE(ptrToRecv);
   SUPERLU_FREE(nnzToSend);
   SUPERLU_FREE(ptrToSend);
   SUPERLU_FREE(recvBuf);
-  
+
   Llu->Lrowind_bc_ptr = Lrowind_bc_ptr;
-  Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr;  
+  Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr;
   Llu->Lnzval_bc_ptr = Lnzval_bc_ptr;
   Llu->Linv_bc_ptr = Linv_bc_ptr;
-  Llu->Uinv_bc_ptr = Uinv_bc_ptr;  
+  Llu->Uinv_bc_ptr = Uinv_bc_ptr;
   Llu->Ufstnz_br_ptr = Ufstnz_br_ptr;
   Llu->Unzval_br_ptr = Unzval_br_ptr;
-  Llu->Unnz = Unnz;  
+  Llu->Unnz = Unnz;
   Llu->ToRecv = ToRecv;
   Llu->ToSendD = ToSendD;
   Llu->ToSendR = ToSendR;
@@ -2801,23 +2801,23 @@ double *dense, *dense_col; /* SPA */
   Llu->LBtree_ptr = LBtree_ptr;
   Llu->URtree_ptr = URtree_ptr;
   Llu->UBtree_ptr = UBtree_ptr;
-  Llu->Urbs = Urbs; 
-  Llu->Ucb_indptr = Ucb_indptr; 
-  Llu->Ucb_valptr = Ucb_valptr; 
-  
+  Llu->Urbs = Urbs;
+  Llu->Ucb_indptr = Ucb_indptr;
+  Llu->Ucb_valptr = Ucb_valptr;
+
 #if ( PRNTlevel>=1 )
   if ( !iam ) printf(".. # L blocks " IFMT "\t# U blocks " IFMT "\n",
 		     nLblocks, nUblocks);
 #endif
-  
+
   k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
   if ( !(Llu->mod_bit = intMalloc_dist(k)) )
       ABORT("Malloc fails for mod_bit[].");
 
   /* Find the maximum buffer size. */
-  MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, 
+  MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t,
 		MPI_MAX, grid->comm);
-  
+
 #if ( DEBUGlevel>=1 )
   /* Memory allocated but not freed:
      ilsum, fmod, fsendx_plist, bmod, bsendx_plist,
@@ -2825,7 +2825,7 @@ double *dense, *dense_col; /* SPA */
   */
   CHECK_MALLOC(iam, "Exit dist_psymbtonum()");
 #endif
-    
+
   return (- (memDist+memNLU));
 } /* ddist_psymbtonum */
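
[Context, not part of the patch] The ddistribute.c hunks above are almost entirely trailing-whitespace cleanup in the code that builds the broadcast/reduce trees used by the triangular solves. The idiom those lines repeat is: each rank draws one rand() seed per local block, then an in-place MPI_MAX all-reduce over the row (grid->rscp.comm) or column (grid->cscp.comm) sub-communicator makes every participant agree on the seed before BcTree_Create()/RdTree_Create() is called. A minimal standalone sketch of that pattern follows; MPI_COMM_WORLD and NBLOCKS are illustrative stand-ins for the grid sub-communicator and k = CEILING(nsupers, nprow), not names from the patch.

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);

    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    enum { NBLOCKS = 4 };               /* stand-in for k = CEILING(nsupers, nprow) */
    double seed[NBLOCKS];

    srand((unsigned)rank + 1);          /* each rank draws its own candidate seeds */
    for (int i = 0; i < NBLOCKS; i++)
        seed[i] = (double)rand();

    /* In-place MAX reduction: afterwards every rank holds the same value per
       block, so the tree constructors receive identical seeds everywhere. */
    MPI_Allreduce(MPI_IN_PLACE, seed, NBLOCKS, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);

    if (rank == 0)
        for (int i = 0; i < NBLOCKS; i++)
            printf("block %d: agreed seed %.0f\n", i, seed[i]);

    MPI_Finalize();
    return 0;
}

Agreeing through a reduction rather than broadcasting from one designated rank keeps the exchange symmetric: it works even though every rank calls rand() independently, at the cost of one collective per tree family.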
 
diff -pruN 6.1.0+dfsg1-1/SRC/pdutil.c 6.1.1+dfsg1-1/SRC/pdutil.c
--- 6.1.0+dfsg1-1/SRC/pdutil.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/pdutil.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,16 +1,16 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
 
-/*! @file 
+/*! @file
  * \brief Several matrix utilities
  *
  * <pre>
@@ -43,11 +43,11 @@ int pdCompRow_loc_to_CompCol_global
     double *a_recv;  /* Buffer to receive the blocks of values. */
     double *a_buf;   /* Buffer to merge blocks into block columns. */
     int_t *itemp;
-    int_t *colptr_send; /* Buffer to redistribute the column pointers of the 
+    int_t *colptr_send; /* Buffer to redistribute the column pointers of the
 			   local block rows.
 			   Use n_loc+1 pointers for each block. */
     int_t *colptr_blk;  /* The column pointers for each block, after
-			   redistribution to the local block columns. 
+			   redistribution to the local block columns.
 			   Use n_loc+1 pointers for each block. */
     int_t *rowind_recv; /* Buffer to receive the blocks of row indices. */
     int_t *rowind_buf;  /* Buffer to merge blocks into block columns. */
@@ -152,8 +152,7 @@ int pdCompRow_loc_to_CompCol_global
     for (i = 0; i < procs-1; ++i) rdispls[i+1] = rdispls[i] + recvcnts[i];
 
     k = rdispls[procs-1] + recvcnts[procs-1]; /* Total received */
-    //    if ( !(rowind_recv = (int_t *) intMalloc_dist(2*k)) )
-    if ( !(rowind_recv = (int_t *) intCalloc_dist(2*k)) )
+    if ( !(rowind_recv = (int_t *) intMalloc_dist(2*k)) )
         ABORT("Malloc fails for rowind_recv[]");
     rowind_buf = rowind_recv + k;
     MPI_Alltoallv(rowind_loc, sendcnts, sdispls, mpi_int_t,
@@ -166,7 +165,7 @@ int pdCompRow_loc_to_CompCol_global
                       a_recv, recvcnts, rdispls, MPI_DOUBLE,
                       grid->comm);
     }
-      
+
     /* Reset colptr_loc[] to point to the n_loc global columns. */
     colptr_loc[0] = 0;
     itemp = colptr_send;
@@ -180,7 +179,7 @@ int pdCompRow_loc_to_CompCol_global
 	itemp[j] = colptr_loc[j]; /* Save a copy of the column starts */
     }
     itemp[n_loc] = colptr_loc[n_loc];
-      
+
     /* Merge blocks of row indices into columns of row indices. */
     for (i = 0; i < procs; ++i) {
         k = i * (n_loc + 1);
@@ -221,12 +220,12 @@ int pdCompRow_loc_to_CompCol_global
     MPI_Allgather(&nnz_loc, 1, mpi_int_t, itemp, 1, mpi_int_t, grid->comm);
     for (i = 0, nnz = 0; i < procs; ++i) nnz += itemp[i];
     GAstore->nnz = nnz;
-    
+
     if ( !(GAstore->rowind = (int_t *) intMalloc_dist (nnz)) )
         ABORT ("SUPERLU_MALLOC fails for GAstore->rowind[]");
     if ( !(GAstore->colptr = (int_t *) intMalloc_dist (n+1)) )
         ABORT ("SUPERLU_MALLOC fails for GAstore->colptr[]");
-      
+
     /* Allgatherv for row indices. */
     rdispls[0] = 0;
     for (i = 0; i < procs-1; ++i) {
@@ -235,12 +234,12 @@ int pdCompRow_loc_to_CompCol_global
     }
     itemp_32[procs-1] = itemp[procs-1];
     it = nnz_loc;
-    MPI_Allgatherv(rowind_buf, it, mpi_int_t, GAstore->rowind, 
+    MPI_Allgatherv(rowind_buf, it, mpi_int_t, GAstore->rowind,
 		   itemp_32, rdispls, mpi_int_t, grid->comm);
     if ( need_value ) {
       if ( !(GAstore->nzval = (double *) doubleMalloc_dist (nnz)) )
           ABORT ("SUPERLU_MALLOC fails for GAstore->rnzval[]");
-      MPI_Allgatherv(a_buf, it, MPI_DOUBLE, GAstore->nzval, 
+      MPI_Allgatherv(a_buf, it, MPI_DOUBLE, GAstore->nzval,
 		     itemp_32, rdispls, MPI_DOUBLE, grid->comm);
     } else GAstore->nzval = NULL;
 
@@ -251,7 +250,7 @@ int pdCompRow_loc_to_CompCol_global
         itemp_32[i] = n_locs[i];
     }
     itemp_32[procs-1] = n_locs[procs-1];
-    MPI_Allgatherv(colptr_loc, n_loc, mpi_int_t, GAstore->colptr, 
+    MPI_Allgatherv(colptr_loc, n_loc, mpi_int_t, GAstore->colptr,
 		   itemp_32, rdispls, mpi_int_t, grid->comm);
 
     /* Recompute column pointers. */
@@ -373,7 +372,7 @@ int pdPermute_Dense_Matrix
 	++ptr_to_ibuf[p];
 	ptr_to_dbuf[p] += nrhs;
     }
-	  
+
     /* Transfer the (permuted) row indices and numerical values. */
     MPI_Alltoallv(send_ibuf, sendcnts, sdispls, mpi_int_t,
 		  recv_ibuf, recvcnts, rdispls, mpi_int_t, grid->comm);
@@ -401,7 +400,7 @@ int pdPermute_Dense_Matrix
 
 /*! \brief Initialize the data structure for the solution phase.
  */
-int dSolveInit(superlu_dist_options_t *options, SuperMatrix *A, 
+int dSolveInit(superlu_dist_options_t *options, SuperMatrix *A,
 	       int_t perm_r[], int_t perm_c[], int_t nrhs,
 	       LUstruct_t *LUstruct, gridinfo_t *grid,
 	       SOLVEstruct_t *SOLVEstruct)
@@ -415,7 +414,7 @@ int dSolveInit(superlu_dist_options_t *o
     fst_row = Astore->fst_row;
     m_loc = Astore->m_loc;
     procs = grid->nprow * grid->npcol;
-    
+
     if ( !(row_to_proc = intMalloc_dist(A->nrow)) )
 	ABORT("Malloc fails for row_to_proc[]");
     SOLVEstruct->row_to_proc = row_to_proc;
@@ -427,9 +426,9 @@ int dSolveInit(superlu_dist_options_t *o
     /* ------------------------------------------------------------
        EVERY PROCESS NEEDS TO KNOW GLOBAL PARTITION.
        SET UP THE MAPPING BETWEEN ROWS AND PROCESSES.
-       
+
        NOTE: For those processes that do not own any row, it must
-             must be set so that fst_row == A->nrow. 
+             be set so that fst_row == A->nrow.
        ------------------------------------------------------------*/
     if ( !(itemp = intMalloc_dist(procs+1)) )
         ABORT("Malloc fails for itemp[]");
@@ -464,7 +463,7 @@ int dSolveInit(superlu_dist_options_t *o
 	    for (i = j ; i < k; ++i) row_to_proc[i] = p;
 	}
     }
-#endif    
+#endif
 
     get_diag_procs(A->ncol, LUstruct->Glu_persist, grid,
 		   &SOLVEstruct->num_diag_procs,
@@ -475,14 +474,14 @@ int dSolveInit(superlu_dist_options_t *o
     if ( !(SOLVEstruct->gstrs_comm = (pxgstrs_comm_t *)
 	   SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))) )
         ABORT("Malloc fails for gstrs_comm[]");
-    pxgstrs_init(A->ncol, m_loc, nrhs, fst_row, perm_r, perm_c, grid, 
+    pxgstrs_init(A->ncol, m_loc, nrhs, fst_row, perm_r, perm_c, grid,
 		 LUstruct->Glu_persist, SOLVEstruct);
 
     if ( !(SOLVEstruct->gsmv_comm = (pdgsmv_comm_t *)
            SUPERLU_MALLOC(sizeof(pdgsmv_comm_t))) )
         ABORT("Malloc fails for gsmv_comm[]");
     SOLVEstruct->A_colind_gsmv = NULL;
-    
+
     options->SolveInitialized = YES;
     return 0;
 } /* dSolveInit */
@@ -508,10 +507,10 @@ void dSolveFinalize(superlu_dist_options
     options->SolveInitialized = NO;
 } /* dSolveFinalize */
 
-/*! \brief Check the inf-norm of the error vector 
+/*! \brief Check the inf-norm of the error vector
  */
 void pdinf_norm_error(int iam, int_t n, int_t nrhs, double x[], int_t ldx,
-		      double xtrue[], int_t ldxtrue, gridinfo_t *grid) 
+		      double xtrue[], int_t ldxtrue, gridinfo_t *grid)
 {
     double err, xnorm, temperr, tempxnorm;
     double *x_work, *xtrue_work;
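
[Context, not part of the patch] The pdutil.c hunks above are whitespace cleanup plus a switch of rowind_recv from intCalloc_dist to intMalloc_dist. The first routine they pass through, pdCompRow_loc_to_CompCol_global(), is organised around the usual counts/displacements MPI_Allgatherv pattern (itemp_32 and rdispls supply the counts and displacements there): gather each rank's length, build displacements, then gather the variable-sized row-index and value chunks. A minimal standalone sketch of that pattern, with illustrative names and MPI_COMM_WORLD standing in for grid->comm:

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);

    int rank, procs;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &procs);

    int nnz_loc = rank + 1;                       /* pretend local contribution size */
    int *local  = malloc(nnz_loc * sizeof(int));
    for (int i = 0; i < nnz_loc; i++) local[i] = rank * 100 + i;

    /* Exchange the per-rank lengths first, then build displacements from them. */
    int *counts = malloc(procs * sizeof(int));
    int *displs = malloc(procs * sizeof(int));
    MPI_Allgather(&nnz_loc, 1, MPI_INT, counts, 1, MPI_INT, MPI_COMM_WORLD);
    displs[0] = 0;
    for (int p = 1; p < procs; p++) displs[p] = displs[p - 1] + counts[p - 1];
    int total = displs[procs - 1] + counts[procs - 1];

    /* Variable-length gather: every rank ends up with the full concatenation. */
    int *global = malloc(total * sizeof(int));
    MPI_Allgatherv(local, nnz_loc, MPI_INT,
                   global, counts, displs, MPI_INT, MPI_COMM_WORLD);

    if (rank == 0)
        printf("gathered %d entries from %d ranks\n", total, procs);

    free(local); free(counts); free(displs); free(global);
    MPI_Finalize();
    return 0;
}

In the real routine the same gather is performed twice with the same displacements, once for row indices (mpi_int_t) and once for values (MPI_DOUBLE).
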
diff -pruN 6.1.0+dfsg1-1/SRC/pzdistribute.c 6.1.1+dfsg1-1/SRC/pzdistribute.c
--- 6.1.0+dfsg1-1/SRC/pzdistribute.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/pzdistribute.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,15 +1,15 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
-/*! @file 
+/*! @file
  * \brief Re-distribute A on the 2D process mesh.
  * <pre>
  * -- Distributed SuperLU routine (version 2.3) --
@@ -19,7 +19,7 @@ at the top-level directory.
  */
 
 #include "superlu_zdefs.h"
-	  
+
 
 /*! \brief
  *
@@ -27,10 +27,10 @@ at the top-level directory.
  * Purpose
  * =======
  *   Re-distribute A on the 2D process mesh.
- * 
+ *
  * Arguments
  * =========
- * 
+ *
  * A      (input) SuperMatrix*
  *	  The distributed input matrix A of dimension (A->nrow, A->ncol).
  *        A may be overwritten by diag(R)*A*diag(C)*Pc^T.
@@ -42,7 +42,7 @@ at the top-level directory.
  *
  * Glu_freeable (input) *Glu_freeable_t
  *        The global structure describing the graph of L and U.
- * 
+ *
  * grid   (input) gridinfo_t*
  *        The 2D process mesh.
  *
@@ -80,7 +80,7 @@ zReDistribute_A(SuperMatrix *A, ScalePer
     int    iam, it, p, procs, iam_g;
     MPI_Request *send_req;
     MPI_Status  status;
-    
+
 
     /* ------------------------------------------------------------
        INITIALIZATION.
@@ -97,8 +97,8 @@ zReDistribute_A(SuperMatrix *A, ScalePer
     m_loc = Astore->m_loc;
     fst_row = Astore->fst_row;
     nnzToRecv = intCalloc_dist(2*procs);
-    nnzToSend = nnzToRecv + procs;	
-	
+    nnzToSend = nnzToRecv + procs;
+
     /* ------------------------------------------------------------
        COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS,
        THEN ALLOCATE SPACE.
@@ -111,7 +111,7 @@ zReDistribute_A(SuperMatrix *A, ScalePer
 	    gbi = BlockNum( irow );
 	    gbj = BlockNum( jcol );
 	    p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid );
-	    ++nnzToSend[p]; 
+	    ++nnzToSend[p];
 	}
     }
 
@@ -176,7 +176,7 @@ zReDistribute_A(SuperMatrix *A, ScalePer
 	  }
       }
     } /* if procs > 1 */
-      
+
     if ( !(*colptr = intCalloc_dist(n+1)) )
         ABORT("Malloc fails for *colptr[].");
 
@@ -199,7 +199,7 @@ zReDistribute_A(SuperMatrix *A, ScalePer
 	        ia_send[p][k] = irow;
 	        ia_send[p][k + nnzToSend[p]] = jcol;
 		aij_send[p][k] = nzval_a[j];
-		++ptr_to_send[p]; 
+		++ptr_to_send[p];
 	    } else {          /* local */
 	        ia[nnz_loc] = irow;
 	        ja[nnz_loc] = jcol;
@@ -221,14 +221,14 @@ zReDistribute_A(SuperMatrix *A, ScalePer
 		       p, iam, grid->comm, &send_req[p] );
 	    it = nnzToSend[p];
 	    MPI_Isend( aij_send[p], it, SuperLU_MPI_DOUBLE_COMPLEX,
-	               p, iam+procs, grid->comm, &send_req[procs+p] ); 
+	               p, iam+procs, grid->comm, &send_req[procs+p] );
 	}
     }
 
     for (p = 0; p < procs; ++p) {
         if ( p != iam ) {
 	    it = 2*nnzToRecv[p];
-	    MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status ); 
+	    MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status );
 	    it = nnzToRecv[p];
             MPI_Recv( dtemp, it, SuperLU_MPI_DOUBLE_COMPLEX, p, p+procs,
 		      grid->comm, &status );
@@ -239,7 +239,7 @@ zReDistribute_A(SuperMatrix *A, ScalePer
 	        ja[nnz_loc] = jcol;
 		aij[nnz_loc] = dtemp[i];
 		++nnz_loc;
-		++(*colptr)[jcol]; /* Count nonzeros in each column */ 
+		++(*colptr)[jcol]; /* Count nonzeros in each column */
 	    }
 	}
     }
@@ -291,7 +291,7 @@ zReDistribute_A(SuperMatrix *A, ScalePer
 	jsize = (*colptr)[j];
 	(*colptr)[j] = k;
     }
-    
+
     /* Copy the triplets into the column oriented storage */
     for (i = 0; i < nnz_loc; ++i) {
 	j = ja[i];
@@ -313,7 +313,7 @@ zReDistribute_A(SuperMatrix *A, ScalePer
 #if ( DEBUGlevel>=1 )
     CHECK_MALLOC(iam, "Exit zReDistribute_A()");
 #endif
- 
+
     return 0;
 } /* zReDistribute_A */
 
@@ -331,10 +331,10 @@ pzdistribute(fact_t fact, int_t n, Super
  * Purpose
  * =======
  *   Distribute the matrix onto the 2D process mesh.
- * 
+ *
  * Arguments
  * =========
- * 
+ *
  * fact (input) fact_t
  *        Specifies whether or not the L and U structures will be re-used.
  *        = SamePattern_SameRowPerm: L and U structures are input, and
@@ -355,7 +355,7 @@ pzdistribute(fact_t fact, int_t n, Super
  *
  * Glu_freeable (input) *Glu_freeable_t
  *        The global structure describing the graph of L and U.
- * 
+ *
  * LUstruct (input) LUstruct_t*
  *        Data structures for L and U factors.
  *
@@ -370,7 +370,7 @@ pzdistribute(fact_t fact, int_t n, Super
 {
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
     LocalLU_t *Llu = LUstruct->Llu;
-    int_t bnnz, fsupc, fsupc1, i, ii, irow, istart, j, ib, jb, jj, k, k1, 
+    int_t bnnz, fsupc, fsupc1, i, ii, irow, istart, j, ib, jb, jj, k, k1,
           len, len1, nsupc;
 	int_t lib;  /* local block row number */
 	int_t nlb;  /* local block rows*/
@@ -379,39 +379,39 @@ pzdistribute(fact_t fact, int_t n, Super
     int_t nrbu; /* number of U blocks in current block column */
     int_t gb;   /* global block number; 0 < gb <= nsuper */
     int_t lb;   /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */
-	int_t ub,gik,iklrow,fnz;    
+	int_t ub,gik,iklrow,fnz;
 	int iam, jbrow, kcol, krow, mycol, myrow, pc, pr;
     int_t mybufmax[NBUFFERS];
     NRformat_loc *Astore;
     doublecomplex *a;
     int_t *asub, *xa;
-    int_t *xa_begin, *xa_end;							 
+    int_t *xa_begin, *xa_end;
     int_t *xsup = Glu_persist->xsup;    /* supernode and column mapping */
-    int_t *supno = Glu_persist->supno;   
+    int_t *supno = Glu_persist->supno;
     int_t *lsub, *xlsub, *usub, *usub1, *xusub;
     int_t nsupers;
     int_t next_lind;      /* next available position in index[*] */
     int_t next_lval;      /* next available position in nzval[*] */
     int_t *index;         /* indices consist of headers and row subscripts */
-	int_t *index_srt;         /* indices consist of headers and row subscripts */    
+	int_t *index_srt;         /* indices consist of headers and row subscripts */
 	int   *index1;        /* temporary pointer to array of int */
     doublecomplex *lusup, *lusup_srt, *uval; /* nonzero values in L and U */
     doublecomplex **Lnzval_bc_ptr;  /* size ceil(NSUPERS/Pc) */
     int_t  **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */
-	int_t   **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc)                 */		    
-	int_t   *Unnz; /* size ceil(NSUPERS/Pc)                 */	
+	int_t   **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc)                 */
+	int_t   *Unnz; /* size ceil(NSUPERS/Pc)                 */
 	doublecomplex **Unzval_br_ptr;  /* size ceil(NSUPERS/Pr) */
     int_t  **Ufstnz_br_ptr;  /* size ceil(NSUPERS/Pr) */
 
 	BcTree  *LBtree_ptr;       /* size ceil(NSUPERS/Pc)                */
 	RdTree  *LRtree_ptr;		  /* size ceil(NSUPERS/Pr)                */
 	BcTree  *UBtree_ptr;       /* size ceil(NSUPERS/Pc)                */
-	RdTree  *URtree_ptr;		  /* size ceil(NSUPERS/Pr)                */	
+	RdTree  *URtree_ptr;		  /* size ceil(NSUPERS/Pr)                */
 	int msgsize;
 
     int_t  *Urbs,*Urbs1; /* Number of row blocks in each block column of U. */
     Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
-    int_t  **Ucb_valptr;      /* Vertical linked list pointing to Unzval[] */  		
+    int_t  **Ucb_valptr;      /* Vertical linked list pointing to Unzval[] */
     /*-- Counts to be used in factorization. --*/
     int  *ToRecv, *ToSendD, **ToSendR;
 
@@ -427,7 +427,7 @@ pzdistribute(fact_t fact, int_t n, Super
     int_t  **bsendx_plist; /* Column process list to send down Xk.   */
     int_t  nbrecvx = 0;    /* Number of Xk I will receive.           */
     int_t  nbsendx = 0;    /* Number of Xk I will send               */
-    int_t  *ilsum;         /* starting position of each supernode in 
+    int_t  *ilsum;         /* starting position of each supernode in
 			      the full array (local)                 */
 
     /*-- Auxiliary arrays; freed on return --*/
@@ -447,30 +447,30 @@ pzdistribute(fact_t fact, int_t n, Super
 	int_t *idxs;
 	int_t **nzrows;
 	double rseed;
-	int rank_cnt,rank_cnt_ref,Root;    
+	int rank_cnt,rank_cnt_ref,Root;
 	doublecomplex *dense, *dense_col; /* SPA */
     doublecomplex zero = {0.0, 0.0};
     int_t ldaspa;     /* LDA of SPA */
     int_t iword, dword;
     float mem_use = 0.0;
     float memTRS = 0.; /* memory allocated for storing the meta-data for triangular solve (positive number)*/
-	
+
     int_t *mod_bit;
     int_t *frecv, *brecv, *lloc;
     doublecomplex **Linv_bc_ptr;  /* size ceil(NSUPERS/Pc) */
     doublecomplex **Uinv_bc_ptr;  /* size ceil(NSUPERS/Pc) */
-    double *SeedSTD_BC,*SeedSTD_RD;				 
+    double *SeedSTD_BC,*SeedSTD_RD;
     int_t idx_indx,idx_lusup;
     int_t nbrow;
     int_t  ik, il, lk, rel, knsupc, idx_r;
     int_t  lptr1_tmp, idx_i, idx_v,m, uu;
     int_t nub;
-    int tag;	
-	
+    int tag;
+
 #if ( PRNTlevel>=1 )
     int_t nLblocks = 0, nUblocks = 0;
 #endif
-#if ( PROFlevel>=1 ) 
+#if ( PROFlevel>=1 )
     double t, t_u, t_l;
     int_t u_blks;
 #endif
@@ -485,7 +485,7 @@ pzdistribute(fact_t fact, int_t n, Super
 
 //#if ( PRNTlevel>=1 )
     iword = sizeof(int_t);
-    dword = sizeof(doublecomplex);					
+    dword = sizeof(doublecomplex);
 //#endif
 
 #if ( DEBUGlevel>=1 )
@@ -521,11 +521,11 @@ pzdistribute(fact_t fact, int_t n, Super
 	if ( !(Urb_indptr = intMalloc_dist(nrbu)) )
 	    ABORT("Malloc fails for Urb_indptr[].");
 	Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
-	Lindval_loc_bc_ptr = Llu->Lindval_loc_bc_ptr;											  
+	Lindval_loc_bc_ptr = Llu->Lindval_loc_bc_ptr;
 	Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
 	Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
 	Unzval_br_ptr = Llu->Unzval_br_ptr;
-	Unnz = Llu->Unnz;	
+	Unnz = Llu->Unnz;
 
 	mem_use += 2.0*nrbu*iword + ldaspa*sp_ienv_dist(3)*dword;
 
@@ -647,7 +647,7 @@ pzdistribute(fact_t fact, int_t n, Super
 	xlsub = Glu_freeable->xlsub;
 	usub = Glu_freeable->usub;    /* compressed U subscripts */
 	xusub = Glu_freeable->xusub;
-    
+
 	if ( !(ToRecv = (int *) SUPERLU_MALLOC(nsupers * sizeof(int))) )
 	    ABORT("Malloc fails for ToRecv[].");
 	for (i = 0; i < nsupers; ++i) ToRecv[i] = 0;
@@ -666,12 +666,12 @@ pzdistribute(fact_t fact, int_t n, Super
 	k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
 
 	/* Pointers to the beginning of each block row of U. */
-	if ( !(Unzval_br_ptr = 
+	if ( !(Unzval_br_ptr =
               (doublecomplex**)SUPERLU_MALLOC(k * sizeof(doublecomplex*))) )
 	    ABORT("Malloc fails for Unzval_br_ptr[].");
 	if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
 	    ABORT("Malloc fails for Ufstnz_br_ptr[].");
-	
+
 	if ( !(ToSendD = SUPERLU_MALLOC(k * sizeof(int))) )
 	    ABORT("Malloc fails for ToSendD[].");
 	for (i = 0; i < k; ++i) ToSendD[i] = NO;
@@ -704,7 +704,7 @@ pzdistribute(fact_t fact, int_t n, Super
 		ilsum[lb + 1] = ilsum[lb] + i;
 	    }
 	}
-	
+
 #if ( PROFlevel>=1 )
 	t = SuperLU_timer_();
 #endif
@@ -712,7 +712,7 @@ pzdistribute(fact_t fact, int_t n, Super
 	   COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U.
 	   THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U).
 	   ------------------------------------------------------------*/
-	
+
 	/* Loop through each supernode column. */
 	for (jb = 0; jb < nsupers; ++jb) {
 	    pc = PCOL( jb, grid );
@@ -749,7 +749,7 @@ pzdistribute(fact_t fact, int_t n, Super
 		} /* for i ... */
 	    } /* for j ... */
 	} /* for jb ... */
-	
+
 	/* Set up the initial pointers for each block row in U. */
 	nrbu = CEILING( nsupers, grid->nprow );/* Number of local block rows */
 	for (lb = 0; lb < nrbu; ++lb) {
@@ -813,34 +813,34 @@ pzdistribute(fact_t fact, int_t n, Super
 	k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
 
 	/* Pointers to the beginning of each block column of L. */
-	if ( !(Lnzval_bc_ptr = 
+	if ( !(Lnzval_bc_ptr =
               (doublecomplex**)SUPERLU_MALLOC(k * sizeof(doublecomplex*))) )
 	    ABORT("Malloc fails for Lnzval_bc_ptr[].");
 	if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
 	    ABORT("Malloc fails for Lrowind_bc_ptr[].");
 	Lrowind_bc_ptr[k-1] = NULL;
 
-	if ( !(Lindval_loc_bc_ptr = 
+	if ( !(Lindval_loc_bc_ptr =
 				(int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
 		ABORT("Malloc fails for Lindval_loc_bc_ptr[].");
 	Lindval_loc_bc_ptr[k-1] = NULL;
 
-	if ( !(Linv_bc_ptr = 
+	if ( !(Linv_bc_ptr =
 				(doublecomplex**)SUPERLU_MALLOC(k * sizeof(doublecomplex*))) ) {
 		fprintf(stderr, "Malloc fails for Linv_bc_ptr[].");
-	}  
-	if ( !(Uinv_bc_ptr = 
+	}
+	if ( !(Uinv_bc_ptr =
 				(doublecomplex**)SUPERLU_MALLOC(k * sizeof(doublecomplex*))) ) {
 		fprintf(stderr, "Malloc fails for Uinv_bc_ptr[].");
-	}  
+	}
 	Linv_bc_ptr[k-1] = NULL;
-	Uinv_bc_ptr[k-1] = NULL;	
-	
-	if ( !(Unnz = 
+	Uinv_bc_ptr[k-1] = NULL;
+
+	if ( !(Unnz =
 			(int_t*)SUPERLU_MALLOC(k * sizeof(int_t))) )
 	ABORT("Malloc fails for Unnz[].");
-		
-	
+
+
 	/* These lists of processes will be used for triangular solves. */
 	if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
 	    ABORT("Malloc fails for fsendx_plist[].");
@@ -860,7 +860,7 @@ pzdistribute(fact_t fact, int_t n, Super
 	/* -------------------------------------------------------------- */
 	mem_use += 4.0*k*sizeof(int_t*) + 2.0*len*iword;
 	memTRS += k*sizeof(int_t*) + 2.0*k*sizeof(double*) + k*iword;  //acount for Lindval_loc_bc_ptr, Unnz, Linv_bc_ptr,Uinv_bc_ptr
-	
+
 	/*------------------------------------------------------------
 	  PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS.
 	  THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U.
@@ -872,7 +872,7 @@ pzdistribute(fact_t fact, int_t n, Super
 		fsupc = FstBlockC( jb );
 		nsupc = SuperSize( jb );
 		ljb = LBj( jb, grid ); /* Local block number */
-		
+
 		/* Scatter A into SPA. */
 		for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) {
 		    for (i = xa[j]; i < xa[j+1]; ++i) {
@@ -917,7 +917,7 @@ pzdistribute(fact_t fact, int_t n, Super
 			    index = Ufstnz_br_ptr[lb];
 			    uval = Unzval_br_ptr[lb];
 			    fsupc1 = FstBlockC( gb+1 );
-			    if (rb_marker[lb] <= jb) { /* First time see 
+			    if (rb_marker[lb] <= jb) { /* First time see
 							  the block       */
 				rb_marker[lb] = jb + 1;
 				Urb_indptr[lb] = Urb_fstnz[lb];;
@@ -958,7 +958,7 @@ pzdistribute(fact_t fact, int_t n, Super
 #if ( PROFlevel>=1 )
 		t_u += SuperLU_timer_() - t;
 		t = SuperLU_timer_();
-#endif		
+#endif
 		/*------------------------------------------------
 		 * SET UP L BLOCKS.
 		 *------------------------------------------------*/
@@ -1001,15 +1001,15 @@ pzdistribute(fact_t fact, int_t n, Super
 		} /* for i ... */
 
 		if ( nrbl ) { /* Do not ensure the blocks are sorted! */
-		    /* Set up the initial pointers for each block in 
+		    /* Set up the initial pointers for each block in
 		       index[] and nzval[]. */
 		    /* Add room for descriptors */
 		    len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
-		    if ( !(index = intMalloc_dist(len1)) ) 
+		    if ( !(index = intMalloc_dist(len1)) )
 			ABORT("Malloc fails for index[]");
 		    if (!(lusup = (doublecomplex*)SUPERLU_MALLOC(len*nsupc * sizeof(doublecomplex))))
 			ABORT("Malloc fails for lusup[]");
-		    if ( !(Lindval_loc_bc_ptr[ljb] = intCalloc_dist(nrbl*3)) ) 
+		    if ( !(Lindval_loc_bc_ptr[ljb] = intCalloc_dist(nrbl*3)) )
 			ABORT("Malloc fails for Lindval_loc_bc_ptr[ljb][]");
   		    if (!(Linv_bc_ptr[ljb] = (doublecomplex*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(doublecomplex))))
 			ABORT("Malloc fails for Linv_bc_ptr[ljb][]");
@@ -1018,7 +1018,7 @@ pzdistribute(fact_t fact, int_t n, Super
 		    mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 );
 		    mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc );
 		    mybufmax[4] = SUPERLU_MAX( mybufmax[4], len );
-	  	    memTRS += nrbl*3.0*iword + 2.0*nsupc*nsupc*dword;  //acount for Lindval_loc_bc_ptr[ljb],Linv_bc_ptr[ljb],Uinv_bc_ptr[ljb]			
+	  	    memTRS += nrbl*3.0*iword + 2.0*nsupc*nsupc*dword;  //account for Lindval_loc_bc_ptr[ljb],Linv_bc_ptr[ljb],Uinv_bc_ptr[ljb]
 		    index[0] = nrbl;  /* Number of row blocks */
 		    index[1] = len;   /* LDA of the nzval[] */
 		    next_lind = BC_HEADER;
@@ -1029,10 +1029,10 @@ pzdistribute(fact_t fact, int_t n, Super
 			len = Lrb_length[lb];
 			Lindval_loc_bc_ptr[ljb][k] = lb;
 			Lindval_loc_bc_ptr[ljb][k+nrbl] = next_lind;
-			Lindval_loc_bc_ptr[ljb][k+nrbl*2] = next_lval;																	 
+			Lindval_loc_bc_ptr[ljb][k+nrbl*2] = next_lval;
 			Lrb_length[lb] = 0;  /* Reset vector of block length */
 			index[next_lind++] = gb; /* Descriptor */
-			index[next_lind++] = len; 
+			index[next_lind++] = len;
 			Lrb_indptr[lb] = next_lind;
 			Lrb_valptr[lb] = next_lval;
 			next_lind += len;
@@ -1058,9 +1058,9 @@ pzdistribute(fact_t fact, int_t n, Super
 			    }
 			}
 		    } /* for i ... */
-			
+
 		    Lrowind_bc_ptr[ljb] = index;
-		    Lnzval_bc_ptr[ljb] = lusup; 
+		    Lnzval_bc_ptr[ljb] = lusup;
 
 			/* sort Lindval_loc_bc_ptr[ljb], Lrowind_bc_ptr[ljb]
                            and Lnzval_bc_ptr[ljb] here.  */
@@ -1070,15 +1070,15 @@ pzdistribute(fact_t fact, int_t n, Super
 					uu=nrbl-2;
 					lloc = &Lindval_loc_bc_ptr[ljb][1];
 				}else{
-					uu=nrbl-1;	
+					uu=nrbl-1;
 					lloc = Lindval_loc_bc_ptr[ljb];
-				}	
-				quickSortM(lloc,0,uu,nrbl,0,3);	
+				}
+				quickSortM(lloc,0,uu,nrbl,0,3);
 			}
 
 
-			if ( !(index_srt = intMalloc_dist(len1)) ) 
-				ABORT("Malloc fails for index_srt[]");				
+			if ( !(index_srt = intMalloc_dist(len1)) )
+				ABORT("Malloc fails for index_srt[]");
 			if (!(lusup_srt = (doublecomplex*)SUPERLU_MALLOC(len*nsupc * sizeof(doublecomplex))))
 				ABORT("Malloc fails for lusup_srt[]");
 
@@ -1093,26 +1093,26 @@ pzdistribute(fact_t fact, int_t n, Super
 					index_srt[idx_indx++] = index[Lindval_loc_bc_ptr[ljb][i+nrbl]+jj];
 				}
 
-				Lindval_loc_bc_ptr[ljb][i+nrbl] = idx_indx - LB_DESCRIPTOR - nbrow; 
+				Lindval_loc_bc_ptr[ljb][i+nrbl] = idx_indx - LB_DESCRIPTOR - nbrow;
 
 				for (jj=0;jj<nbrow;jj++){
 					k=idx_lusup;
 					k1=Lindval_loc_bc_ptr[ljb][i+nrbl*2]+jj;
-					for (j = 0; j < nsupc; ++j) {				
+					for (j = 0; j < nsupc; ++j) {
 						lusup_srt[k] = lusup[k1];
 						k += len;
 						k1 += len;
-					}	
+					}
 					idx_lusup++;
-				}				
-				Lindval_loc_bc_ptr[ljb][i+nrbl*2] = idx_lusup - nbrow;	
+				}
+				Lindval_loc_bc_ptr[ljb][i+nrbl*2] = idx_lusup - nbrow;
 			}
 
 			SUPERLU_FREE(lusup);
 			SUPERLU_FREE(index);
 
 			Lrowind_bc_ptr[ljb] = index_srt;
-			Lnzval_bc_ptr[ljb] = lusup_srt; 			
+			Lnzval_bc_ptr[ljb] = lusup_srt;
 
 			// if(ljb==0)
 			// for (jj=0;jj<nrbl*3;jj++){
@@ -1121,15 +1121,15 @@ pzdistribute(fact_t fact, int_t n, Super
 			// }
 			// for (jj=0;jj<nrbl;jj++){
 			// printf("iam %5d Lindval %5d\n",iam, index[Lindval_loc_bc_ptr[ljb][jj+nrbl]]);
-			// fflush(stdout);			
+			// fflush(stdout);
 
-			// }	
+			// }
 		} else {
 		    Lrowind_bc_ptr[ljb] = NULL;
 		    Lnzval_bc_ptr[ljb] = NULL;
 			Linv_bc_ptr[ljb] = NULL;
 			Uinv_bc_ptr[ljb] = NULL;
-			Lindval_loc_bc_ptr[ljb] = NULL;			
+			Lindval_loc_bc_ptr[ljb] = NULL;
 		} /* if nrbl ... */
 #if ( PROFlevel>=1 )
 		t_l += SuperLU_timer_() - t;
@@ -1139,7 +1139,7 @@ pzdistribute(fact_t fact, int_t n, Super
 	} /* for jb ... */
 
 	/////////////////////////////////////////////////////////////////
-	
+
 	/* Set up additional pointers for the index and value arrays of U.
 	   nub is the number of local block columns. */
 	nub = CEILING( nsupers, grid->npcol); /* Number of local block columns. */
@@ -1153,7 +1153,7 @@ pzdistribute(fact_t fact, int_t n, Super
 		ABORT("Malloc fails for Ucb_valptr[]");
 	nlb = CEILING( nsupers, grid->nprow ); /* Number of local block rows. */
 
-	/* Count number of row blocks in a block column. 
+	/* Count number of row blocks in a block column.
 	   One pass of the skeleton graph of U. */
 	for (lk = 0; lk < nlb; ++lk) {
 		usub1 = Ufstnz_br_ptr[lk];
@@ -1192,20 +1192,20 @@ pzdistribute(fact_t fact, int_t n, Super
 
 				Ucb_indptr[ljb][Urbs1[ljb]].indpos = i;
 				Ucb_valptr[ljb][Urbs1[ljb]] = j;
-				
+
 				++Urbs1[ljb];
 				j += usub1[i+1];
 				i += UB_DESCRIPTOR + SuperSize( k );
 			}
 		}
-	}				
-	
+	}
+
 
-/* Count the nnzs per block column */	
+/* Count the nnzs per block column */
 	for (lb = 0; lb < nub; ++lb) {
 		Unnz[lb] = 0;
 		k = lb * grid->npcol + mycol;/* Global block number, column-wise. */
-		knsupc = SuperSize( k );	
+		knsupc = SuperSize( k );
 		for (ub = 0; ub < Urbs[lb]; ++ub) {
 			ik = Ucb_indptr[lb][ub].lbnum; /* Local block number, row-wise. */
 			i = Ucb_indptr[lb][ub].indpos; /* Start of the block in usub[]. */
@@ -1219,41 +1219,41 @@ pzdistribute(fact_t fact, int_t n, Super
 				}
 			} /* for jj ... */
 		}
-	}			
-	
+	}
+
 	/////////////////////////////////////////////////////////////////
 
 #if ( PROFlevel>=1 )
 		t = SuperLU_timer_();
-#endif				
+#endif
 	/* construct the Bcast tree for L ... */
 
 	k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
 	if ( !(LBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) )
 		ABORT("Malloc fails for LBtree_ptr[].");
 	if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) )
-		ABORT("Calloc fails for ActiveFlag[].");	
+		ABORT("Calloc fails for ActiveFlag[].");
 	if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) )
-		ABORT("Malloc fails for ranks[].");	
+		ABORT("Malloc fails for ranks[].");
 	if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) )
-		ABORT("Malloc fails for SeedSTD_BC[].");	
+		ABORT("Malloc fails for SeedSTD_BC[].");
+
 
-		
 	for (i=0;i<k;i++){
-		SeedSTD_BC[i]=rand();		
+		SeedSTD_BC[i]=rand();
 	}
 
-	MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_BC[0],k,MPI_DOUBLE,MPI_MAX,grid->cscp.comm);					  
+	MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_BC[0],k,MPI_DOUBLE,MPI_MAX,grid->cscp.comm);
 
 	for (ljb = 0; ljb <k ; ++ljb) {
 		LBtree_ptr[ljb]=NULL;
-	}			
-	
+	}
+
 
 	if ( !(ActiveFlagAll = intMalloc_dist(grid->nprow*k)) )
-		ABORT("Calloc fails for ActiveFlag[].");	
-	memTRS += k*sizeof(BcTree) + k*dword + grid->nprow*k*iword;  //acount for LBtree_ptr, SeedSTD_BC, ActiveFlagAll		
-	for (j=0;j<grid->nprow*k;++j)ActiveFlagAll[j]=3*nsupers;	
+		ABORT("Calloc fails for ActiveFlag[].");
+	memTRS += k*sizeof(BcTree) + k*dword + grid->nprow*k*iword;  //account for LBtree_ptr, SeedSTD_BC, ActiveFlagAll
+	for (j=0;j<grid->nprow*k;++j)ActiveFlagAll[j]=3*nsupers;
 	for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */
 		jb = mycol+ljb*grid->npcol;  /* not sure */
 		if(jb<nsupers){
@@ -1269,10 +1269,10 @@ pzdistribute(fact_t fact, int_t n, Super
 			ActiveFlagAll[pr+ljb*grid->nprow]=SUPERLU_MIN(ActiveFlagAll[pr+ljb*grid->nprow],gb);
 		} /* for j ... */
 		}
-	}			
-	
+	}
+
 	for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */
-		
+
 		jb = mycol+ljb*grid->npcol;  /* not sure */
 		if(jb<nsupers){
 		pc = PCOL( jb, grid );
@@ -1281,19 +1281,19 @@ pzdistribute(fact_t fact, int_t n, Super
 		for (j=0;j<grid->nprow;++j)ActiveFlag[j+grid->nprow]=j;
 		for (j=0;j<grid->nprow;++j)ranks[j]=-1;
 
-		Root=-1; 
-		Iactive = 0;				
+		Root=-1;
+		Iactive = 0;
 		for (j=0;j<grid->nprow;++j){
 			if(ActiveFlag[j]!=3*nsupers){
 			gb = ActiveFlag[j];
 			pr = PROW( gb, grid );
 			if(gb==jb)Root=pr;
-			if(myrow==pr)Iactive=1;		
-			}					
+			if(myrow==pr)Iactive=1;
+			}
 		}
-		
 
-		quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,0,2);	
+
+		quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,0,2);
 
 		if(Iactive==1){
 			// printf("jb %5d damn\n",jb);
@@ -1306,7 +1306,7 @@ pzdistribute(fact_t fact, int_t n, Super
 					ranks[rank_cnt]=ActiveFlag[j+grid->nprow];
 					++rank_cnt;
 				}
-			}		
+			}
 
 			if(rank_cnt>1){
 
@@ -1316,7 +1316,7 @@ pzdistribute(fact_t fact, int_t n, Super
 				// rseed=rand();
 				// rseed=1.0;
 				msgsize = SuperSize( jb );
-				LBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'z');  	
+				LBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'z');
 				BcTree_SetTag(LBtree_ptr[ljb],BC_L,'z');
 
 				// printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt);
@@ -1327,15 +1327,15 @@ pzdistribute(fact_t fact, int_t n, Super
 				// fflush(stdout);
 				// }
 
-				// #if ( PRNTlevel>=1 )		
+				// #if ( PRNTlevel>=1 )
 				if(Root==myrow){
 					rank_cnt_ref=1;
 					for (j = 0; j < grid->nprow; ++j) {
-						if ( fsendx_plist[ljb][j] != EMPTY ) {	
-							++rank_cnt_ref;		
+						if ( fsendx_plist[ljb][j] != EMPTY ) {
+							++rank_cnt_ref;
 						}
 					}
-					assert(rank_cnt==rank_cnt_ref);		
+					assert(rank_cnt==rank_cnt_ref);
 
 					// printf("Partial Bcast Procs: col%7d np%4d\n",jb,rank_cnt);
 
@@ -1344,27 +1344,27 @@ pzdistribute(fact_t fact, int_t n, Super
 					// // printf("\n");
 				}
 				// #endif
-			}	
+			}
 		}
 		}
 	}
 
-	
+
 	SUPERLU_FREE(ActiveFlag);
 	SUPERLU_FREE(ActiveFlagAll);
 	SUPERLU_FREE(ranks);
 	SUPERLU_FREE(SeedSTD_BC);
-	memTRS -= k*dword + grid->nprow*k*iword;  //acount for SeedSTD_BC, ActiveFlagAll	
-	
+	memTRS -= k*dword + grid->nprow*k*iword;  //account for SeedSTD_BC, ActiveFlagAll
+
 #if ( PROFlevel>=1 )
 t = SuperLU_timer_() - t;
 if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t);
-#endif			
+#endif
 
 
 #if ( PROFlevel>=1 )
 		t = SuperLU_timer_();
-#endif			
+#endif
 	/* construct the Reduce tree for L ... */
 	/* the following is used as reference */
 	nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */
@@ -1393,24 +1393,24 @@ if ( !iam) printf(".. Construct Bcast tr
 	if ( !(LRtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) )
 		ABORT("Malloc fails for LRtree_ptr[].");
 	if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) )
-		ABORT("Calloc fails for ActiveFlag[].");	
+		ABORT("Calloc fails for ActiveFlag[].");
 	if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) )
-		ABORT("Malloc fails for ranks[].");	
+		ABORT("Malloc fails for ranks[].");
 
 	// if ( !(idxs = intCalloc_dist(nsupers)) )
-		// ABORT("Calloc fails for idxs[].");	
+		// ABORT("Calloc fails for idxs[].");
 
 	// if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) )
 		// ABORT("Malloc fails for nzrows[].");
 
 	if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) )
-		ABORT("Malloc fails for SeedSTD_RD[].");	
+		ABORT("Malloc fails for SeedSTD_RD[].");
 
 	for (i=0;i<k;i++){
-		SeedSTD_RD[i]=rand();		
+		SeedSTD_RD[i]=rand();
 	}
 
-	MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_RD[0],k,MPI_DOUBLE,MPI_MAX,grid->rscp.comm);					  
+	MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_RD[0],k,MPI_DOUBLE,MPI_MAX,grid->rscp.comm);
 
 
 	// for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */
@@ -1436,11 +1436,11 @@ if ( !iam) printf(".. Construct Bcast tr
 		LRtree_ptr[lib]=NULL;
 	}
 
-	
+
 	if ( !(ActiveFlagAll = intMalloc_dist(grid->npcol*k)) )
-		ABORT("Calloc fails for ActiveFlagAll[].");				
-	for (j=0;j<grid->npcol*k;++j)ActiveFlagAll[j]=-3*nsupers;	
-	memTRS += k*sizeof(RdTree) + k*dword + grid->npcol*k*iword;  //acount for LRtree_ptr, SeedSTD_RD, ActiveFlagAll						
+		ABORT("Calloc fails for ActiveFlagAll[].");
+	for (j=0;j<grid->npcol*k;++j)ActiveFlagAll[j]=-3*nsupers;
+	memTRS += k*sizeof(RdTree) + k*dword + grid->npcol*k*iword;  //account for LRtree_ptr, SeedSTD_RD, ActiveFlagAll
 	for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */
 		fsupc = FstBlockC( jb );
 		pc = PCOL( jb, grid );
@@ -1455,7 +1455,7 @@ if ( !iam) printf(".. Construct Bcast tr
 		}
 	}
 
-	
+
 	for (lib=0;lib<k;++lib){
 		ib = myrow+lib*grid->nprow;  /* not sure */
 		if(ib<nsupers){
@@ -1463,19 +1463,19 @@ if ( !iam) printf(".. Construct Bcast tr
 			for (j=0;j<grid->npcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];;
 			for (j=0;j<grid->npcol;++j)ActiveFlag[j+grid->npcol]=j;
 			for (j=0;j<grid->npcol;++j)ranks[j]=-1;
-			Root=-1; 
-			Iactive = 0;				
+			Root=-1;
+			Iactive = 0;
 
 			for (j=0;j<grid->npcol;++j){
 				if(ActiveFlag[j]!=-3*nsupers){
 				jb = ActiveFlag[j];
 				pc = PCOL( jb, grid );
 				if(jb==ib)Root=pc;
-				if(mycol==pc)Iactive=1;		
-				}					
+				if(mycol==pc)Iactive=1;
+				}
 			}
-		
-		
+
+
 			quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,1,2);
 
 			if(Iactive==1){
@@ -1491,7 +1491,7 @@ if ( !iam) printf(".. Construct Bcast tr
 				if(rank_cnt>1){
 
 					for (ii=0;ii<rank_cnt;ii++)   // use global ranks rather than local ranks
-						ranks[ii] = PNUM( pr, ranks[ii], grid );		
+						ranks[ii] = PNUM( pr, ranks[ii], grid );
 
 					// rseed=rand();
 					// rseed=1.0;
@@ -1499,7 +1499,7 @@ if ( !iam) printf(".. Construct Bcast tr
 
 					// if(ib==0){
 
-					LRtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'z');  	
+					LRtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'z');
 					RdTree_SetTag(LRtree_ptr[lib], RD_L,'z');
 					// }
 
@@ -1511,7 +1511,7 @@ if ( !iam) printf(".. Construct Bcast tr
 					// if(iam==15 || iam==3){
 					// printf("iam %5d rtree lk %5d tag %5d root %5d\n",iam,lib,ib,RdTree_IsRoot(LRtree_ptr[lib],'z'));
 					// fflush(stdout);
-					// }		
+					// }
 
 
 					// #if ( PRNTlevel>=1 )
@@ -1522,10 +1522,10 @@ if ( !iam) printf(".. Construct Bcast tr
 					// // // for(j=0;j<rank_cnt;++j)printf("%4d",ranks[j]);
 					// // printf("\n");
 					// }
-					// #endif		
+					// #endif
 				}
-			}				
-		}	
+			}
+		}
 	}
 
 	SUPERLU_FREE(mod_bit);
@@ -1534,24 +1534,24 @@ if ( !iam) printf(".. Construct Bcast tr
 
 	SUPERLU_FREE(ActiveFlag);
 	SUPERLU_FREE(ActiveFlagAll);
-	SUPERLU_FREE(ranks);	
-	// SUPERLU_FREE(idxs);	 
-	SUPERLU_FREE(SeedSTD_RD);	
+	SUPERLU_FREE(ranks);
+	// SUPERLU_FREE(idxs);
+	SUPERLU_FREE(SeedSTD_RD);
 	// for(i=0;i<nsupers;++i){
 		// if(nzrows[i])SUPERLU_FREE(nzrows[i]);
 	// }
 	// SUPERLU_FREE(nzrows);
-	memTRS -= k*dword + grid->nprow*k*iword;  //acount for SeedSTD_RD, ActiveFlagAll	
+	memTRS -= k*dword + grid->nprow*k*iword;  //account for SeedSTD_RD, ActiveFlagAll
 		////////////////////////////////////////////////////////
 
 #if ( PROFlevel>=1 )
 t = SuperLU_timer_() - t;
 if ( !iam) printf(".. Construct Reduce tree for L: %.2f\t\n", t);
-#endif					
+#endif
 
 #if ( PROFlevel>=1 )
 	t = SuperLU_timer_();
-#endif	
+#endif
 
 	/* construct the Bcast tree for U ... */
 
@@ -1559,28 +1559,28 @@ if ( !iam) printf(".. Construct Reduce t
 	if ( !(UBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) )
 		ABORT("Malloc fails for UBtree_ptr[].");
 	if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) )
-		ABORT("Calloc fails for ActiveFlag[].");	
+		ABORT("Calloc fails for ActiveFlag[].");
 	if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) )
-		ABORT("Malloc fails for ranks[].");	
+		ABORT("Malloc fails for ranks[].");
 	if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) )
-		ABORT("Malloc fails for SeedSTD_BC[].");	
+		ABORT("Malloc fails for SeedSTD_BC[].");
 
 	for (i=0;i<k;i++){
-		SeedSTD_BC[i]=rand();		
+		SeedSTD_BC[i]=rand();
 	}
 
-	MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_BC[0],k,MPI_DOUBLE,MPI_MAX,grid->cscp.comm);					  
+	MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_BC[0],k,MPI_DOUBLE,MPI_MAX,grid->cscp.comm);
 
 
 	for (ljb = 0; ljb <k ; ++ljb) {
 		UBtree_ptr[ljb]=NULL;
-	}	
+	}
 
 	if ( !(ActiveFlagAll = intMalloc_dist(grid->nprow*k)) )
-		ABORT("Calloc fails for ActiveFlagAll[].");				
-	for (j=0;j<grid->nprow*k;++j)ActiveFlagAll[j]=-3*nsupers;	
-	memTRS += k*sizeof(BcTree) + k*dword + grid->nprow*k*iword;  //acount for UBtree_ptr, SeedSTD_BC, ActiveFlagAll	
-	
+		ABORT("Calloc fails for ActiveFlagAll[].");
+	for (j=0;j<grid->nprow*k;++j)ActiveFlagAll[j]=-3*nsupers;
+	memTRS += k*sizeof(BcTree) + k*dword + grid->nprow*k*iword;  //account for UBtree_ptr, SeedSTD_BC, ActiveFlagAll
+
 	for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */
 		jb = mycol+ljb*grid->npcol;  /* not sure */
 		if(jb<nsupers){
@@ -1597,21 +1597,21 @@ if ( !iam) printf(".. Construct Reduce t
 				pr = PROW( gb, grid );
 				ActiveFlagAll[pr+ljb*grid->nprow]=SUPERLU_MAX(ActiveFlagAll[pr+ljb*grid->nprow],gb);
 			// printf("gb:%5d jb: %5d nsupers: %5d\n",gb,jb,nsupers);
-			// fflush(stdout);								
+			// fflush(stdout);
 				//if(gb==jb)Root=pr;
 			}
-			
-			
+
+
 		}
 		pr = PROW( jb, grid ); // take care of diagonal node stored as L
 		// printf("jb %5d current: %5d",jb,ActiveFlagAll[pr+ljb*grid->nprow]);
 		// fflush(stdout);
-		ActiveFlagAll[pr+ljb*grid->nprow]=SUPERLU_MAX(ActiveFlagAll[pr+ljb*grid->nprow],jb);	
+		ActiveFlagAll[pr+ljb*grid->nprow]=SUPERLU_MAX(ActiveFlagAll[pr+ljb*grid->nprow],jb);
 		}
-	}	
-		
-		
-		
+	}
+
+
+
 	for (ljb = 0; ljb < k; ++ljb) { /* for each block column ... */
 		jb = mycol+ljb*grid->npcol;  /* not sure */
 		if(jb<nsupers){
@@ -1622,18 +1622,18 @@ if ( !iam) printf(".. Construct Reduce t
 		for (j=0;j<grid->nprow;++j)ActiveFlag[j+grid->nprow]=j;
 		for (j=0;j<grid->nprow;++j)ranks[j]=-1;
 
-		Root=-1; 
-		Iactive = 0;				
+		Root=-1;
+		Iactive = 0;
 		for (j=0;j<grid->nprow;++j){
 			if(ActiveFlag[j]!=-3*nsupers){
 			gb = ActiveFlag[j];
 			pr = PROW( gb, grid );
 			if(gb==jb)Root=pr;
-			if(myrow==pr)Iactive=1;		
+			if(myrow==pr)Iactive=1;
 			}
-		}						
-		
-		quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,1,2);	
+		}
+
+		quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,1,2);
 	// printf("jb: %5d Iactive %5d\n",jb,Iactive);
 	// fflush(stdout);
 		if(Iactive==1){
@@ -1647,7 +1647,7 @@ if ( !iam) printf(".. Construct Reduce t
 					ranks[rank_cnt]=ActiveFlag[j+grid->nprow];
 					++rank_cnt;
 				}
-			}		
+			}
 	// printf("jb: %5d rank_cnt %5d\n",jb,rank_cnt);
 	// fflush(stdout);
 			if(rank_cnt>1){
@@ -1657,43 +1657,43 @@ if ( !iam) printf(".. Construct Reduce t
 				// rseed=rand();
 				// rseed=1.0;
 				msgsize = SuperSize( jb );
-				UBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'z');  	
+				UBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'z');
 				BcTree_SetTag(UBtree_ptr[ljb],BC_U,'z');
 
 				// printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt);
 				// fflush(stdout);
-				
+
 				if(Root==myrow){
 				rank_cnt_ref=1;
 				for (j = 0; j < grid->nprow; ++j) {
 					// printf("ljb %5d j %5d nprow %5d\n",ljb,j,grid->nprow);
 					// fflush(stdout);
-					if ( bsendx_plist[ljb][j] != EMPTY ) {	
-						++rank_cnt_ref;		
+					if ( bsendx_plist[ljb][j] != EMPTY ) {
+						++rank_cnt_ref;
 					}
 				}
 				// printf("ljb %5d rank_cnt %5d rank_cnt_ref %5d\n",ljb,rank_cnt,rank_cnt_ref);
-				// fflush(stdout);								
-				assert(rank_cnt==rank_cnt_ref);		
-				}						
+				// fflush(stdout);
+				assert(rank_cnt==rank_cnt_ref);
+				}
 			}
 		}
 		}
-	}	
+	}
 	SUPERLU_FREE(ActiveFlag);
 	SUPERLU_FREE(ActiveFlagAll);
-	SUPERLU_FREE(ranks);				
-	SUPERLU_FREE(SeedSTD_BC);				
+	SUPERLU_FREE(ranks);
+	SUPERLU_FREE(SeedSTD_BC);
 	memTRS -= k*dword + grid->nprow*k*iword;  //acount for SeedSTD_BC, ActiveFlagAll
-	
+
 #if ( PROFlevel>=1 )
 t = SuperLU_timer_() - t;
 if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t);
-#endif					
+#endif
 
 #if ( PROFlevel>=1 )
 		t = SuperLU_timer_();
-#endif					
+#endif
 	/* construct the Reduce tree for U ... */
 	/* the following is used as reference */
 	nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */
@@ -1722,46 +1722,46 @@ if ( !iam) printf(".. Construct Bcast tr
 	if ( !(URtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) )
 		ABORT("Malloc fails for URtree_ptr[].");
 	if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) )
-		ABORT("Calloc fails for ActiveFlag[].");	
+		ABORT("Calloc fails for ActiveFlag[].");
 	if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) )
-		ABORT("Malloc fails for ranks[].");	
+		ABORT("Malloc fails for ranks[].");
 
 	// if ( !(idxs = intCalloc_dist(nsupers)) )
-		// ABORT("Calloc fails for idxs[].");	
+		// ABORT("Calloc fails for idxs[].");
 
 	// if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) )
 		// ABORT("Malloc fails for nzrows[].");
 
 	if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) )
-		ABORT("Malloc fails for SeedSTD_RD[].");	
+		ABORT("Malloc fails for SeedSTD_RD[].");
 
 	for (i=0;i<k;i++){
-		SeedSTD_RD[i]=rand();		
+		SeedSTD_RD[i]=rand();
 	}
 
-	MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_RD[0],k,MPI_DOUBLE,MPI_MAX,grid->rscp.comm);					  
+	MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_RD[0],k,MPI_DOUBLE,MPI_MAX,grid->rscp.comm);
 
 
 	// for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */
 		// fsupc = FstBlockC( jb );
-		// len=0;  
+		// len=0;
 		// for (j = fsupc; j < FstBlockC( jb+1 ); ++j) {
 			// istart = xusub[j];
 			// /* NOTE: Only the first nonzero index of the segment
 			   // is stored in usub[]. */
-			// len +=  xusub[j+1] - xusub[j];  
-		// }	
-				
+			// len +=  xusub[j+1] - xusub[j];
+		// }
+
 		// idxs[jb] = len-1;
 
 		// if(len>0){
 			// if ( !(nzrows[jb] = intMalloc_dist(len)) )
 				// ABORT("Malloc fails for nzrows[jb]");
-			
+
 			// fsupc = FstBlockC( jb );
-			
-			// len=0; 
-			
+
+			// len=0;
+
 			// for (j = fsupc; j < FstBlockC( jb+1 ); ++j) {
 				// istart = xusub[j];
 				// /* NOTE: Only the first nonzero index of the segment
@@ -1771,29 +1771,29 @@ if ( !iam) printf(".. Construct Bcast tr
 					// nzrows[jb][len]=irow;
 					// len++;
 				// }
-			// }	
+			// }
 			// quickSort(nzrows[jb],0,len-1,0);
 		// }
 		// else{
 			// nzrows[jb] = NULL;
 		// }
 	// }
-	
+
 
 	for (lib = 0; lib <k ; ++lib) {
 		URtree_ptr[lib]=NULL;
 	}
 
-	
+
 	if ( !(ActiveFlagAll = intMalloc_dist(grid->npcol*k)) )
-		ABORT("Calloc fails for ActiveFlagAll[].");				
-	for (j=0;j<grid->npcol*k;++j)ActiveFlagAll[j]=3*nsupers;	
+		ABORT("Calloc fails for ActiveFlagAll[].");
+	for (j=0;j<grid->npcol*k;++j)ActiveFlagAll[j]=3*nsupers;
 	memTRS += k*sizeof(RdTree) + k*dword + grid->npcol*k*iword;  //acount for URtree_ptr, SeedSTD_RD, ActiveFlagAll
-	
+
 	for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */
 		fsupc = FstBlockC( jb );
 		pc = PCOL( jb, grid );
-		
+
 		fsupc = FstBlockC( jb );
 		for (j = fsupc; j < FstBlockC( jb+1 ); ++j) {
 			istart = xusub[j];
@@ -1806,17 +1806,17 @@ if ( !iam) printf(".. Construct Bcast tr
 				if ( myrow == pr ) { /* Block row ib in my process row */
 					lib = LBi( ib, grid ); /* Local block number */
 					ActiveFlagAll[pc+lib*grid->npcol]=SUPERLU_MIN(ActiveFlagAll[pc+lib*grid->npcol],jb);
-				}						
+				}
 			}
 		}
-		
+
 		pr = PROW( jb, grid );
 		if ( myrow == pr ) { /* Block row ib in my process row */
 			lib = LBi( jb, grid ); /* Local block number */
 			ActiveFlagAll[pc+lib*grid->npcol]=SUPERLU_MIN(ActiveFlagAll[pc+lib*grid->npcol],jb);
-		}					
+		}
 	}
-		
+
 
 	for (lib=0;lib<k;++lib){
 		ib = myrow+lib*grid->nprow;  /* not sure */
@@ -1825,18 +1825,18 @@ if ( !iam) printf(".. Construct Bcast tr
 			for (j=0;j<grid->npcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];;
 			for (j=0;j<grid->npcol;++j)ActiveFlag[j+grid->npcol]=j;
 			for (j=0;j<grid->npcol;++j)ranks[j]=-1;
-			Root=-1; 
-			Iactive = 0;				
+			Root=-1;
+			Iactive = 0;
 
 			for (j=0;j<grid->npcol;++j){
 				if(ActiveFlag[j]!=3*nsupers){
 				jb = ActiveFlag[j];
 				pc = PCOL( jb, grid );
 				if(jb==ib)Root=pc;
-				if(mycol==pc)Iactive=1;		
-				}					
+				if(mycol==pc)Iactive=1;
+				}
 			}
-			
+
 			quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,0,2);
 
 			if(Iactive==1){
@@ -1852,7 +1852,7 @@ if ( !iam) printf(".. Construct Bcast tr
 				if(rank_cnt>1){
 
 					for (ii=0;ii<rank_cnt;ii++)   // use global ranks rather than local ranks
-						ranks[ii] = PNUM( pr, ranks[ii], grid );		
+						ranks[ii] = PNUM( pr, ranks[ii], grid );
 
 					// rseed=rand();
 					// rseed=1.0;
@@ -1860,7 +1860,7 @@ if ( !iam) printf(".. Construct Bcast tr
 
 					// if(ib==0){
 
-					URtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'z');  	
+					URtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'z');
 					RdTree_SetTag(URtree_ptr[lib], RD_U,'z');
 					// }
 
@@ -1874,10 +1874,10 @@ if ( !iam) printf(".. Construct Bcast tr
 					// // for(j=0;j<rank_cnt;++j)printf("%4d",ranks[j]);
 					// printf("\n");
 					}
-					// #endif		
+					// #endif
 				}
 			}
-		}						
+		}
 	}
 	SUPERLU_FREE(mod_bit);
 	SUPERLU_FREE(brecv);
@@ -1885,26 +1885,26 @@ if ( !iam) printf(".. Construct Bcast tr
 
 	SUPERLU_FREE(ActiveFlag);
 	SUPERLU_FREE(ActiveFlagAll);
-	SUPERLU_FREE(ranks);	
-	// SUPERLU_FREE(idxs);	
-	SUPERLU_FREE(SeedSTD_RD);	
+	SUPERLU_FREE(ranks);
+	// SUPERLU_FREE(idxs);
+	SUPERLU_FREE(SeedSTD_RD);
 	// for(i=0;i<nsupers;++i){
 		// if(nzrows[i])SUPERLU_FREE(nzrows[i]);
 	// }
-	// SUPERLU_FREE(nzrows);				
-		
-	memTRS -= k*dword + grid->nprow*k*iword;  //acount for SeedSTD_RD, ActiveFlagAll			
-		
+	// SUPERLU_FREE(nzrows);
+
+	memTRS -= k*dword + grid->nprow*k*iword;  //acount for SeedSTD_RD, ActiveFlagAll
+
 #if ( PROFlevel>=1 )
 t = SuperLU_timer_() - t;
 if ( !iam) printf(".. Construct Reduce tree for U: %.2f\t\n", t);
-#endif						
-		
+#endif
+
 	////////////////////////////////////////////////////////
 
-	
+
 	Llu->Lrowind_bc_ptr = Lrowind_bc_ptr;
-	Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr;  
+	Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr;
 	Llu->Lnzval_bc_ptr = Lnzval_bc_ptr;
 	Llu->Ufstnz_br_ptr = Ufstnz_br_ptr;
 	Llu->Unzval_br_ptr = Unzval_br_ptr;
@@ -1922,16 +1922,16 @@ if ( !iam) printf(".. Construct Reduce t
 	Llu->nbsendx = nbsendx;
 	Llu->ilsum = ilsum;
 	Llu->ldalsum = ldaspa;
-	
+
 	Llu->LRtree_ptr = LRtree_ptr;
 	Llu->LBtree_ptr = LBtree_ptr;
 	Llu->URtree_ptr = URtree_ptr;
 	Llu->UBtree_ptr = UBtree_ptr;
 	Llu->Linv_bc_ptr = Linv_bc_ptr;
-	Llu->Uinv_bc_ptr = Uinv_bc_ptr;	
-	Llu->Urbs = Urbs; 
-	Llu->Ucb_indptr = Ucb_indptr; 
-	Llu->Ucb_valptr = Ucb_valptr; 
+	Llu->Uinv_bc_ptr = Uinv_bc_ptr;
+	Llu->Urbs = Urbs;
+	Llu->Ucb_indptr = Ucb_indptr;
+	Llu->Ucb_valptr = Ucb_valptr;
 
 
 #if ( PRNTlevel>=1 )
@@ -1950,7 +1950,7 @@ if ( !iam) printf(".. Construct Reduce t
 	SUPERLU_FREE(dense);
 
 	/* Find the maximum buffer size. */
-	MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, 
+	MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t,
 		      MPI_MAX, grid->comm);
 
 	k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
@@ -1977,7 +1977,7 @@ if ( !iam) printf(".. Construct Reduce t
        ilsum, fmod, fsendx_plist, bmod, bsendx_plist  */
     CHECK_MALLOC(iam, "Exit pzdistribute()");
 #endif
-    
+
     return (mem_use+memTRS);
 
 } /* PZDISTRIBUTE */
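
A standalone sketch of the seed-agreement idiom used by the tree construction above: every rank draws its own rand() seeds, then an in-place MPI_MAX reduction makes all ranks agree on one seed per tree before BcTree_Create/RdTree_Create is called. This is illustrative only; the seed count and the communicator are assumptions, not SuperLU_DIST names.

    #include <mpi.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Each rank draws candidate seeds, then MPI_MAX makes them identical
     * everywhere, mirroring the SeedSTD_BC/SeedSTD_RD pattern above. */
    int main(int argc, char *argv[])
    {
        int rank, i;
        double seeds[4];                     /* one per tree; 4 is arbitrary */

        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);

        srand(rank + 1);                     /* ranks start from different streams */
        for (i = 0; i < 4; ++i) seeds[i] = (double) rand();

        /* After this call every rank holds the same (maximum) seed values. */
        MPI_Allreduce(MPI_IN_PLACE, seeds, 4, MPI_DOUBLE, MPI_MAX,
                      MPI_COMM_WORLD);

        if (rank == 0)
            for (i = 0; i < 4; ++i) printf("seed[%d] = %.0f\n", i, seeds[i]);

        MPI_Finalize();
        return 0;
    }
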
diff -pruN 6.1.0+dfsg1-1/SRC/pzGetDiagU.c 6.1.1+dfsg1-1/SRC/pzGetDiagU.c
--- 6.1.0+dfsg1-1/SRC/pzGetDiagU.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/pzGetDiagU.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,15 +1,15 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 /*! @file p@(pre)GetDiagU.c
- * \brief Extracts the main diagonal of matrix U 
+ * \brief Extracts the main diagonal of matrix U
  *
  * <pre>
  * -- Auxiliary routine in distributed SuperLU (version 5.1.0) --
@@ -30,7 +30,7 @@ at the top-level directory.
  * =======
  *
  * GetDiagU extracts the main diagonal of matrix U of the LU factorization.
- *  
+ *
  * Arguments
  * =========
  *
diff -pruN 6.1.0+dfsg1-1/SRC/pzgsequ.c 6.1.1+dfsg1-1/SRC/pzgsequ.c
--- 6.1.0+dfsg1-1/SRC/pzgsequ.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/pzgsequ.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,15 +1,15 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
-/*! @file 
+/*! @file
  * \brief Computes row and column scalings
  *
  * File name:	pzgsequ.c
@@ -20,64 +20,64 @@ at the top-level directory.
 
 /*! \brief
 
- <pre>    
-    Purpose   
-    =======   
+ <pre>
+    Purpose
+    =======
 
-    PZGSEQU computes row and column scalings intended to equilibrate an   
+    PZGSEQU computes row and column scalings intended to equilibrate an
     M-by-N sparse matrix A and reduce its condition number. R returns the row
-    scale factors and C the column scale factors, chosen to try to make   
-    the largest element in each row and column of the matrix B with   
-    elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1.   
-
-    R(i) and C(j) are restricted to be between SMLNUM = smallest safe   
-    number and BIGNUM = largest safe number.  Use of these scaling   
-    factors is not guaranteed to reduce the condition number of A but   
-    works well in practice.   
+    scale factors and C the column scale factors, chosen to try to make
+    the largest element in each row and column of the matrix B with
+    elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1.
+
+    R(i) and C(j) are restricted to be between SMLNUM = smallest safe
+    number and BIGNUM = largest safe number.  Use of these scaling
+    factors is not guaranteed to reduce the condition number of A but
+    works well in practice.
 
     See supermatrix.h for the definition of 'SuperMatrix' structure.
- 
-    Arguments   
-    =========   
+
+    Arguments
+    =========
 
     A       (input) SuperMatrix*
             The matrix of dimension (A->nrow, A->ncol) whose equilibration
             factors are to be computed. The type of A can be:
             Stype = SLU_NR_loc; Dtype = SLU_Z; Mtype = SLU_GE.
-	    
+
     R       (output) double*, size A->nrow
-            If INFO = 0 or INFO > M, R contains the row scale factors   
+            If INFO = 0 or INFO > M, R contains the row scale factors
             for A.
-	    
+
     C       (output) double*, size A->ncol
             If INFO = 0,  C contains the column scale factors for A.
-	    
+
     ROWCND  (output) double*
-            If INFO = 0 or INFO > M, ROWCND contains the ratio of the   
-            smallest R(i) to the largest R(i).  If ROWCND >= 0.1 and   
-            AMAX is neither too large nor too small, it is not worth   
+            If INFO = 0 or INFO > M, ROWCND contains the ratio of the
+            smallest R(i) to the largest R(i).  If ROWCND >= 0.1 and
+            AMAX is neither too large nor too small, it is not worth
             scaling by R.
-	    
+
     COLCND  (output) double*
-            If INFO = 0, COLCND contains the ratio of the smallest   
-            C(i) to the largest C(i).  If COLCND >= 0.1, it is not   
+            If INFO = 0, COLCND contains the ratio of the smallest
+            C(i) to the largest C(i).  If COLCND >= 0.1, it is not
             worth scaling by C.
-	    
+
     AMAX    (output) double*
-            Absolute value of largest matrix element.  If AMAX is very   
-            close to overflow or very close to underflow, the matrix   
+            Absolute value of largest matrix element.  If AMAX is very
+            close to overflow or very close to underflow, the matrix
             should be scaled.
-	    
+
     INFO    (output) int*
-            = 0:  successful exit   
-            < 0:  if INFO = -i, the i-th argument had an illegal value   
-            > 0:  if INFO = i,  and i is   
-                  <= M:  the i-th row of A is exactly zero   
-                  >  M:  the (i-M)-th column of A is exactly zero   
+            = 0:  successful exit
+            < 0:  if INFO = -i, the i-th argument had an illegal value
+            > 0:  if INFO = i,  and i is
+                  <= M:  the i-th row of A is exactly zero
+                  >  M:  the (i-M)-th column of A is exactly zero
 
     GRID    (input) gridinof_t*
             The 2D process mesh.
-    ===================================================================== 
+    =====================================================================
 </pre>
 */
 
@@ -97,7 +97,7 @@ pzgsequ(SuperMatrix *A, double *r, doubl
     int *r_sizes, *displs;
     double *loc_r;
     int_t  procs;
-    
+
     /* Test the input parameters. */
     *info = 0;
     if ( A->nrow < 0 || A->ncol < 0 ||
@@ -120,7 +120,7 @@ pzgsequ(SuperMatrix *A, double *r, doubl
     Astore = A->Store;
     Aval = Astore->nzval;
     m_loc = Astore->m_loc;
-    
+
     /* Get machine constants. */
     smlnum = dmach_dist("S");
     bignum = 1. / smlnum;
@@ -143,13 +143,13 @@ pzgsequ(SuperMatrix *A, double *r, doubl
 	rcmax = SUPERLU_MAX(rcmax, r[i]);
 	rcmin = SUPERLU_MIN(rcmin, r[i]);
     }
-  
+
     /* Get the global MAX and MIN for R */
     tempmax = rcmax;
     tempmin = rcmin;
-    MPI_Allreduce( &tempmax, &rcmax, 
+    MPI_Allreduce( &tempmax, &rcmax,
 		1, MPI_DOUBLE, MPI_MAX, grid->comm);
-    MPI_Allreduce( &tempmin, &rcmin, 
+    MPI_Allreduce( &tempmin, &rcmin,
 		1, MPI_DOUBLE, MPI_MIN, grid->comm);
 
     *amax = rcmax;
@@ -226,7 +226,7 @@ pzgsequ(SuperMatrix *A, double *r, doubl
 
     /* First gather the size of each piece. */
     MPI_Allgather(&m_loc, 1, MPI_INT, r_sizes, 1, MPI_INT, grid->comm);
-      
+
     /* Set up the displacements for allgatherv */
     displs[0] = 0;
     for (i = 1; i < procs; ++i) displs[i] = displs[i-1] + r_sizes[i-1];
@@ -234,7 +234,7 @@ pzgsequ(SuperMatrix *A, double *r, doubl
     /* Now gather the actual data */
     MPI_Allgatherv(loc_r, m_loc, MPI_DOUBLE, r, r_sizes, displs,
                 MPI_DOUBLE, grid->comm);
-      
+
     SUPERLU_FREE(r_sizes);
     SUPERLU_FREE(loc_r);
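
As a complement to the header comment above, here is a minimal serial, real-valued sketch of the equilibration idea for a CSR matrix: r[i] is the reciprocal of the largest |a(i,j)| in row i, and c[j] the reciprocal of the largest r[i]*|a(i,j)| in column j, so that r[i]*a(i,j)*c[j] has magnitude at most about 1. The clamping to [SMLNUM, BIGNUM], the complex absolute values, and the MPI reductions performed by pzgsequ are deliberately omitted; equilibrate_csr is a hypothetical helper, not part of the library.

    #include <math.h>

    /* Serial, real-valued sketch of row/column equilibration on a CSR matrix. */
    static void equilibrate_csr(int m, int n, const int *rowptr,
                                const int *colind, const double *val,
                                double *r, double *c)
    {
        int i, j, k;

        for (i = 0; i < m; ++i) r[i] = 0.0;
        for (i = 0; i < m; ++i)
            for (k = rowptr[i]; k < rowptr[i + 1]; ++k)
                r[i] = fmax(r[i], fabs(val[k]));        /* row maxima      */
        for (i = 0; i < m; ++i)
            r[i] = (r[i] > 0.0) ? 1.0 / r[i] : 1.0;     /* row scalings    */

        for (j = 0; j < n; ++j) c[j] = 0.0;
        for (i = 0; i < m; ++i)
            for (k = rowptr[i]; k < rowptr[i + 1]; ++k)
                c[colind[k]] = fmax(c[colind[k]], r[i] * fabs(val[k]));
        for (j = 0; j < n; ++j)
            c[j] = (c[j] > 0.0) ? 1.0 / c[j] : 1.0;     /* column scalings */
    }
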
 
diff -pruN 6.1.0+dfsg1-1/SRC/pzgsmv_AXglobal.c 6.1.1+dfsg1-1/SRC/pzgsmv_AXglobal.c
--- 6.1.0+dfsg1-1/SRC/pzgsmv_AXglobal.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/pzgsmv_AXglobal.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,15 +1,15 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
-/*! @file 
+/*! @file
  * \brief Performs sparse matrix-vector multiplication
  *
  * <pre>
@@ -96,14 +96,14 @@ int pzgsmv_AXglobal_setup
 		    mv_sup_to_proc[i] = p;
 #if ( DEBUGlevel>=3 )
 		    if ( mv_sup_to_proc[i] == p-1 ) {
-			fprintf(stderr, 
+			fprintf(stderr,
 				"mv_sup_to_proc conflicts at supno %d\n", i);
 			exit(-1);
 		    }
 #endif
 		}
 	    }
-	    
+
 	    if ( iam == p ) {
 		N_update = t1;
 		if ( N_update ) {
@@ -162,7 +162,7 @@ int pzgsmv_AXglobal_setup
  *    val[m]        = not used
  *    val[ki]       = A(k, bindx[ki]), where ks <= ki <= ke
  * Both arrays are of length nnz + 1.
- * </pre> 
+ * </pre>
 */
 static void zcreate_msr_matrix
 (
@@ -180,7 +180,7 @@ static void zcreate_msr_matrix
     doublecomplex *nzval;
     int_t *rowcnt;
     doublecomplex zero = {0.0, 0.0};
-    
+
     if ( !N_update ) return;
 
     n = A->ncol;
@@ -277,7 +277,7 @@ pzgsmv_AXglobal(int_t m, int_t update[],
     }
     return 0;
 } /* PZGSMV_AXglobal */
- 
+
 /*
  * Performs sparse matrix-vector multiplication.
  *   - val/bindx stores the distributed MSR matrix A
@@ -300,7 +300,7 @@ pzgsmv_AXglobal_abs(int_t m, int_t updat
 	}
 	ax[i] += slud_z_abs1(&val[i]) * slud_z_abs1(&X[update[i]]); /* diagonal */
     }
-    
+
     return 0;
 } /* PZGSMV_AXglobal_ABS */
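
For reference, a matrix-vector product over the MSR layout described in the header comment above looks roughly as follows. It assumes the conventional MSR convention that bindx[0..m] are row pointers into the off-diagonal entries and val[k] (k < m) holds the diagonal A(k,k); the distributed routine additionally restricts the loop to the rows listed in update[], which is omitted here.

    /* Sketch of y = A*x for an m-row matrix in MSR format:
     *   val[k]  (k < m)                  diagonal entry A(k,k)
     *   val[m]                           unused
     *   val[ki] = A(k, bindx[ki])        for bindx[k] <= ki < bindx[k+1]   */
    static void msr_matvec(int m, const int *bindx, const double *val,
                           const double *x, double *y)
    {
        int k, ki;

        for (k = 0; k < m; ++k) {
            y[k] = val[k] * x[k];                     /* diagonal term      */
            for (ki = bindx[k]; ki < bindx[k + 1]; ++ki)
                y[k] += val[ki] * x[bindx[ki]];       /* off-diagonal terms */
        }
    }
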
 
diff -pruN 6.1.0+dfsg1-1/SRC/pzgsmv.c 6.1.1+dfsg1-1/SRC/pzgsmv.c
--- 6.1.0+dfsg1-1/SRC/pzgsmv.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/pzgsmv.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,15 +1,15 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
-/*! @file 
+/*! @file
  * \brief  Parallel sparse matrix-vector multiplication
  *
  * <pre>
@@ -143,7 +143,7 @@ void pzgsmv_init
 	    }
 	}
     }
-    
+
     /* ------------------------------------------------------------
        TRANSFORM THE COLUMN INDICES OF MATRIX A INTO LOCAL INDICES.
        THIS ACCOUNTS FOR THE THIRD PASS OF ACCESSING MATRIX A.
@@ -211,7 +211,7 @@ void pzgsmv_init
     gsmv_comm->val_torecv = val_torecv;
     gsmv_comm->TotalIndSend = TotalIndSend;
     gsmv_comm->TotalValSend = TotalValSend;
-    
+
     SUPERLU_FREE(spa);
     SUPERLU_FREE(send_req);
 
@@ -311,7 +311,7 @@ pzgsmv
                       grid->comm, &recv_req[p]);
 	}
     }
-    
+
     /* ------------------------------------------------------------
        PERFORM THE ACTUAL MULTIPLICATION.
        ------------------------------------------------------------*/
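
The init/multiply pair above first exchanges the off-process entries of the input vector and only then performs the local product. The sketch below shows that exchange pattern in isolation with generic MPI calls; the index arrays (send_idx, send_cnt, send_off, recv_cnt, recv_off) are illustrative stand-ins for the lists that pzgsmv_init() builds, not actual SuperLU_DIST fields.

    #include <mpi.h>

    /* Halo exchange of vector entries before a local sparse matrix-vector
     * product: post receives for off-process values, pack and send the
     * locally owned values other ranks need, then wait for completion. */
    static void exchange_x(int nprocs, MPI_Comm comm,
                           const double *x_loc, double *x_ext,
                           double *send_buf, const int *send_idx,
                           const int *send_cnt, const int *send_off,
                           const int *recv_cnt, const int *recv_off,
                           MPI_Request *req)
    {
        int p, i, nreq = 0;

        for (p = 0; p < nprocs; ++p)              /* post receives first */
            if (recv_cnt[p])
                MPI_Irecv(&x_ext[recv_off[p]], recv_cnt[p], MPI_DOUBLE,
                          p, 0, comm, &req[nreq++]);

        for (p = 0; p < nprocs; ++p)              /* pack and send       */
            if (send_cnt[p]) {
                for (i = 0; i < send_cnt[p]; ++i)
                    send_buf[send_off[p] + i] =
                        x_loc[send_idx[send_off[p] + i]];
                MPI_Isend(&send_buf[send_off[p]], send_cnt[p], MPI_DOUBLE,
                          p, 0, comm, &req[nreq++]);
            }

        MPI_Waitall(nreq, req, MPI_STATUSES_IGNORE);
        /* ... local multiply now uses x_loc and the received x_ext ... */
    }
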
diff -pruN 6.1.0+dfsg1-1/SRC/pzgsrfs_ABXglobal.c 6.1.1+dfsg1-1/SRC/pzgsrfs_ABXglobal.c
--- 6.1.0+dfsg1-1/SRC/pzgsrfs_ABXglobal.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/pzgsrfs_ABXglobal.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,15 +1,15 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
-/*! @file 
+/*! @file
  * \brief Improves the computed solution and provies error bounds
  *
  * <pre>
@@ -38,9 +38,9 @@ static void redist_all_to_diag(int_t, do
  * Purpose
  * =======
  *
- * pzgsrfs_ABXglobal improves the computed solution to a system of linear   
+ * pzgsrfs_ABXglobal improves the computed solution to a system of linear
  * equations and provides error bounds and backward error estimates
- * for the solution. 
+ * for the solution.
  *
  * Arguments
  * =========
@@ -78,7 +78,7 @@ static void redist_all_to_diag(int_t, do
  * B      (input) doublecomplex* (global)
  *        The N-by-NRHS right-hand side matrix of the possibly equilibrated
  *        and row permuted system.
- *       
+ *
  *        NOTE: Currently, B must reside on all processes when calling
  *              this routine.
  *
@@ -101,8 +101,8 @@ static void redist_all_to_diag(int_t, do
  *        Number of right-hand sides.
  *
  * berr   (output) double*, dimension (nrhs)
- *         The componentwise relative backward error of each solution   
- *         vector X(j) (i.e., the smallest relative change in   
+ *         The componentwise relative backward error of each solution
+ *         vector X(j) (i.e., the smallest relative change in
  *         any element of A or B that makes X(j) an exact solution).
  *
  * stat   (output) SuperLUStat_t*
@@ -112,11 +112,11 @@ static void redist_all_to_diag(int_t, do
  * info   (output) int*
  *        = 0: successful exit
  *        < 0: if info = -i, the i-th argument had an illegal value
- *        
- * Internal Parameters   
- * ===================   
  *
- * ITMAX is the maximum number of steps of iterative refinement.   
+ * Internal Parameters
+ * ===================
+ *
+ * ITMAX is the maximum number of steps of iterative refinement.
  * </pre>
  */
 
@@ -128,14 +128,14 @@ pzgsrfs_ABXglobal(int_t n, SuperMatrix *
 
 
 #define ITMAX 20
-    
+
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
     LocalLU_t *Llu = LUstruct->Llu;
-    /* 
+    /*
      * Data structures used by matrix-vector multiply routine.
      */
     int_t  N_update; /* Number of variables updated on this process */
-    int_t  *update;  /* vector elements (global index) updated 
+    int_t  *update;  /* vector elements (global index) updated
 			on this processor.                     */
     int_t  *bindx;
     doublecomplex *val;
@@ -160,7 +160,7 @@ pzgsrfs_ABXglobal(int_t n, SuperMatrix *
     /*-- Function prototypes --*/
     extern void pzgstrs1(int_t, LUstruct_t *, gridinfo_t *,
 			 doublecomplex *, int, SuperLUStat_t *, int *);
-    
+
     /* Test the input parameters. */
     *info = 0;
     if ( n < 0 ) *info = -1;
@@ -288,19 +288,19 @@ pzgsrfs_ABXglobal(int_t n, SuperMatrix *
 
 	while (1) { /* Loop until stopping criterion is satisfied. */
 
-	    /* Compute residual R = B - op(A) * X,   
+	    /* Compute residual R = B - op(A) * X,
 	       where op(A) = A, A**T, or A**H, depending on TRANS. */
 
 	    /* Matrix-vector multiply. */
 	    pzgsmv_AXglobal(N_update, update, val, bindx, X_col, ax);
-	    
+
 	    /* Compute residual. */
 	    for (i = 0; i < N_update; ++i) z_sub(&R[i], &b[i], &ax[i]);
 
 	    /* Compute abs(op(A))*abs(X) + abs(B). */
 	    pzgsmv_AXglobal_abs(N_update, update, val, bindx, X_col, rwork);
 	    for (i = 0; i < N_update; ++i) rwork[i] += slud_z_abs1(&b[i]);
-	    
+
 	    s = 0.0;
 	    for (i = 0; i < N_update; ++i) {
 		if ( rwork[i] > safe2 ) {
@@ -312,7 +312,7 @@ pzgsrfs_ABXglobal(int_t n, SuperMatrix *
                    we know the true residual also must be exactly 0.0. */
 	    }
 	    MPI_Allreduce( &s, &berr[j], 1, MPI_DOUBLE, MPI_MAX, grid->comm );
-		
+
 #if ( PRNTlevel>= 1 )
 	    if ( !iam )
 		printf("(%2d) .. Step " IFMT ": berr[j] = %e\n", iam, count, berr[j]);
@@ -324,21 +324,21 @@ pzgsrfs_ABXglobal(int_t n, SuperMatrix *
 		pzgstrs1(n, LUstruct, grid, dx_trs, 1, stat, info);
 
 		/* Update solution. */
-		for (p = 0; p < num_diag_procs; ++p) 
+		for (p = 0; p < num_diag_procs; ++p)
 		    if ( iam == diag_procs[p] )
 			for (k = p; k < nsupers; k += num_diag_procs) {
 			    lk = LBi( k, grid );
 			    ii = ilsum[lk] + (lk+1)*XK_H;
 			    knsupc = SuperSize( k );
 			    for (i = 0; i < knsupc; ++i)
-				z_add(&x_trs[i + ii], &x_trs[i + ii], 
+				z_add(&x_trs[i + ii], &x_trs[i + ii],
 				      &dx_trs[i + ii]);
 			}
 		lstres = berr[j];
 		++count;
 		/* Transfer x_trs (on diagonal processes) into X
 		   (on all processes). */
-		gather_1rhs_diag_to_all(n, x_trs, Glu_persist, Llu, grid, 
+		gather_1rhs_diag_to_all(n, x_trs, Glu_persist, Llu, grid,
 					num_diag_procs, diag_procs, diag_len,
 					X_col, temp);
 	    } else {
@@ -386,7 +386,7 @@ redist_all_to_diag(int_t n, doublecomple
     int_t *ilsum, *xsup;
     int iam, knsupc, psrc, pkk;
     MPI_Status status;
-    
+
     iam = grid->iam;
     nsupers = Glu_persist->supno[n-1] + 1;
     xsup = Glu_persist->xsup;
@@ -435,7 +435,7 @@ gather_1rhs_diag_to_all(int_t n, doublec
     int_t i, ii, k, lk, lwork, nsupers, p;
     int_t *ilsum, *xsup;
     int iam, knsupc, pkk;
-    
+
     iam = grid->iam;
     nsupers = Glu_persist->supno[n-1] + 1;
     xsup = Glu_persist->xsup;
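
A serial, real-valued sketch of the refinement loop documented above: form the residual r = b - A*x, take the componentwise backward error berr = max_i |r_i| / (|A|*|x| + |b|)_i, and keep refining while berr exceeds the tolerance, is still at least halving per step, and fewer than ITMAX steps have been taken. The matvec/absmatvec/lusolve callbacks are hypothetical stand-ins for pzgsmv_AXglobal, pzgsmv_AXglobal_abs and pzgstrs1.

    #include <math.h>
    #include <stdlib.h>

    #define ITMAX 20   /* same iteration bound as the routine above */

    /* One-right-hand-side iterative refinement with a componentwise
     * backward-error stopping test. */
    static double refine(int n, const double *b, double *x, double eps,
                         void (*matvec)(int, const double *, double *),
                         void (*absmatvec)(int, const double *, double *),
                         void (*lusolve)(int, const double *, double *))
    {
        double *r  = (double *) malloc(n * sizeof(double));
        double *w  = (double *) malloc(n * sizeof(double));
        double *dx = (double *) malloc(n * sizeof(double));
        double berr = 0.0, lstres = 3.0;      /* 3.0 forces the first pass */
        int i, count = 0;

        for (;;) {
            matvec(n, x, r);                             /* r = A*x        */
            for (i = 0; i < n; ++i) r[i] = b[i] - r[i];  /* r = b - A*x    */

            absmatvec(n, x, w);                          /* w = |A|*|x|    */
            for (i = 0; i < n; ++i) w[i] += fabs(b[i]);  /* w += |b|       */

            berr = 0.0;                                  /* componentwise  */
            for (i = 0; i < n; ++i)
                if (w[i] > 0.0) berr = fmax(berr, fabs(r[i]) / w[i]);

            /* Stop when converged, no longer halving, or out of steps. */
            if (!(berr > eps && berr * 2.0 <= lstres && count < ITMAX))
                break;

            lusolve(n, r, dx);                           /* A*dx = r       */
            for (i = 0; i < n; ++i) x[i] += dx[i];
            lstres = berr;
            ++count;
        }

        free(r); free(w); free(dx);
        return berr;
    }
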
diff -pruN 6.1.0+dfsg1-1/SRC/pzgsrfs.c 6.1.1+dfsg1-1/SRC/pzgsrfs.c
--- 6.1.0+dfsg1-1/SRC/pzgsrfs.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/pzgsrfs.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,15 +1,15 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
-/*! @file 
+/*! @file
  * \brief Improves the computed solution to a system of linear equations and provides error bounds and backward error estimates
  *
  * <pre>
@@ -25,15 +25,15 @@ at the top-level directory.
 #include <math.h>
 #include "superlu_zdefs.h"
 
-/*! \brief 
+/*! \brief
  *
  * <pre>
  * Purpose
  * =======
  *
- * PZGSRFS improves the computed solution to a system of linear   
+ * PZGSRFS improves the computed solution to a system of linear
  * equations and provides error bounds and backward error estimates
- * for the solution. 
+ * for the solution.
  *
  * Arguments
  * =========
@@ -71,7 +71,7 @@ at the top-level directory.
  * B      (input) doublecomplex* (local)
  *        The m_loc-by-NRHS right-hand side matrix of the possibly
  *        equilibrated system. That is, B may be overwritten by diag(R)*B.
- *       
+ *
  * ldb    (input) int (local)
  *        Leading dimension of matrix B.
  *
@@ -97,8 +97,8 @@ at the top-level directory.
  *        solution phase.
  *
  * berr   (output) double*, dimension (nrhs)
- *         The componentwise relative backward error of each solution   
- *         vector X(j) (i.e., the smallest relative change in   
+ *         The componentwise relative backward error of each solution
+ *         vector X(j) (i.e., the smallest relative change in
  *         any element of A or B that makes X(j) an exact solution).
  *
  * stat   (output) SuperLUStat_t*
@@ -108,22 +108,22 @@ at the top-level directory.
  * info   (output) int*
  *        = 0: successful exit
  *        < 0: if info = -i, the i-th argument had an illegal value
- *        
- * Internal Parameters   
- * ===================   
  *
- * ITMAX is the maximum number of steps of iterative refinement.   
+ * Internal Parameters
+ * ===================
+ *
+ * ITMAX is the maximum number of steps of iterative refinement.
  * </pre>
  */
 void
 pzgsrfs(int_t n, SuperMatrix *A, double anorm, LUstruct_t *LUstruct,
 	ScalePermstruct_t *ScalePermstruct, gridinfo_t *grid,
-	doublecomplex *B, int_t ldb, doublecomplex *X, int_t ldx, int nrhs, 
+	doublecomplex *B, int_t ldb, doublecomplex *X, int_t ldx, int nrhs,
 	SOLVEstruct_t *SOLVEstruct,
 	double *berr, SuperLUStat_t *stat, int *info)
 {
 #define ITMAX 20
-    
+
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
     LocalLU_t *Llu = LUstruct->Llu;
     doublecomplex *ax, *R, *dx, *temp, *work, *B_col, *X_col;
@@ -201,12 +201,12 @@ pzgsrfs(int_t n, SuperMatrix *A, double
 
 	while (1) { /* Loop until stopping criterion is satisfied. */
 
-	    /* Compute residual R = B - op(A) * X,   
+	    /* Compute residual R = B - op(A) * X,
 	       where op(A) = A, A**T, or A**H, depending on TRANS. */
 
 	    /* Matrix-vector multiply. */
 	    pzgsmv(0, A, grid, gsmv_comm, X_col, ax);
-	    
+
 	    /* Compute residual, stored in R[]. */
 	    for (i = 0; i < m_loc; ++i) z_sub(&R[i], &B_col[i], &ax[i]);
 
@@ -214,7 +214,7 @@ pzgsrfs(int_t n, SuperMatrix *A, double
 	    pzgsmv(1, A, grid, gsmv_comm, X_col, temp);
             /* NOTE: rtemp is aliased to temp */
 	    for (i = 0; i < m_loc; ++i) rtemp[i] += slud_z_abs1(&B_col[i]);
-	    
+
 	    s = 0.0;
 	    for (i = 0; i < m_loc; ++i) {
 		if ( rtemp[i] > safe2 ) {
@@ -226,7 +226,7 @@ pzgsrfs(int_t n, SuperMatrix *A, double
                    we know the true residual also must be exactly 0.0. */
 	    }
 	    MPI_Allreduce( &s, &berr[j], 1, MPI_DOUBLE, MPI_MAX, grid->comm );
-		
+
 #if ( PRNTlevel>= 1 )
 	    if ( !iam )
 		printf("(%2d) .. Step " IFMT ": berr[j] = %e\n", iam, count, berr[j]);
@@ -234,7 +234,7 @@ pzgsrfs(int_t n, SuperMatrix *A, double
 	    if ( berr[j] > eps && berr[j] * 2 <= lstres && count < ITMAX ) {
 		/* Compute new dx. */
 		pzgstrs(n, LUstruct, ScalePermstruct, grid,
-			dx, m_loc, fst_row, m_loc, 1, 
+			dx, m_loc, fst_row, m_loc, 1,
 			SOLVEstruct, stat, info);
 
 		/* Update solution. */
diff -pruN 6.1.0+dfsg1-1/SRC/pzgssvx_ABglobal.c 6.1.1+dfsg1-1/SRC/pzgssvx_ABglobal.c
--- 6.1.0+dfsg1-1/SRC/pzgssvx_ABglobal.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/pzgssvx_ABglobal.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,15 +1,15 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
-/*! @file 
+/*! @file
  * \brief Solves a system of linear equations A*X=B,
  *
  * <pre>
@@ -49,7 +49,7 @@ at the top-level directory.
  *      -  B, the matrix of right hand sides, and its dimensions ldb and nrhs
  *      -  grid, a structure describing the 2D processor mesh
  *      -  options->IterRefine, which determines whether or not to
- *            improve the accuracy of the computed solution using 
+ *            improve the accuracy of the computed solution using
  *            iterative refinement
  *
  *      On output, B is overwritten with the solution X.
@@ -57,8 +57,8 @@ at the top-level directory.
  *   2. Depending on options->Fact, the user has several options
  *      for solving A*X=B. The standard option is for factoring
  *      A "from scratch". (The other options, described below,
- *      are used when A is sufficiently similar to a previously 
- *      solved problem to save time by reusing part or all of 
+ *      are used when A is sufficiently similar to a previously
+ *      solved problem to save time by reusing part or all of
  *      the previous factorization.)
  *
  *      -  options->Fact = DOFACT: A is factored "from scratch"
@@ -67,7 +67,7 @@ at the top-level directory.
  *
  *      -  A, the input matrix
  *
- *      as well as the following options, which are described in more 
+ *      as well as the following options, which are described in more
  *      detail below:
  *
  *      -  options->Equil,   to specify how to scale the rows and columns
@@ -87,7 +87,7 @@ at the top-level directory.
  *                           (to control numerical stability)
  *
  *      The outputs returned include
- *         
+ *
  *      -  ScalePermstruct,  modified to describe how the input matrix A
  *                           was equilibrated and permuted:
  *         -  ScalePermstruct->DiagScale, indicates whether the rows and/or
@@ -98,17 +98,17 @@ at the top-level directory.
  *         -  ScalePermstruct->perm_c, column permutation vector
  *
  *            (part of ScalePermstruct may also need to be supplied on input,
- *             depending on options->RowPerm and options->ColPerm as described 
+ *             depending on options->RowPerm and options->ColPerm as described
  *             later).
  *
  *      -  A, the input matrix A overwritten by the scaled and permuted matrix
  *                Pc*Pr*diag(R)*A*diag(C)
- *             where 
+ *             where
  *                Pr and Pc are row and columns permutation matrices determined
- *                  by ScalePermstruct->perm_r and ScalePermstruct->perm_c, 
- *                  respectively, and 
+ *                  by ScalePermstruct->perm_r and ScalePermstruct->perm_c,
+ *                  respectively, and
  *                diag(R) and diag(C) are diagonal scaling matrices determined
- *                  by ScalePermstruct->DiagScale, ScalePermstruct->R and 
+ *                  by ScalePermstruct->DiagScale, ScalePermstruct->R and
  *                  ScalePermstruct->C
  *
  *      -  LUstruct, which contains the L and U factorization of A1 where
@@ -120,7 +120,7 @@ at the top-level directory.
  *
  *   3. The second value of options->Fact assumes that a matrix with the same
  *      sparsity pattern as A has already been factored:
- *     
+ *
  *      -  options->Fact = SamePattern: A is factored, assuming that it has
  *            the same nonzero pattern as a previously factored matrix. In this
  *            case the algorithm saves time by reusing the previously computed
@@ -136,14 +136,14 @@ at the top-level directory.
  *
  *      but not options->ColPerm, whose value is ignored. This is because the
  *      previous column permutation from ScalePermstruct->perm_c is used as
- *      input. The user must also supply 
+ *      input. The user must also supply
  *
  *      -  A, the input matrix
  *      -  ScalePermstruct->perm_c, the column permutation
  *      -  LUstruct->etree, the elimination tree
  *
  *      The outputs returned include
- *         
+ *
  *      -  A, the input matrix A overwritten by the scaled and permuted matrix
  *            as described above
  *      -  ScalePermstruct,  modified to describe how the input matrix A was
@@ -171,32 +171,32 @@ at the top-level directory.
  *      This is because the permutations from ScalePermstruct->perm_r and
  *      ScalePermstruct->perm_c are used as input.
  *
- *      The user must also supply 
+ *      The user must also supply
  *
  *      -  A, the input matrix
  *      -  ScalePermstruct->DiagScale, how the previous matrix was row and/or
  *                                     column scaled
  *      -  ScalePermstruct->R, the row scalings of the previous matrix, if any
- *      -  ScalePermstruct->C, the columns scalings of the previous matrix, 
+ *      -  ScalePermstruct->C, the columns scalings of the previous matrix,
  *                             if any
  *      -  ScalePermstruct->perm_r, the row permutation of the previous matrix
- *      -  ScalePermstruct->perm_c, the column permutation of the previous 
+ *      -  ScalePermstruct->perm_c, the column permutation of the previous
  *                                  matrix
  *      -  all of LUstruct, the previously computed information about L and U
  *                (the actual numerical values of L and U stored in
  *                 LUstruct->Llu are ignored)
  *
  *      The outputs returned include
- *         
+ *
  *      -  A, the input matrix A overwritten by the scaled and permuted matrix
  *            as described above
  *      -  ScalePermstruct,  modified to describe how the input matrix A was
- *                           equilibrated 
+ *                           equilibrated
  *                  (thus ScalePermstruct->DiagScale, R and C may be modified)
  *      -  LUstruct, modified to contain the new L and U factors
  *
  *   5. The fourth and last value of options->Fact assumes that A is
- *      identical to a matrix that has already been factored on a previous 
+ *      identical to a matrix that has already been factored on a previous
  *      call, and reuses its entire LU factorization
  *
  *      -  options->Fact = Factored: A is identical to a previously
@@ -204,19 +204,19 @@ at the top-level directory.
  *            can be reused.
  *
  *      In this case all the other options mentioned above are ignored
- *      (options->Equil, options->RowPerm, options->ColPerm, 
+ *      (options->Equil, options->RowPerm, options->ColPerm,
  *       options->ReplaceTinyPivot)
  *
- *      The user must also supply 
+ *      The user must also supply
  *
  *      -  A, the unfactored matrix, only in the case that iterative refinement
- *            is to be done (specifically A must be the output A from 
+ *            is to be done (specifically A must be the output A from
  *            the previous call, so that it has been scaled and permuted)
  *      -  all of ScalePermstruct
  *      -  all of LUstruct, including the actual numerical values of L and U
  *
  *      all of which are unmodified on output.
- *         
+ *
  * Arguments
  * =========
  *
@@ -224,7 +224,7 @@ at the top-level directory.
  *         The structure defines the input parameters to control
  *         how the LU decomposition will be performed.
  *         The following fields should be defined for this structure:
- *         
+ *
  *         o Fact (fact_t)
  *           Specifies whether or not the factored form of the matrix
  *           A is supplied on entry, and if not, how the matrix A should
@@ -234,7 +234,7 @@ at the top-level directory.
  *                 Inputs:  A
  *                          options->Equil, RowPerm, ColPerm, ReplaceTinyPivot
  *                 Outputs: modified A
- *                             (possibly row and/or column scaled and/or 
+ *                             (possibly row and/or column scaled and/or
  *                              permuted)
  *                          all of ScalePermstruct
  *                          all of LUstruct
@@ -242,7 +242,7 @@ at the top-level directory.
  *           = SamePattern: the matrix A will be factorized assuming
  *             that a factorization of a matrix with the same sparsity
  *             pattern was performed prior to this one. Therefore, this
- *             factorization will reuse column permutation vector 
+ *             factorization will reuse column permutation vector
  *             ScalePermstruct->perm_c and the elimination tree
  *             LUstruct->etree
  *                 Inputs:  A
@@ -250,7 +250,7 @@ at the top-level directory.
  *                          ScalePermstruct->perm_c
  *                          LUstruct->etree
  *                 Outputs: modified A
- *                             (possibly row and/or column scaled and/or 
+ *                             (possibly row and/or column scaled and/or
  *                              permuted)
  *                          rest of ScalePermstruct (DiagScale, R, C, perm_r)
  *                          rest of LUstruct (GLU_persist, Llu)
@@ -268,7 +268,7 @@ at the top-level directory.
  *                          all of ScalePermstruct
  *                          all of LUstruct
  *                 Outputs: modified A
- *                             (possibly row and/or column scaled and/or 
+ *                             (possibly row and/or column scaled and/or
  *                              permuted)
  *                          modified LUstruct->Llu
  *           = FACTORED: the matrix A is already factored.
@@ -297,17 +297,17 @@ at the top-level directory.
  *                        off-diagonal.
  *           = MY_PERMR:  use the ordering given in ScalePermstruct->perm_r
  *                        input by the user.
- *           
+ *
  *         o ColPerm (colperm_t)
  *           Specifies what type of column permutation to use to reduce fill.
  *           = NATURAL:       natural ordering.
  *           = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A.
  *           = MMD_ATA:       minimum degree ordering on structure of A'*A.
  *           = MY_PERMC:      the ordering given in ScalePermstruct->perm_c.
- *         
+ *
  *         o ReplaceTinyPivot (yes_no_t)
  *           = NO:  do not modify pivots
- *           = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during 
+ *           = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during
  *                  LU factorization.
  *
  *         o IterRefine (IterRefine_t)
@@ -354,7 +354,7 @@ at the top-level directory.
  *                      diag(R).
  *           = COL:     Column equilibration, i.e., A was postmultiplied
  *                      by diag(C).
- *           = BOTH:    both row and column equilibration, i.e., A was 
+ *           = BOTH:    both row and column equilibration, i.e., A was
  *                      replaced by diag(R)*A*diag(C).
  *           If options->Fact = FACTORED or SamePattern_SameRowPerm,
  *           DiagScale is an input argument; otherwise it is an output
@@ -368,8 +368,8 @@ at the top-level directory.
  *           input argument; otherwise it is an output argument.
  *
  *         o perm_c (int*)
- *           Column permutation vector, which defines the 
- *           permutation matrix Pc; perm_c[i] = j means column i of A is 
+ *           Column permutation vector, which defines the
+ *           permutation matrix Pc; perm_c[i] = j means column i of A is
  *           in position j in A*Pc.
  *           If options->ColPerm = MY_PERMC or options->Fact = SamePattern
  *           or options->Fact = SamePattern_SameRowPerm, perm_c is an
@@ -381,7 +381,7 @@ at the top-level directory.
  *
  *         o R (double*) dimension (A->nrow)
  *           The row scale factors for A.
- *           If DiagScale = ROW or BOTH, A is multiplied on the left by 
+ *           If DiagScale = ROW or BOTH, A is multiplied on the left by
  *                          diag(R).
  *           If DiagScale = NOEQUIL or COL, R is not defined.
  *           If options->Fact = FACTORED or SamePattern_SameRowPerm, R is
@@ -389,12 +389,12 @@ at the top-level directory.
  *
  *         o C (double*) dimension (A->ncol)
  *           The column scale factors for A.
- *           If DiagScale = COL or BOTH, A is multiplied on the right by 
+ *           If DiagScale = COL or BOTH, A is multiplied on the right by
  *                          diag(C).
  *           If DiagScale = NOEQUIL or ROW, C is not defined.
  *           If options->Fact = FACTORED or SamePattern_SameRowPerm, C is
  *           an input argument; otherwise, C is an output argument.
- *         
+ *
  * B       (input/output) doublecomplex*
  *         On entry, the right-hand side matrix of dimension (A->nrow, nrhs).
  *         On exit, the solution matrix if info = 0;
@@ -446,8 +446,8 @@ at the top-level directory.
  *           See superlu_ddefs.h for the definition of 'LocalLU_t'.
  *
  * berr    (output) double*, dimension (nrhs)
- *         The componentwise relative backward error of each solution   
- *         vector X(j) (i.e., the smallest relative change in   
+ *         The componentwise relative backward error of each solution
+ *         vector X(j) (i.e., the smallest relative change in
  *         any element of A or B that makes X(j) an exact solution).
  *
  * stat   (output) SuperLUStat_t*
@@ -468,7 +468,7 @@ at the top-level directory.
  * </pre>
  */
 void
-pzgssvx_ABglobal(superlu_dist_options_t *options, SuperMatrix *A, 
+pzgssvx_ABglobal(superlu_dist_options_t *options, SuperMatrix *A,
 		 ScalePermstruct_t *ScalePermstruct,
 		 doublecomplex B[], int ldb, int nrhs, gridinfo_t *grid,
 		 LUstruct_t *LUstruct, double *berr,
@@ -485,7 +485,7 @@ pzgssvx_ABglobal(superlu_dist_options_t
 		                 supernodes in L.
           	   (usub, xusub) contains the compressed subscript of
 		                 nonzero segments in U.
-	      If options->Fact != SamePattern_SameRowPerm, they are 
+	      If options->Fact != SamePattern_SameRowPerm, they are
 	      computed by SYMBFACT routine, and then used by DDISTRIBUTE
 	      routine. They will be freed after DDISTRIBUTE routine.
 	      If options->Fact == SamePattern_SameRowPerm, these
@@ -577,12 +577,12 @@ pzgssvx_ABglobal(superlu_dist_options_t
 		ScalePermstruct->R = R;
 		ScalePermstruct->C = C;
 		break;
-	    case ROW: 
+	    case ROW:
 	        if ( !(C = (double *) doubleMalloc_dist(n)) )
 		    ABORT("Malloc fails for C[].");
 		ScalePermstruct->C = C;
 		break;
-	    case COL: 
+	    case COL:
 		if ( !(R = (double *) doubleMalloc_dist(m)) )
 		    ABORT("Malloc fails for R[].");
 		ScalePermstruct->R = R;
@@ -617,7 +617,7 @@ pzgssvx_ABglobal(superlu_dist_options_t
 		    for (i = colptr[j]; i < colptr[j+1]; ++i)
 			zd_mult(&a[i], &a[i], C[j]); /* Scale columns. */
 		break;
-	      case BOTH: 
+	      case BOTH:
 		for (j = 0; j < n; ++j) {
 		    for (i = colptr[j]; i < colptr[j+1]; ++i) {
 			irow = rowind[i];
@@ -631,7 +631,7 @@ pzgssvx_ABglobal(superlu_dist_options_t
 	    if ( !iam ) {
 		/* Compute row and column scalings to equilibrate matrix A. */
 		zgsequ_dist(A, R, C, &rowcnd, &colcnd, &amax, &iinfo);
-	    
+
 		MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm );
 		if ( iinfo == 0 ) {
 		    MPI_Bcast( R,       m, MPI_DOUBLE, 0, grid->comm );
@@ -643,12 +643,12 @@ pzgssvx_ABglobal(superlu_dist_options_t
 		    if ( iinfo > 0 ) {
 			if ( iinfo <= m ) {
 #if ( PRNTlevel>=1 )
-			    fprintf(stderr, "The " IFMT "-th row of A is exactly zero\n", 
+			    fprintf(stderr, "The " IFMT "-th row of A is exactly zero\n",
 				    iinfo);
 #endif
 			} else {
 #if ( PRNTlevel>=1 )
-                            fprintf(stderr, "The " IFMT "-th column of A is exactly zero\n", 
+                            fprintf(stderr, "The " IFMT "-th column of A is exactly zero\n",
 				     iinfo-n);
 #endif
                         }
@@ -662,9 +662,9 @@ pzgssvx_ABglobal(superlu_dist_options_t
 		    MPI_Bcast( &rowcnd, 1, MPI_DOUBLE, 0, grid->comm );
 		    MPI_Bcast( &colcnd, 1, MPI_DOUBLE, 0, grid->comm );
 		    MPI_Bcast( &amax,   1, MPI_DOUBLE, 0, grid->comm );
-		} 
+		}
 	    }
-	
+
             if ( iinfo == 0 ) {
 	        /* Equilibrate matrix A. */
 	        zlaqgs_dist(A, R, C, rowcnd, colcnd, amax, equed);
@@ -694,9 +694,9 @@ pzgssvx_ABglobal(superlu_dist_options_t
 	CHECK_MALLOC(iam, "Exit equil");
 #endif
     } /* end if Equil ... */
-    
+
     /* ------------------------------------------------------------
-       Permute rows of A. 
+       Permute rows of A.
        ------------------------------------------------------------*/
     if ( options->RowPerm != NO ) {
 	t = SuperLU_timer_();
@@ -712,7 +712,7 @@ pzgssvx_ABglobal(superlu_dist_options_t
 	} else if ( !factored ) {
 	    if ( job == 5 ) {
 		/* Allocate storage for scaling factors. */
-		if ( !(R1 = (double *) SUPERLU_MALLOC(m * sizeof(double))) ) 
+		if ( !(R1 = (double *) SUPERLU_MALLOC(m * sizeof(double))) )
 		    ABORT("SUPERLU_MALLOC fails for R1[]");
 		if ( !(C1 = (double *) SUPERLU_MALLOC(n * sizeof(double))) )
 		    ABORT("SUPERLU_MALLOC fails for C1[]");
@@ -723,7 +723,7 @@ pzgssvx_ABglobal(superlu_dist_options_t
 		iinfo = zldperm_dist(job, m, nnz, colptr, rowind, a,
                                 perm_r, R1, C1);
 
-                MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm );		
+                MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm );
 		if ( iinfo == 0 ) {
 		    MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm );
 		    if ( job == 5 && Equil ) {
@@ -777,7 +777,7 @@ pzgssvx_ABglobal(superlu_dist_options_t
 		    else for (i = 0; i < m; ++i) R[i] = R1[i];
 		    if ( colequ ) for (i = 0; i < n; ++i) C[i] *= C1[i];
 		    else for (i = 0; i < n; ++i) C[i] = C1[i];
-		    
+
 		    ScalePermstruct->DiagScale = BOTH;
 		    rowequ = colequ = 1;
 		} else { /* No equilibration. */
@@ -821,12 +821,12 @@ pzgssvx_ABglobal(superlu_dist_options_t
 		if ( !iam ) printf("\t product of diagonal %e\n", dprod);
 	    }
 #endif
-	    
+
         } /* else !factored */
 
 	t = SuperLU_timer_() - t;
 	stat->utime[ROWPERM] = t;
-    
+
     } else { /* options->RowPerm == NOROWPERM */
         for (i = 0; i < m; ++i) perm_r[i] = i;
     }
@@ -845,7 +845,7 @@ pzgssvx_ABglobal(superlu_dist_options_t
 	t = SuperLU_timer_();
 	/*
 	 * Get column permutation vector perm_c[], according to permc_spec:
-	 *   permc_spec = NATURAL:  natural ordering 
+	 *   permc_spec = NATURAL:  natural ordering
 	 *   permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A
 	 *   permc_spec = MMD_ATA:  minimum degree on structure of A'*A
 	 *   permc_spec = MY_PERMC: the ordering already supplied in perm_c[]
@@ -863,7 +863,7 @@ pzgssvx_ABglobal(superlu_dist_options_t
 
 	/* Form Pc*A*Pc' to preserve the diagonal of the matrix Pr*A. */
 	ACstore = AC.Store;
-	for (j = 0; j < n; ++j) 
+	for (j = 0; j < n; ++j)
 	    for (i = ACstore->colbeg[j]; i < ACstore->colend[j]; ++i) {
 		irow = ACstore->rowind[i];
 		ACstore->rowind[i] = perm_c[irow];
@@ -873,8 +873,8 @@ pzgssvx_ABglobal(superlu_dist_options_t
 	/* Perform a symbolic factorization on matrix A and set up the
 	   nonzero data structures which are suitable for supernodal GENP. */
 	if ( Fact != SamePattern_SameRowPerm ) {
-#if ( PRNTlevel>=1 ) 
-	    if ( !iam ) 
+#if ( PRNTlevel>=1 )
+	    if ( !iam )
 		printf(".. symbfact(): relax " IFMT ", maxsuper " IFMT ", fill " IFMT "\n",
 		       sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6));
 #endif
@@ -883,23 +883,23 @@ pzgssvx_ABglobal(superlu_dist_options_t
 		   SUPERLU_MALLOC(sizeof(Glu_freeable_t))) )
 		ABORT("Malloc fails for Glu_freeable.");
 
-	    iinfo = symbfact(options, iam, &AC, perm_c, etree, 
+	    iinfo = symbfact(options, iam, &AC, perm_c, etree,
 			     Glu_persist, Glu_freeable);
 
 	    stat->utime[SYMBFAC] = SuperLU_timer_() - t;
 
 	    if ( iinfo <= 0 ) {
 		QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage);
-#if ( PRNTlevel>=1 ) 
+#if ( PRNTlevel>=1 )
 		if ( !iam ) {
 		    printf("\tNo of supers " IFMT "\n", Glu_persist->supno[n-1]+1);
 		    printf("\tSize of G(L) " IFMT "\n", Glu_freeable->xlsub[n]);
 		    printf("\tSize of G(U) " IFMT "\n", Glu_freeable->xusub[n]);
-		    printf("\tint %d, short %d, float %d, double %d\n", 
-			   (int) sizeof(int_t), (int) sizeof(short), 
+		    printf("\tint %d, short %d, float %d, double %d\n",
+			   (int) sizeof(int_t), (int) sizeof(short),
  			   (int) sizeof(float), (int) sizeof(double));
 		    printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions " IFMT "\n",
-			   symb_mem_usage.for_lu*1e-6, 
+			   symb_mem_usage.for_lu*1e-6,
 			   symb_mem_usage.total*1e-6,
 			   symb_mem_usage.expansions);
 		}
@@ -909,7 +909,7 @@ pzgssvx_ABglobal(superlu_dist_options_t
 		if ( !iam )
 		    fprintf(stderr, "symbfact() error returns " IFMT "\n", iinfo);
 #endif
-                *info = iinfo;  
+                *info = iinfo;
                 return;
 	    }
 	}
@@ -961,14 +961,14 @@ pzgssvx_ABglobal(superlu_dist_options_t
 	    }
 	}
 #endif
-    
+
     } else if ( options->IterRefine ) { /* options->Fact==FACTORED */
 	/* Permute columns of A to form A*Pc' using the existing perm_c.
 	 * NOTE: rows of A were previously permuted to Pc*A.
 	 */
 	sp_colorder(options, A, perm_c, NULL, &AC);
     } /* if !factored ... */
-	
+
     /* ------------------------------------------------------------
        Compute the solution matrix X.
        ------------------------------------------------------------*/
@@ -978,7 +978,7 @@ pzgssvx_ABglobal(superlu_dist_options_t
 	    ABORT("Malloc fails for b_work[]");
 
 	/* ------------------------------------------------------------
-	   Scale the right-hand side if equilibration was performed. 
+	   Scale the right-hand side if equilibration was performed.
 	   ------------------------------------------------------------*/
 	if ( notran ) {
 	    if ( rowequ ) {
@@ -1056,7 +1056,7 @@ pzgssvx_ABglobal(superlu_dist_options_t
 	    x_col = &X[j*ldx];
 	    for (i = 0; i < n; ++i) b_col[i] = x_col[perm_c[i]];
 	}
-	
+
 	/* Transform the solution matrix X to a solution of the original system
 	   before the equilibration. */
 	if ( notran ) {
@@ -1091,10 +1091,10 @@ pzgssvx_ABglobal(superlu_dist_options_t
 	        SUPERLU_FREE(R);
 		SUPERLU_FREE(C);
 		break;
-	    case ROW: 
+	    case ROW:
 		SUPERLU_FREE(C);
 		break;
-	    case COL: 
+	    case COL:
 		SUPERLU_FREE(R);
 		break;
 	}
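
For orientation, a minimal call-sequence sketch for this global-input driver is given below. Only the pzgssvx_ABglobal argument list is taken from this file; the surrounding init/create/free helpers follow the usual SuperLU_DIST example drivers and their exact signatures vary slightly between releases, so treat them as illustrative rather than definitive.

    #include <mpi.h>
    #include "superlu_zdefs.h"

    /* Factor and solve A*X = B with the ABglobal (replicated-input) driver. */
    int solve_example(int_t m, int_t n, int_t nnz, doublecomplex *a,
                      int_t *rowind, int_t *colptr, doublecomplex *b,
                      int ldb, int nrhs, int nprow, int npcol)
    {
        superlu_dist_options_t options;
        ScalePermstruct_t ScalePermstruct;
        LUstruct_t LUstruct;
        SuperLUStat_t stat;
        SuperMatrix A;
        gridinfo_t grid;
        double berr[1];   /* one entry per right-hand side; assumes nrhs == 1 */
        int info;

        superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid);
        if (grid.iam >= nprow * npcol) {      /* not part of the 2D grid */
            superlu_gridexit(&grid);
            return 0;
        }

        /* ABglobal: every process holds the whole matrix and right-hand side. */
        zCreate_CompCol_Matrix_dist(&A, m, n, nnz, a, rowind, colptr,
                                    SLU_NC, SLU_Z, SLU_GE);

        set_default_options_dist(&options);   /* DOFACT, Equil, RowPerm, ... */
        ScalePermstructInit(m, n, &ScalePermstruct);
        LUstructInit(n, &LUstruct);
        PStatInit(&stat);

        pzgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs,
                         &grid, &LUstruct, berr, &stat, &info);
        /* On exit with info == 0, b holds the solution and berr[j] the
         * componentwise backward error of right-hand side j. */

        PStatFree(&stat);
        Destroy_CompCol_Matrix_dist(&A);      /* also frees a, rowind, colptr */
        ScalePermstructFree(&ScalePermstruct);
        Destroy_LU(n, &grid, &LUstruct);
        LUstructFree(&LUstruct);
        superlu_gridexit(&grid);
        return info;
    }
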
diff -pruN 6.1.0+dfsg1-1/SRC/pzgssvx.c 6.1.1+dfsg1-1/SRC/pzgssvx.c
--- 6.1.0+dfsg1-1/SRC/pzgssvx.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/pzgssvx.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,15 +1,15 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
-/*! @file 
+/*! @file
  * \brief Solves a system of linear equations A*X=B
  *
  * <pre>
@@ -64,7 +64,7 @@ at the top-level directory.
  *                   |    .      |        |. |
  *                   |    .      |        |. |
  *                 ---------------       ------
- * 
+ *
  * where, fst_row is the row number of the first row,
  *        m_loc is the number of rows local to this processor
  * These are defined in the 'SuperMatrix' structure, see supermatrix.h.
@@ -79,7 +79,7 @@ at the top-level directory.
  *            and its dimensions ldb (local) and nrhs (global)
  *      -  grid, a structure describing the 2D processor mesh
  *      -  options->IterRefine, which determines whether or not to
- *            improve the accuracy of the computed solution using 
+ *            improve the accuracy of the computed solution using
  *            iterative refinement
  *
  *      On output, B is overwritten with the solution X.
@@ -87,8 +87,8 @@ at the top-level directory.
  *   2. Depending on options->Fact, the user has four options
  *      for solving A*X=B. The standard option is for factoring
  *      A "from scratch". (The other options, described below,
- *      are used when A is sufficiently similar to a previously 
- *      solved problem to save time by reusing part or all of 
+ *      are used when A is sufficiently similar to a previously
+ *      solved problem to save time by reusing part or all of
  *      the previous factorization.)
  *
  *      -  options->Fact = DOFACT: A is factored "from scratch"
@@ -117,7 +117,7 @@ at the top-level directory.
  *                             (to control numerical stability)
  *
  *      The outputs returned include
- *         
+ *
  *        o  ScalePermstruct,  modified to describe how the input matrix A
  *                             was equilibrated and permuted:
  *          .  ScalePermstruct->DiagScale, indicates whether the rows and/or
@@ -128,15 +128,15 @@ at the top-level directory.
  *          .  ScalePermstruct->perm_c, column permutation vector
  *
  *          (part of ScalePermstruct may also need to be supplied on input,
- *           depending on options->RowPerm and options->ColPerm as described 
+ *           depending on options->RowPerm and options->ColPerm as described
  *           later).
  *
  *        o  A, the input matrix A overwritten by the scaled and permuted
- *              matrix diag(R)*A*diag(C)*Pc^T, where 
+ *              matrix diag(R)*A*diag(C)*Pc^T, where
  *              Pc is the row permutation matrix determined by
  *                  ScalePermstruct->perm_c
  *              diag(R) and diag(C) are diagonal scaling matrices determined
- *                  by ScalePermstruct->DiagScale, ScalePermstruct->R and 
+ *                  by ScalePermstruct->DiagScale, ScalePermstruct->R and
  *                  ScalePermstruct->C
  *
  *        o  LUstruct, which contains the L and U factorization of A1 where
@@ -148,7 +148,7 @@ at the top-level directory.
  *
  *   3. The second value of options->Fact assumes that a matrix with the same
  *      sparsity pattern as A has already been factored:
- *     
+ *
  *      -  options->Fact = SamePattern: A is factored, assuming that it has
  *            the same nonzero pattern as a previously factored matrix. In
  *            this case the algorithm saves time by reusing the previously
@@ -165,14 +165,14 @@ at the top-level directory.
  *
  *      but not options->ColPerm, whose value is ignored. This is because the
  *      previous column permutation from ScalePermstruct->perm_c is used as
- *      input. The user must also supply 
+ *      input. The user must also supply
  *
  *        o  A, the input matrix
  *        o  ScalePermstruct->perm_c, the column permutation
  *        o  LUstruct->etree, the elimination tree
  *
  *      The outputs returned include
- *         
+ *
  *        o  A, the input matrix A overwritten by the scaled and permuted
  *              matrix as described above
  *        o  ScalePermstruct, modified to describe how the input matrix A was
@@ -200,25 +200,25 @@ at the top-level directory.
  *      ignored. This is because the permutations from ScalePermstruct->perm_r
  *      and ScalePermstruct->perm_c are used as input.
  *
- *      The user must also supply 
+ *      The user must also supply
  *
  *        o  A, the input matrix
  *        o  ScalePermstruct->DiagScale, how the previous matrix was row
  *                                       and/or column scaled
  *        o  ScalePermstruct->R, the row scalings of the previous matrix,
  *                               if any
- *        o  ScalePermstruct->C, the columns scalings of the previous matrix, 
+ *        o  ScalePermstruct->C, the columns scalings of the previous matrix,
  *                               if any
  *        o  ScalePermstruct->perm_r, the row permutation of the previous
  *                                    matrix
- *        o  ScalePermstruct->perm_c, the column permutation of the previous 
+ *        o  ScalePermstruct->perm_c, the column permutation of the previous
  *                                    matrix
  *        o  all of LUstruct, the previously computed information about
  *                            L and U (the actual numerical values of L and U
  *                            stored in LUstruct->Llu are ignored)
  *
  *      The outputs returned include
- *         
+ *
  *        o  A, the input matrix A overwritten by the scaled and permuted
  *              matrix as described above
  *        o  ScalePermstruct,  modified to describe how the input matrix A was
@@ -227,7 +227,7 @@ at the top-level directory.
  *        o  LUstruct, modified to contain the new L and U factors
  *
  *   5. The fourth and last value of options->Fact assumes that A is
- *      identical to a matrix that has already been factored on a previous 
+ *      identical to a matrix that has already been factored on a previous
  *      call, and reuses its entire LU factorization
  *
  *      -  options->Fact = Factored: A is identical to a previously
@@ -235,10 +235,10 @@ at the top-level directory.
  *            can be reused.
  *
  *      In this case all the other options mentioned above are ignored
- *      (options->Equil, options->RowPerm, options->ColPerm, 
+ *      (options->Equil, options->RowPerm, options->ColPerm,
  *       options->ReplaceTinyPivot)
  *
- *      The user must also supply 
+ *      The user must also supply
  *
  *        o  A, the unfactored matrix, only in the case that iterative
  *              refinement is to be done (specifically A must be the output
@@ -248,7 +248,7 @@ at the top-level directory.
  *           L and U
  *
  *      all of which are unmodified on output.
- *         
+ *
  * Arguments
  * =========
  *
@@ -256,7 +256,7 @@ at the top-level directory.
  *         The structure defines the input parameters to control
  *         how the LU decomposition will be performed.
  *         The following fields should be defined for this structure:
- *         
+ *
  *         o Fact (fact_t)
  *           Specifies whether or not the factored form of the matrix
  *           A is supplied on entry, and if not, how the matrix A should
@@ -266,7 +266,7 @@ at the top-level directory.
  *                 Inputs:  A
  *                          options->Equil, RowPerm, ColPerm, ReplaceTinyPivot
  *                 Outputs: modified A
- *                             (possibly row and/or column scaled and/or 
+ *                             (possibly row and/or column scaled and/or
  *                              permuted)
  *                          all of ScalePermstruct
  *                          all of LUstruct
@@ -274,7 +274,7 @@ at the top-level directory.
  *           = SamePattern: the matrix A will be factorized assuming
  *             that a factorization of a matrix with the same sparsity
  *             pattern was performed prior to this one. Therefore, this
- *             factorization will reuse column permutation vector 
+ *             factorization will reuse column permutation vector
  *             ScalePermstruct->perm_c and the elimination tree
  *             LUstruct->etree
  *                 Inputs:  A
@@ -282,7 +282,7 @@ at the top-level directory.
  *                          ScalePermstruct->perm_c
  *                          LUstruct->etree
  *                 Outputs: modified A
- *                             (possibly row and/or column scaled and/or 
+ *                             (possibly row and/or column scaled and/or
  *                              permuted)
  *                          rest of ScalePermstruct (DiagScale, R, C, perm_r)
  *                          rest of LUstruct (GLU_persist, Llu)
@@ -300,7 +300,7 @@ at the top-level directory.
  *                          all of ScalePermstruct
  *                          all of LUstruct
  *                 Outputs: modified A
- *                             (possibly row and/or column scaled and/or 
+ *                             (possibly row and/or column scaled and/or
  *                              permuted)
  *                          modified LUstruct->Llu
  *           = FACTORED: the matrix A is already factored.
@@ -326,20 +326,20 @@ at the top-level directory.
  *           = LargeDiag_APWM: use the parallel approximate-weight perfect
  *                        matching to permute rows of the original matrix
  *                        to make the diagonal large relative to the
- *                        off-diagonal.								   
+ *                        off-diagonal.
  *           = MY_PERMR:  use the ordering given in ScalePermstruct->perm_r
  *                        input by the user.
- *           
+ *
  *         o ColPerm (colperm_t)
  *           Specifies what type of column permutation to use to reduce fill.
  *           = NATURAL:       natural ordering.
  *           = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A.
  *           = MMD_ATA:       minimum degree ordering on structure of A'*A.
  *           = MY_PERMC:      the ordering given in ScalePermstruct->perm_c.
- *         
+ *
  *         o ReplaceTinyPivot (yes_no_t)
  *           = NO:  do not modify pivots
- *           = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during 
+ *           = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during
  *                  LU factorization.
  *
  *         o IterRefine (IterRefine_t)
@@ -380,7 +380,7 @@ at the top-level directory.
  *                      diag(R).
  *           = COL:     Column equilibration, i.e., A was postmultiplied
  *                      by diag(C).
- *           = BOTH:    both row and column equilibration, i.e., A was 
+ *           = BOTH:    both row and column equilibration, i.e., A was
  *                      replaced by diag(R)*A*diag(C).
  *           If options->Fact = FACTORED or SamePattern_SameRowPerm,
  *           DiagScale is an input argument; otherwise it is an output
@@ -394,8 +394,8 @@ at the top-level directory.
  *           input argument; otherwise it is an output argument.
  *
  *         o perm_c (int*)
- *           Column permutation vector, which defines the 
- *           permutation matrix Pc; perm_c[i] = j means column i of A is 
+ *           Column permutation vector, which defines the
+ *           permutation matrix Pc; perm_c[i] = j means column i of A is
  *           in position j in A*Pc.
  *           If options->ColPerm = MY_PERMC or options->Fact = SamePattern
  *           or options->Fact = SamePattern_SameRowPerm, perm_c is an
@@ -407,7 +407,7 @@ at the top-level directory.
  *
  *         o R (double*) dimension (A->nrow)
  *           The row scale factors for A.
- *           If DiagScale = ROW or BOTH, A is multiplied on the left by 
+ *           If DiagScale = ROW or BOTH, A is multiplied on the left by
  *                          diag(R).
  *           If DiagScale = NOEQUIL or COL, R is not defined.
  *           If options->Fact = FACTORED or SamePattern_SameRowPerm, R is
@@ -415,12 +415,12 @@ at the top-level directory.
  *
  *         o C (double*) dimension (A->ncol)
  *           The column scale factors for A.
- *           If DiagScale = COL or BOTH, A is multiplied on the right by 
+ *           If DiagScale = COL or BOTH, A is multiplied on the right by
  *                          diag(C).
  *           If DiagScale = NOEQUIL or ROW, C is not defined.
  *           If options->Fact = FACTORED or SamePattern_SameRowPerm, C is
  *           an input argument; otherwise, C is an output argument.
- *         
+ *
  * B       (input/output) doublecomplex* (local)
  *         On entry, the right-hand side matrix of dimension (m_loc, nrhs),
  *           where, m_loc is the number of rows stored locally on my
@@ -479,8 +479,8 @@ at the top-level directory.
  *         argument. See superlu_zdefs.h for the definition of 'SOLVEstruct_t'.
  *
  * berr    (output) double*, dimension (nrhs) (global)
- *         The componentwise relative backward error of each solution   
- *         vector X(j) (i.e., the smallest relative change in   
+ *         The componentwise relative backward error of each solution
+ *         vector X(j) (i.e., the smallest relative change in
  *         any element of A or B that makes X(j) an exact solution).
  *
  * stat   (output) SuperLUStat_t*
@@ -501,7 +501,7 @@ at the top-level directory.
  */
 
 void
-pzgssvx(superlu_dist_options_t *options, SuperMatrix *A, 
+pzgssvx(superlu_dist_options_t *options, SuperMatrix *A,
 	ScalePermstruct_t *ScalePermstruct,
 	doublecomplex B[], int ldb, int nrhs, gridinfo_t *grid,
 	LUstruct_t *LUstruct, SOLVEstruct_t *SOLVEstruct, double *berr,
@@ -521,7 +521,7 @@ pzgssvx(superlu_dist_options_t *options,
 		                 supernodes in L.
           	   (usub, xusub) contains the compressed subscript of
 		                 nonzero segments in U.
-	      If options->Fact != SamePattern_SameRowPerm, they are 
+	      If options->Fact != SamePattern_SameRowPerm, they are
 	      computed by SYMBFACT routine, and then used by PDDISTRIBUTE
 	      routine. They will be freed after PDDISTRIBUTE routine.
 	      If options->Fact == SamePattern_SameRowPerm, these
@@ -552,7 +552,7 @@ pzgssvx(superlu_dist_options_t *options,
     int_t nsupers,nsupers_j;
     int_t lk,k,knsupc,nsupr;
     int_t  *lsub,*xsup;
-    doublecomplex *lusup;	
+    doublecomplex *lusup;
 #if ( PRNTlevel>= 2 )
     double   dmin, dsum, dprod;
 #endif
@@ -566,7 +566,7 @@ pzgssvx(superlu_dist_options_t *options,
     int   col, key; /* parameters for creating a new communicator */
     Pslu_freeable_t Pslu_freeable;
     float  flinfo;
-	
+
     /* Initialization. */
     m       = A->nrow;
     n       = A->ncol;
@@ -582,7 +582,7 @@ pzgssvx(superlu_dist_options_t *options,
     symb_comm = MPI_COMM_NULL;
     num_mem_usage.for_lu = num_mem_usage.total = 0.0;
     symb_mem_usage.for_lu = symb_mem_usage.total = 0.0;
-	
+
     /* Test the input parameters. */
     *info = 0;
     Fact = options->Fact;
@@ -619,7 +619,7 @@ pzgssvx(superlu_dist_options_t *options,
     Equil = (!factored && options->Equil == YES);
     notran = (options->Trans == NOTRANS);
     parSymbFact = options->ParSymbFact;
-	
+
     iam = grid->iam;
     job = 5;
     if ( factored || (Fact == SamePattern_SameRowPerm && Equil) ) {
@@ -642,7 +642,7 @@ pzgssvx(superlu_dist_options_t *options,
 #endif
 
     /* Not factored & ask for equilibration */
-    if ( Equil && Fact != SamePattern_SameRowPerm ) { 
+    if ( Equil && Fact != SamePattern_SameRowPerm ) {
 	/* Allocate storage if not done so before. */
 	switch ( ScalePermstruct->DiagScale ) {
 	    case NOEQUIL:
@@ -653,12 +653,12 @@ pzgssvx(superlu_dist_options_t *options,
 		ScalePermstruct->R = R;
 		ScalePermstruct->C = C;
 		break;
-	    case ROW: 
+	    case ROW:
 	        if ( !(C = (double *) doubleMalloc_dist(n)) )
 		    ABORT("Malloc fails for C[].");
 		ScalePermstruct->C = C;
 		break;
-	    case COL: 
+	    case COL:
 		if ( !(R = (double *) doubleMalloc_dist(m)) )
 		    ABORT("Malloc fails for R[].");
 		ScalePermstruct->R = R;
@@ -728,7 +728,7 @@ pzgssvx(superlu_dist_options_t *options,
 
 	    /* Now iinfo == 0 */
 
-            /* Equilibrate matrix A if it is badly-scaled. 
+            /* Equilibrate matrix A if it is badly-scaled.
                A <-- diag(R)*A*diag(C)                     */
 	    pzlaqgs(A, R, C, rowcnd, colcnd, amax, equed);
 
@@ -795,7 +795,7 @@ pzgssvx(superlu_dist_options_t *options,
 	        if ( options->RowPerm == MY_PERMR ) { /* Use user's perm_r. */
 	            /* Permute the global matrix GA for symbfact() */
 	            for (i = 0; i < colptr[n]; ++i) {
-	            	irow = rowind[i]; 
+	            	irow = rowind[i];
 		    	rowind[i] = perm_r[irow];
 	            }
 	        } else if ( options->RowPerm == LargeDiag_MC64 ) {
@@ -811,7 +811,7 @@ pzgssvx(superlu_dist_options_t *options,
 	            if ( !iam ) { /* Process 0 finds a row permutation */
 		        iinfo = zldperm_dist(job, m, nnz, colptr, rowind, a_GA,
 		                perm_r, R1, C1);
-		
+
                         MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm );
 		        if ( iinfo == 0 ) {
 		            MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm );
@@ -876,7 +876,7 @@ pzgssvx(superlu_dist_options_t *options,
 		            else for (i = 0; i < m; ++i) R[i] = R1[i];
 		            if ( colequ ) for (i = 0; i < n; ++i) C[i] *= C1[i];
 		            else for (i = 0; i < n; ++i) C[i] = C1[i];
-		    
+
 		            ScalePermstruct->DiagScale = BOTH;
 		            rowequ = colequ = 1;
 
@@ -952,14 +952,14 @@ pzgssvx(superlu_dist_options_t *options,
     }
 
     /* ------------------------------------------------------------
-       Perform the LU factorization: symbolic factorization, 
+       Perform the LU factorization: symbolic factorization,
        redistribution, and numerical factorization.
        ------------------------------------------------------------*/
     if ( !factored ) {
 	t = SuperLU_timer_();
 	/*
 	 * Get column permutation vector perm_c[], according to permc_spec:
-	 *   permc_spec = NATURAL:  natural ordering 
+	 *   permc_spec = NATURAL:  natural ordering
 	 *   permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A
 	 *   permc_spec = MMD_ATA:  minimum degree on structure of A'*A
 	 *   permc_spec = METIS_AT_PLUS_A: METIS on structure of A'+A
@@ -983,7 +983,7 @@ pzgssvx(superlu_dist_options_t *options,
 		if ( permc_spec == NATURAL ) {
 		     for (j = 0; j < n; ++j) perm_c[j] = j;
                 }
-		if ( !(sizes = intMalloc_dist(2 * noDomains)) ) 
+		if ( !(sizes = intMalloc_dist(2 * noDomains)) )
 		     ABORT("SUPERLU_MALLOC fails for sizes.");
 		if ( !(fstVtxSep = intMalloc_dist(2 * noDomains)) )
 		    ABORT("SUPERLU_MALLOC fails for fstVtxSep.");
@@ -1002,10 +1002,10 @@ pzgssvx(superlu_dist_options_t *options,
 	if ( permc_spec != MY_PERMC && Fact == DOFACT ) {
           /* Reuse perm_c if Fact == SamePattern, or SamePattern_SameRowPerm */
 	  if ( permc_spec == PARMETIS ) {
-	// #pragma omp parallel  
-    // {  	
+	// #pragma omp parallel
+    // {
 	// #pragma omp master
-	// {	
+	// {
 	      /* Get column permutation vector in perm_c.                    *
 	       * This routine takes as input the distributed input matrix A  *
 	       * and does not modify it.  It also allocates memory for       *
@@ -1039,9 +1039,9 @@ pzgssvx(superlu_dist_options_t *options,
 	        /* Compute the elimination tree of Pc*(A^T+A)*Pc^T or Pc*A^T*A*Pc^T
 	           (a.k.a. column etree), depending on the choice of ColPerm.
 	           Adjust perm_c[] to be consistent with a postorder of etree.
-	           Permute columns of A to form A*Pc'. 
+	           Permute columns of A to form A*Pc'.
 		   After this routine, GAC = GA*Pc^T.  */
-	        sp_colorder(options, &GA, perm_c, etree, &GAC); 
+	        sp_colorder(options, &GA, perm_c, etree, &GAC);
 
 	        /* Form Pc*A*Pc^T to preserve the diagonal of the matrix GAC. */
 	        GACstore = (NCPformat *) GAC.Store;
@@ -1057,7 +1057,7 @@ pzgssvx(superlu_dist_options_t *options,
 
 	        /* Perform a symbolic factorization on Pc*Pr*A*Pc^T and set up
                    the nonzero data structures for L & U. */
-#if ( PRNTlevel>=1 ) 
+#if ( PRNTlevel>=1 )
                 if ( !iam ) {
 		    printf(".. symbfact(): relax " IFMT ", maxsuper " IFMT ", fill " IFMT "\n",
 		          sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6));
@@ -1070,7 +1070,7 @@ pzgssvx(superlu_dist_options_t *options,
 		    ABORT("Malloc fails for Glu_freeable.");
 
 	    	/* Every process does this. */
-	    	iinfo = symbfact(options, iam, &GAC, perm_c, etree, 
+	    	iinfo = symbfact(options, iam, &GAC, perm_c, etree,
 			     	 Glu_persist, Glu_freeable);
 			nnzLU = Glu_freeable->nnzLU;
 	    	stat->utime[SYMBFAC] = SuperLU_timer_() - t;
@@ -1081,11 +1081,11 @@ pzgssvx(superlu_dist_options_t *options,
 		    	printf("\tNo of supers " IFMT "\n", Glu_persist->supno[n-1]+1);
 		    	printf("\tSize of G(L) " IFMT "\n", Glu_freeable->xlsub[n]);
 		    	printf("\tSize of G(U) " IFMT "\n", Glu_freeable->xusub[n]);
-		    	printf("\tint %d, short %d, float %d, double %d\n", 
+		    	printf("\tint %d, short %d, float %d, double %d\n",
 			       (int) sizeof(int_t), (int) sizeof(short),
         		       (int) sizeof(float), (int) sizeof(double));
 		    	printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions " IFMT "\n",
-			   	symb_mem_usage.for_lu*1e-6, 
+			   	symb_mem_usage.for_lu*1e-6,
 			   	symb_mem_usage.total*1e-6,
 			   	symb_mem_usage.expansions);
 			fflush(stdout);
@@ -1103,9 +1103,9 @@ pzgssvx(superlu_dist_options_t *options,
 	    else {  /* parallel symbolic factorization */
 	    	t = SuperLU_timer_();
 	    	flinfo = symbfact_dist(nprocs_num, noDomains, A, perm_c, perm_r,
-				       sizes, fstVtxSep, &Pslu_freeable, 
+				       sizes, fstVtxSep, &Pslu_freeable,
 				       &(grid->comm), &symb_comm,
-				       &symb_mem_usage); 
+				       &symb_mem_usage);
 			nnzLU = Pslu_freeable.nnzLU;
 	    	stat->utime[SYMBFAC] = SuperLU_timer_() - t;
 	    	if (flinfo > 0) {
@@ -1127,7 +1127,7 @@ pzgssvx(superlu_dist_options_t *options,
 
         if (sizes) SUPERLU_FREE (sizes);
         if (fstVtxSep) SUPERLU_FREE (fstVtxSep);
-	if (symb_comm != MPI_COMM_NULL) MPI_Comm_free (&symb_comm); 
+	if (symb_comm != MPI_COMM_NULL) MPI_Comm_free (&symb_comm);
 
 	/* Distribute entries of A into L & U data structures. */
 	//if (parSymbFact == NO || ???? Fact == SamePattern_SameRowPerm) {
@@ -1136,7 +1136,7 @@ pzgssvx(superlu_dist_options_t *options,
   	    /* Apply column permutation to the original distributed A */
 	    for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]];
 
-	    /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc^T into L and U storage. 
+	    /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc^T into L and U storage.
 	       NOTE: the row permutation Pc*Pr is applied internally in the
   	       distribution routine. */
 	    t = SuperLU_timer_();
@@ -1150,7 +1150,7 @@ pzgssvx(superlu_dist_options_t *options,
 	        SUPERLU_FREE(Glu_freeable);
 	    }
 	} else { /* CASE OF PARALLEL SYMBOLIC */
-	    /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. 
+	    /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage.
 	       NOTE: the row permutation Pc*Pr is applied internally in the
 	       distribution routine. */
 	    /* Apply column permutation to the original distributed A */
@@ -1161,7 +1161,7 @@ pzgssvx(superlu_dist_options_t *options,
 		  			   &Pslu_freeable, LUstruct, grid);
 	    if (dist_mem_use > 0)
 	        ABORT ("Not enough memory available for dist_psymbtonum\n");
-            
+
 	    stat->utime[DIST] = SuperLU_timer_() - t;
 	}
 
@@ -1169,16 +1169,16 @@ pzgssvx(superlu_dist_options_t *options,
 
 	/* Perform numerical factorization in parallel. */
 	t = SuperLU_timer_();
-    // #pragma omp parallel  
-    // {  	
+    // #pragma omp parallel
+    // {
 	// #pragma omp master
 	// {
 	pzgstrf(options, m, n, anorm, LUstruct, grid, stat, info);
 	stat->utime[FACT] = SuperLU_timer_() - t;
 	// }
 	// }
-	
-	
+
+
 #if ( PRNTlevel>=2 )
     /* ------------------------------------------------------------
        SUM OVER ALL ENTRIES OF A AND PRINT NNZ AND SIZE OF A.
@@ -1195,15 +1195,15 @@ pzgssvx(superlu_dist_options_t *options,
 		z_add(&asum,&asum,&nzval_a[j]);
 	}
     }
-	
+
 	nsupers = Glu_persist->supno[n-1] + 1;
 	nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */
-	
-	
-	
+
+
+
 	lsum.r=0.0;
 	lsum.i=0.0;
-	for (lk=0;lk<nsupers_j;++lk){	
+	for (lk=0;lk<nsupers_j;++lk){
 		lsub = LUstruct->Llu->Lrowind_bc_ptr[lk];
 		lusup = LUstruct->Llu->Lnzval_bc_ptr[lk];
 		if(lsub){
@@ -1211,23 +1211,23 @@ pzgssvx(superlu_dist_options_t *options,
 			knsupc = SuperSize( k );
 			nsupr = lsub[1];
 			for (j=0; j<knsupc; ++j)
-				for (i = 0; i < nsupr; ++i) 
+				for (i = 0; i < nsupr; ++i)
 					z_add(&lsum,&lsum,&lusup[j*nsupr+i]);
 		}
 	}
-	
-	
+
+
 	MPI_Allreduce( &(asum.r), &(asum_tot.r),1, MPI_DOUBLE, MPI_SUM, grid->comm );
 	MPI_Allreduce( &(asum.i), &(asum_tot.i),1, MPI_DOUBLE, MPI_SUM, grid->comm );
 	MPI_Allreduce( &(lsum.r), &(lsum_tot.r),1, MPI_DOUBLE, MPI_SUM, grid->comm );
 	MPI_Allreduce( &(lsum.i), &(lsum_tot.i),1, MPI_DOUBLE, MPI_SUM, grid->comm );
-	
+
 
 	MPI_Allreduce( &Astore->rowptr[Astore->m_loc], &nnz_tot,1, mpi_int_t, MPI_SUM, grid->comm );
 	// MPI_Bcast( &nnzLU, 1, mpi_int_t, 0, grid->comm );
-	
+
 	MPI_Comm_rank( MPI_COMM_WORLD, &iam_g );
-	
+
     if (!iam_g) {
 	print_options_dist(options);
 	fflush(stdout);
@@ -1235,8 +1235,8 @@ pzgssvx(superlu_dist_options_t *options,
 
     printf(".. Ainfo mygid %5d   mysid %5d   nnz_loc " IFMT "  sum_loc  %e lsum_loc   %e nnz "IFMT " nnzLU %ld sum %e  lsum %e  N "IFMT "\n", iam_g,iam,Astore->rowptr[Astore->m_loc],asum.r+asum.i, lsum.r+lsum.i, nnz_tot,nnzLU,asum_tot.r+asum_tot.i,lsum_tot.r+lsum_tot.i,A->ncol);
 	fflush(stdout);
-#endif				
-			
+#endif
+
 #if 0
 
 // #ifdef GPU_PROF
@@ -1247,7 +1247,7 @@ pzgssvx(superlu_dist_options_t *options,
 
 //      ttemp = getenv("IO_FILE");
 //      if(ttemp!=NULL)
-//      {   
+//      {
 //          printf("File being opend is %s\n",ttemp );
 //          FILE* fp;
 //          fp = fopen(ttemp,"w");
@@ -1297,7 +1297,7 @@ pzgssvx(superlu_dist_options_t *options,
                              num_mem_usage.for_lu  /* distribution step */
                        );
             }
-            
+
 	    temp = SUPERLU_MAX(temp, num_mem_usage.total);
 
 	    MPI_Reduce( &temp, &max,
@@ -1320,17 +1320,17 @@ pzgssvx(superlu_dist_options_t *options,
 		       for_lu * 1e-6, total * 1e-6);
                 printf("** Total highmark (MB):\n"
 		       "    Sum-of-all : %8.2f | Avg : %8.2f  | Max : %8.2f\n",
-		       avg * 1e-6,  
+		       avg * 1e-6,
 		       avg / grid->nprow / grid->npcol * 1e-6,
 		       max * 1e-6);
 		printf("**************************************************\n");
 		fflush(stdout);
             }
 	} /* end printing stats */
-    
+
     } /* end if (!factored) */
 
-    
+
     if ( options->Fact == DOFACT || options->Fact == SamePattern ) {
 	/* Need to reset the solve's communication pattern,
 	   because perm_r[] and/or perm_c[] is changed.    */
@@ -1343,11 +1343,11 @@ pzgssvx(superlu_dist_options_t *options,
     /* Need to revisit: Why the following is not good enough for X-to-B
        distribution -- inv_perm_c changed */
 	pxgstrs_finalize(SOLVEstruct->gstrs_comm);
-	pxgstrs_init(A->ncol, m_loc, nrhs, fst_row, perm_r, perm_c, grid, 
+	pxgstrs_init(A->ncol, m_loc, nrhs, fst_row, perm_r, perm_c, grid,
 	             LUstruct->Glu_persist, SOLVEstruct);
 #endif
 
-	
+
     /* ------------------------------------------------------------
        Compute the solution matrix X.
        ------------------------------------------------------------*/
@@ -1357,7 +1357,7 @@ pzgssvx(superlu_dist_options_t *options,
 	    ABORT("Malloc fails for b_work[]");
 
 	/* ------------------------------------------------------------
-	   Scale the right-hand side if equilibration was performed. 
+	   Scale the right-hand side if equilibration was performed.
 	   ------------------------------------------------------------*/
 	if ( notran ) {
 	    if ( rowequ ) {
@@ -1408,22 +1408,22 @@ pzgssvx(superlu_dist_options_t *options,
 	       factorization with Fact == DOFACT or SamePattern is asked for. */
 	}
 
-	if ( options->DiagInv==YES && 
+	if ( options->DiagInv==YES &&
              (options->SolveInitialized == NO || Fact == SamePattern ||
               Fact == SamePattern_SameRowPerm) ) {
 	    pzCompute_Diag_Inv(n, LUstruct, grid, stat, info);
 	}
 
 
-    // #pragma omp parallel  
-    // {  	
+    // #pragma omp parallel
+    // {
 	// #pragma omp master
 	// {
-	pzgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc, 
+	pzgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc,
 		fst_row, ldb, nrhs, SOLVEstruct, stat, info);
 	// }
 	// }
-	
+
 	/* ------------------------------------------------------------
 	   Use iterative refinement to improve the computed solution and
 	   compute error bounds and backward error estimates for it.
@@ -1442,7 +1442,7 @@ pzgssvx(superlu_dist_options_t *options,
 		    pzgsmv_finalize(SOLVEstruct->gsmv_comm);
 	        pzgsmv_init(A, SOLVEstruct->row_to_proc, grid,
 			    SOLVEstruct->gsmv_comm);
-	       
+
                 /* Save a copy of the transformed local col indices
 		   in colind_gsmv[]. */
 	        if ( colind_gsmv ) SUPERLU_FREE(colind_gsmv);
@@ -1468,7 +1468,7 @@ pzgssvx(superlu_dist_options_t *options,
 		        }
 		    }
 	        }
-	      
+
 	        /* Re-use the local col indices of A obtained from the
 		   previous call to pzgsmv_init() */
 	        for (i = 0; i < nnz_loc; ++i) colind[i] = colind_gsmv[i];
@@ -1478,10 +1478,10 @@ pzgssvx(superlu_dist_options_t *options,
 	        SOLVEstruct1 = SOLVEstruct;
 	    } else { /* For nrhs > 1, since refinement is performed for RHS
 			one at a time, the communication structure for pdgstrs
-			is different than the solve with nrhs RHS. 
+			is different than the solve with nrhs RHS.
 			So we use SOLVEstruct1 for the refinement step.
 		     */
-	        if ( !(SOLVEstruct1 = (SOLVEstruct_t *) 
+	        if ( !(SOLVEstruct1 = (SOLVEstruct_t *)
 		                       SUPERLU_MALLOC(sizeof(SOLVEstruct_t))) )
 		    ABORT("Malloc fails for SOLVEstruct1");
 	        /* Copy the same stuff */
@@ -1492,12 +1492,12 @@ pzgssvx(superlu_dist_options_t *options,
 	        SOLVEstruct1->diag_len = SOLVEstruct->diag_len;
 	        SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm;
 	        SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv;
-		
+
 		/* Initialize the *gstrs_comm for 1 RHS. */
 		if ( !(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *)
 		       SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))) )
 		    ABORT("Malloc fails for gstrs_comm[]");
-		pxgstrs_init(n, m_loc, 1, fst_row, perm_r, perm_c, grid, 
+		pxgstrs_init(n, m_loc, 1, fst_row, perm_r, perm_c, grid,
 			     Glu_persist, SOLVEstruct1);
 	    }
 
@@ -1522,7 +1522,7 @@ pzgssvx(superlu_dist_options_t *options,
 	for (i = 0; i < m_loc; ++i)
 	  printf("\t(%d)\t%4d\t%.10f\n", iam, i+fst_row, B[i]);
 #endif
-	
+
 	/* Transform the solution matrix X to a solution of the original
 	   system before equilibration. */
 	if ( notran ) {
@@ -1565,10 +1565,10 @@ pzgssvx(superlu_dist_options_t *options,
 	        SUPERLU_FREE(R);
 		SUPERLU_FREE(C);
 		break;
-	    case ROW: 
+	    case ROW:
 		SUPERLU_FREE(C);
 		break;
-	    case COL: 
+	    case COL:
 		SUPERLU_FREE(R);
 		break;
 	}
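The long header comment patched above documents the pzgssvx calling sequence and its four options->Fact modes. For context, a minimal sketch of the standard "factor from scratch" (DOFACT) path, using the driver and helper names from the SuperLU_DIST 6.x headers (superlu_zdefs.h); the exact signatures of the Init/Free helpers are assumptions that may vary slightly between releases, and cleanup of the LU and ScalePerm structures is omitted:

    #include "superlu_zdefs.h"

    /* Solve A*X = B once with default options (options.Fact == DOFACT).
       A is the distributed input matrix; b holds the local rows of B and
       is overwritten with the local rows of the solution X. */
    void solve_once(SuperMatrix *A, doublecomplex *b, int ldb, int nrhs,
                    gridinfo_t *grid, int m, int n)
    {
        superlu_dist_options_t options;
        ScalePermstruct_t ScalePermstruct;
        LUstruct_t LUstruct;
        SOLVEstruct_t SOLVEstruct;
        SuperLUStat_t stat;
        double *berr;
        int info;

        set_default_options_dist(&options);   /* Fact = DOFACT, Equil = YES, ... */
        ScalePermstructInit(m, n, &ScalePermstruct);
        LUstructInit(n, &LUstruct);
        PStatInit(&stat);
        if ( !(berr = (double *) doubleMalloc_dist(nrhs)) )
            ABORT("Malloc fails for berr[].");

        /* On exit, b contains X and berr[j] is the componentwise relative
           backward error of solution vector X(j). */
        pzgssvx(&options, A, &ScalePermstruct, b, ldb, nrhs, grid,
                &LUstruct, &SOLVEstruct, berr, &stat, &info);

        PStatFree(&stat);
        SUPERLU_FREE(berr);
    }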
diff -pruN 6.1.0+dfsg1-1/SRC/pzgstrf2.c 6.1.1+dfsg1-1/SRC/pzgstrf2.c
--- 6.1.0+dfsg1-1/SRC/pzgstrf2.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/pzgstrf2.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,15 +1,15 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
-/*! @file 
+/*! @file
  * \brief Performs panel LU factorization.
  *
  * <pre>
@@ -128,7 +128,7 @@ pzgstrf2_trsm
     u_diag_cnt = 0;
     incy = ld_ujrow;
 
-    if ( U_diag_blk_send_req && 
+    if ( U_diag_blk_send_req &&
 	 U_diag_blk_send_req[myrow] != MPI_REQUEST_NULL ) {
         /* There are pending sends - wait for all Isend to complete */
 #if ( PROFlevel>=1 )
@@ -154,7 +154,7 @@ pzgstrf2_trsm
             /* Diagonal pivot */
             i = luptr;
            if ( options->ReplaceTinyPivot == YES ) {
-                if ( slud_z_abs1(&lusup[i]) < thresh && 
+                if ( slud_z_abs1(&lusup[i]) < thresh &&
 		     lusup[i].r != 0.0 && lusup[i].i != 0.0 ) { /* Diagonal */
 
 #if ( PRNTlevel>=2 )
@@ -260,7 +260,7 @@ pzgstrf2_trsm
 	stat->ops[FACT] += 4.0 * ((flops_t) nsupc * (nsupc+1) * l);
     } else {  /* non-diagonal process */
         /* ================================================================== *
-         * Receive the diagonal block of U for panel factorization of L(:,k). * 
+         * Receive the diagonal block of U for panel factorization of L(:,k). *
          * Note: we block for panel factorization of L(:,k), but panel        *
 	 * factorization of U(:,k) do not block                               *
          * ================================================================== */
@@ -359,7 +359,7 @@ void pzgstrs2_omp
     nb = usub[0];
     iukp = BR_HEADER;
     rukp = 0;
-    
+
     int* blocks_index_pointers = SUPERLU_MALLOC (3 * nb * sizeof(int));
     int* blocks_value_pointers = blocks_index_pointers + nb;
     int* nsupc_temp = blocks_value_pointers + nb;
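The panel-factorization hunks above touch the ReplaceTinyPivot branch, whose documented behaviour is that pivots with magnitude below thresh = epsilon * ||A|| are replaced by sqrt(epsilon) * ||A|| during LU factorization. A simplified real-valued sketch of that rule (illustration only, not the library's complex-arithmetic code):

    #include <math.h>

    /* Return the (possibly replaced) pivot.  anorm is ||A|| and eps the
       machine epsilon, matching thresh = s_eps * anorm in the factorization. */
    static double replace_tiny_pivot(double pivot, double anorm, double eps)
    {
        double thresh = eps * anorm;
        if (fabs(pivot) < thresh && pivot != 0.0) {
            /* Keep the sign, bump the magnitude to sqrt(eps)*||A|| so the
               triangular solves stay well defined. */
            double magnitude = sqrt(eps) * anorm;
            return (pivot < 0.0) ? -magnitude : magnitude;
        }
        return pivot;   /* large enough: leave it alone */
    }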
diff -pruN 6.1.0+dfsg1-1/SRC/pzgstrf.c 6.1.1+dfsg1-1/SRC/pzgstrf.c
--- 6.1.0+dfsg1-1/SRC/pzgstrf.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/pzgstrf.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,9 +1,9 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
@@ -27,11 +27,12 @@ at the top-level directory.
  *   December 31, 2015 rename xMACH to xMACH_DIST.
  *   September 30, 2017 optimization for Intel Knights Landing (KNL) node .
  *   June 1, 2018      add parallel AWPM pivoting; add back arrive_at_ublock()
+ *   February 8, 2019  version 6.1.1
  *
- * Sketch of the algorithm 
+ * Sketch of the algorithm
+ *
+ * =======================
  *
- * ======================= 
- *    
  * The following relations hold:
  *     * A_kk = L_kk * U_kk
  *     * L_ik = Aik * U_kk^(-1)
@@ -115,25 +116,25 @@ at the top-level directory.
 /*#include "cublas_zgemm.h"*/
 // #define NUM_CUDA_STREAMS 16
 // #define NUM_CUDA_STREAMS 16
-#endif 
+#endif
 
 /* Various defininations     */
-/* 
-    Name    : SUPERNODE_PROFILE  
+/*
+    Name    : SUPERNODE_PROFILE
     Purpose : For SuperNode Level profiling of various measurements such as gigaflop/sec
     obtained,bandwidth achieved:
-    Overhead : Low 
+    Overhead : Low
 */
-// #define SUPERNODE_PROFILE   
+// #define SUPERNODE_PROFILE
 
-/* 
+/*
     Name    :   BAELINE
     Purpose : baseline to compare performance against
     Overhead : NA : this won't be used for running experiments
 */
 // #define BASELINE
 
-/* 
+/*
     Name    :   PHI_FRAMEWORK
     Purpose : To simulate and test algorithm used for offloading Phi
     Overhead : NA : this won't be used for running experiments
@@ -412,12 +413,12 @@ pzgstrf(superlu_dist_options_t * options
     if (m == 0 || n == 0) return 0;
 
     double tt1 = SuperLU_timer_ ();
- 
-    /* 
-     * Initialization.  
+
+    /*
+     * Initialization.
      */
     iam = grid->iam;
-    Pc = grid->npcol; 
+    Pc = grid->npcol;
     Pr = grid->nprow;
     myrow = MYROW (iam, grid);
     mycol = MYCOL (iam, grid);
@@ -426,7 +427,7 @@ pzgstrf(superlu_dist_options_t * options
     s_eps = smach_dist("Epsilon");
     thresh = s_eps * anorm;
 
-    MPI_Attr_get (MPI_COMM_WORLD, MPI_TAG_UB, &attr_val, &flag);
+    MPI_Comm_get_attr (MPI_COMM_WORLD, MPI_TAG_UB, &attr_val, &flag);
     if (!flag) {
         fprintf (stderr, "Could not get TAG_UB\n");
         return (-1);
@@ -504,9 +505,9 @@ pzgstrf(superlu_dist_options_t * options
         }
     }
 
-    log_memory( (Llu->bufmax[0] + Llu->bufmax[2]) * (num_look_aheads + 1) 
+    log_memory( (Llu->bufmax[0] + Llu->bufmax[2]) * (num_look_aheads + 1)
 		* iword +
-		(Llu->bufmax[1] + Llu->bufmax[3]) * (num_look_aheads + 1) 
+		(Llu->bufmax[1] + Llu->bufmax[3]) * (num_look_aheads + 1)
 		* dword, stat );
 
     /* creating pointers to the look-ahead buffers */
@@ -626,7 +627,7 @@ pzgstrf(superlu_dist_options_t * options
 
 #if ( DEBUGlevel >= 2 )
     PrintInt10("schedule:perm_c_supno", nsupers, perm_c_supno);
-    
+
     /* Turn off static schedule */
     printf("[%d] .. Turn off static schedule for debugging ..\n", iam);
     for (i = 0; i < nsupers; ++i) perm_c_supno[i] = iperm_c_supno[i] = i;
@@ -642,7 +643,7 @@ pzgstrf(superlu_dist_options_t * options
     for (lb = 0; lb < nsupers; lb++) look_ahead_l[lb] = -1; /* vectorized */
     log_memory(3 * nsupers * iword, stat);
 
-    /* Sherry: omp parallel? 
+    /* Sherry: omp parallel?
        not worth doing, due to concurrent write to look_ahead_l[jb] */
     for (lb = 0; lb < nrb; ++lb) { /* go through U-factor */
         ib = lb * Pr + myrow;
@@ -739,12 +740,12 @@ pzgstrf(superlu_dist_options_t * options
         fflush(stdout);
     }
 #endif
-   
+
     Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
     Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
     Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
     Unzval_br_ptr = Llu->Unzval_br_ptr;
-    ToRecv = Llu->ToRecv; 
+    ToRecv = Llu->ToRecv;
     ToSendD = Llu->ToSendD;
     ToSendR = Llu->ToSendR;
 
@@ -757,7 +758,7 @@ pzgstrf(superlu_dist_options_t * options
 
 #if 0
 #if defined _OPENMP  // Sherry: parallel reduction -- seems slower?
-#pragma omp parallel for reduction(max :local_max_row_size) private(lk,lsub) 
+#pragma omp parallel for reduction(max :local_max_row_size) private(lk,lsub)
 #endif
 #endif
     for (int i = mycol; i < nsupers; i += Pc) { /* grab my local columns */
@@ -778,7 +779,7 @@ pzgstrf(superlu_dist_options_t * options
     /* int_t buffer_size =
          SUPERLU_MAX (max_row_size * num_threads * ldt,
                       get_max_buffer_size ());           */
-            
+
 #ifdef GPU_ACC
     int cublas_nb = get_cublas_nb();
     int nstreams = get_num_cuda_streams ();
@@ -817,11 +818,11 @@ pzgstrf(superlu_dist_options_t * options
     /* bigU and bigV are either on CPU or on GPU, not both. */
     doublecomplex* bigU; /* for storing entire U(k,:) panel, prepare for GEMM.
                       bigU has the same size either on CPU or on CPU. */
-    doublecomplex* bigV; /* for storing GEMM output matrix, i.e. update matrix. 
+    doublecomplex* bigV; /* for storing GEMM output matrix, i.e. update matrix.
 	              bigV is large to hold the aggregate GEMM output.*/
     bigU = NULL;
     bigV = NULL;
-				  
+
 #if ( PRNTlevel>=1 )
     if(!iam) {
 	printf("\t.. GEMM buffer size: max_row_size X max_ncols = %d x " IFMT "\n",
@@ -842,7 +843,7 @@ pzgstrf(superlu_dist_options_t * options
 #endif
     if ( checkCuda(cudaHostAlloc((void**)&bigV, bigv_size * sizeof(doublecomplex) ,cudaHostAllocDefault)) )
         ABORT("Malloc fails for zgemm buffer V");
- 
+
     DisplayHeader();
 
 #if ( PRNTlevel>=1 )
@@ -853,19 +854,19 @@ pzgstrf(superlu_dist_options_t * options
     handle = (cublasHandle_t *) SUPERLU_MALLOC(sizeof(cublasHandle_t)*nstreams);
     for(int i = 0; i < nstreams; i++) handle[i] = create_handle();
 
-    // creating streams 
+    // creating streams
     cudaStream_t *streams;
     streams = (cudaStream_t *) SUPERLU_MALLOC(sizeof(cudaStream_t)*nstreams);
     for (int i = 0; i < nstreams; ++i)
         checkCuda( cudaStreamCreate(&streams[i]) );
-    
-    // allocating data in device 
+
+    // allocating data in device
     doublecomplex *dA, *dB, *dC;
     cudaError_t cudaStat;
 #if 0
     // cudaStat = cudaMalloc( (void**)&dA, m*k*sizeof(double));
     // HOw much should be the size of dA?
-    // for time being just making it 
+    // for time being just making it
     // cudaStat = cudaMalloc( (void**)&dA, ((max_row_size*sp_ienv_dist(3)))* sizeof(double));
 #endif
 
@@ -889,11 +890,11 @@ pzgstrf(superlu_dist_options_t * options
         return 1;
     }
 
-    stat->gpu_buffer += ( max_row_size * sp_ienv_dist(3) 
+    stat->gpu_buffer += ( max_row_size * sp_ienv_dist(3)
 			  + bigu_size + buffer_size ) * dword;
 
 #else  /* not CUDA */
-    
+
     // for GEMM padding 0
     j = bigu_size / ldt;
     bigu_size += (gemm_k_pad * (j + ldt + gemm_n_pad));
@@ -904,7 +905,7 @@ pzgstrf(superlu_dist_options_t * options
 //    bigV = _mm_malloc(bigv_size * sizeof(doublecomplex), 1<<12);
 //#else
     if ( !(bigU = doublecomplexMalloc_dist(bigu_size)) )
-        ABORT ("Malloc fails for zgemm U buffer"); 
+        ABORT ("Malloc fails for zgemm U buffer");
           //Maximum size of bigU= sqrt(buffsize) ?
     // int bigv_size = 8 * ldt * ldt * num_threads;
     if ( !(bigV = doublecomplexMalloc_dist(bigv_size)) )
@@ -915,7 +916,7 @@ pzgstrf(superlu_dist_options_t * options
 
     log_memory((bigv_size + bigu_size) * dword, stat);
 
-    // mlock(bigU,(bigu_size) * sizeof (double));   
+    // mlock(bigU,(bigu_size) * sizeof (double));
 
 #if ( PRNTlevel>=1 )
     if(!iam) {
@@ -951,7 +952,7 @@ pzgstrf(superlu_dist_options_t * options
 
     int_t mrb = (nsupers + Pr - 1) / Pr;
     int_t mcb = (nsupers + Pc - 1) / Pc;
-    
+
     RemainStRow     = intMalloc_dist(mrb);
 #if 0
     Remain_lptr     = (int *) _mm_malloc(sizeof(int)*mrb,1);
@@ -960,7 +961,7 @@ pzgstrf(superlu_dist_options_t * options
 #endif
     // mlock(Remain_lptr, sizeof(int)*mrb );
     Remain_ib       = intMalloc_dist(mrb);
-    
+
     Remain_info_t *Remain_info;
 #if 0
     Remain_info = (Remain_info_t *) _mm_malloc(mrb*sizeof(Remain_info_t),64);
@@ -1017,7 +1018,7 @@ pzgstrf(superlu_dist_options_t * options
         PZGSTRF2 (options, k0, k, thresh, Glu_persist, grid, Llu,
                   U_diag_blk_send_req, tag_ub, stat, info);
 
-        pdgstrf2_timer += SuperLU_timer_()-ttt1; 
+        pdgstrf2_timer += SuperLU_timer_()-ttt1;
 
         scp = &grid->rscp;      /* The scope of process row. */
 
@@ -1142,7 +1143,7 @@ pzgstrf(superlu_dist_options_t * options
                     PZGSTRF2 (options, kk0, kk, thresh, Glu_persist,
                               grid, Llu, U_diag_blk_send_req, tag_ub, stat, info);
 
-                     pdgstrf2_timer += SuperLU_timer_() - ttt1; 
+                     pdgstrf2_timer += SuperLU_timer_() - ttt1;
 
                     /* Multicasts numeric values of L(:,kk) to process rows. */
                     /* ttt1 = SuperLU_timer_(); */
@@ -1243,7 +1244,7 @@ pzgstrf(superlu_dist_options_t * options
         kk1 = k0;
         kk2 = SUPERLU_MIN (k0 + num_look_aheads, nsupers - 1);
         for (kk0 = kk1; kk0 < kk2; kk0++) {
-            kk = perm_c_supno[kk0]; /* order determined from static schedule */  
+            kk = perm_c_supno[kk0]; /* order determined from static schedule */
             if (factoredU[kk0] != 1 && look_ahead[kk] < k0) {
 		/* does not depend on current column k */
                 kcol = PCOL (kk, grid);
@@ -1309,7 +1310,7 @@ pzgstrf(superlu_dist_options_t * options
                             PZGSTRS2 (kk0, kk, Glu_persist, grid, Llu,
                                       stat);
                         }
-    
+
                         pdgstrs2_timer += SuperLU_timer_()-ttt2;
                         /* stat->time8 += SuperLU_timer_()-ttt2; */
 
@@ -1415,7 +1416,7 @@ pzgstrf(superlu_dist_options_t * options
                 } else {
                     msgcnt[0] = msgcntsU[look_id][0];
 #if (DEBUGlevel>=2)
-		    printf("\t[%d] k=%d, look_id=%d, recv_req[0] == MPI_REQUEST_NULL, msgcnt[0] = %d\n", 
+		    printf("\t[%d] k=%d, look_id=%d, recv_req[0] == MPI_REQUEST_NULL, msgcnt[0] = %d\n",
 			   iam, k, look_id, msgcnt[0]);
 #endif
                 }
@@ -1427,7 +1428,7 @@ pzgstrf(superlu_dist_options_t * options
                 } else {
                     msgcnt[1] = msgcntsU[look_id][1];
 #if (DEBUGlevel>=2)
-		    printf("\t[%d] k=%d, look_id=%d, recv_req[1] == MPI_REQUEST_NULL, msgcnt[1] = %d\n", 
+		    printf("\t[%d] k=%d, look_id=%d, recv_req[1] == MPI_REQUEST_NULL, msgcnt[1] = %d\n",
 			   iam, k, look_id, msgcnt[1]);
 #endif
                 }
@@ -1467,14 +1468,14 @@ pzgstrf(superlu_dist_options_t * options
             if (factoredU[k0] == -1) {
                 /* Parallel triangular solve across process row *krow* --
                    U(k,j) = L(k,k) \ A(k,j).  */
-                 double ttt2 = SuperLU_timer_(); 
+                 double ttt2 = SuperLU_timer_();
 #ifdef _OPENMP
 /* #pragma omp parallel */ /* Sherry -- parallel done inside pzgstrs2 */
 #endif
                 {
                     PZGSTRS2 (k0, k, Glu_persist, grid, Llu, stat);
                 }
-                pdgstrs2_timer += SuperLU_timer_() - ttt2; 
+                pdgstrs2_timer += SuperLU_timer_() - ttt2;
 
 	        /* Sherry -- need to set factoredU[k0] = 1; ?? */
 
@@ -1496,7 +1497,7 @@ pzgstrf(superlu_dist_options_t * options
                                       SLU_MPI_TAG (2, k0), /* (4*k0+2)%tag_ub */
                                       scp->comm);
                             MPI_Send (uval, msgcnt[3], SuperLU_MPI_DOUBLE_COMPLEX, pi,
-                                      SLU_MPI_TAG (3, k0), /* (4*k0+3)%tag_ub */ 
+                                      SLU_MPI_TAG (3, k0), /* (4*k0+3)%tag_ub */
                                       scp->comm);
 #if ( PROFlevel>=1 )
                             TOC (t2, t1);
@@ -1624,9 +1625,9 @@ pzgstrf(superlu_dist_options_t * options
             }
             iukp = iukp0;
 #ifdef ISORT
-            /* iperm_u is sorted based on elimination order; 
+            /* iperm_u is sorted based on elimination order;
                perm_u reorders the U blocks to match the elimination order. */
-            isort (nub, iperm_u, perm_u); 
+            isort (nub, iperm_u, perm_u);
 #else
             qsort (perm_u, (size_t) nub, 2 * sizeof (int_t),
                    &superlu_sort_perm);
@@ -1686,11 +1687,11 @@ pzgstrf(superlu_dist_options_t * options
                         /* Factor diagonal and subdiagonal blocks and
 			   test for exact singularity.  */
                         factored[kk] = 0; /* flag column kk as factored */
-                        double ttt1 = SuperLU_timer_(); 
+                        double ttt1 = SuperLU_timer_();
                         PZGSTRF2 (options, kk0, kk, thresh,
                                   Glu_persist, grid, Llu, U_diag_blk_send_req,
                                   tag_ub, stat, info);
-                        pdgstrf2_timer += SuperLU_timer_() - ttt1; 
+                        pdgstrf2_timer += SuperLU_timer_() - ttt1;
 
                         /* Process column *kcol+1* multicasts numeric
 			   values of L(:,k+1) to process rows. */
@@ -1739,18 +1740,18 @@ pzgstrf(superlu_dist_options_t * options
 
 #include "zSchCompUdt-cuda.c"
 
-#else 
+#else
 
 /*#include "SchCompUdt--Phi-2Ddynamic-alt.c"*/
 //#include "zSchCompUdt-2Ddynamic_v6.c"
 
 #include "zSchCompUdt-2Ddynamic.c"
 
-#endif 
+#endif
 	/*uncomment following to compare against SuperLU 3.3 baseline*/
         /* #include "SchCompUdt--baseline.c"  */
 	/************************************************************************/
-        
+
         NetSchurUpTimer += SuperLU_timer_() - tsch;
 
     }  /* MAIN LOOP for k0 = 0, ... */
@@ -1758,7 +1759,7 @@ pzgstrf(superlu_dist_options_t * options
     /* ##################################################################
        ** END MAIN LOOP: for k0 = ...
        ################################################################## */
-    
+
     pxgstrfTimer = SuperLU_timer_() - pxgstrfTimer;
 
 #if ( PRNTlevel>=2 )
@@ -1779,13 +1780,13 @@ pzgstrf(superlu_dist_options_t * options
         printf("Time in Schur update \t\t %8.2lf seconds\n", NetSchurUpTimer);
         printf(".. Time to Gather L buffer\t %8.2lf  (Separate L panel by Lookahead/Remain)\n", GatherLTimer);
         printf(".. Time to Gather U buffer\t %8.2lf \n", GatherUTimer);
-	       
+
         printf(".. Time in GEMM %8.2lf \n",
 	       LookAheadGEMMTimer + RemainGEMMTimer);
         printf("\t* Look-ahead\t %8.2lf \n", LookAheadGEMMTimer);
-        printf("\t* Remain\t %8.2lf\tFlops %8.2le\tGflops %8.2lf\n", 
+        printf("\t* Remain\t %8.2lf\tFlops %8.2le\tGflops %8.2lf\n",
 	       RemainGEMMTimer, allflops, allflops/RemainGEMMTimer*1e-9);
-        printf(".. Time to Scatter %8.2lf \n", 
+        printf(".. Time to Scatter %8.2lf \n",
 	       LookAheadScatterTimer + RemainScatterTimer);
         printf("\t* Look-ahead\t %8.2lf \n", LookAheadScatterTimer);
         printf("\t* Remain\t %8.2lf \n", RemainScatterTimer);
@@ -1795,7 +1796,7 @@ pzgstrf(superlu_dist_options_t * options
 	printf("GEMM maximum block: %d-%d-%d\n", gemm_max_m, gemm_max_k, gemm_max_n);
     }
 #endif
-    
+
 #if ( DEBUGlevel>=3 )
     for (i = 0; i < Pr * Pc; ++i) {
         if (iam == i) {
@@ -1832,7 +1833,7 @@ pzgstrf(superlu_dist_options_t * options
     log_memory( -((Llu->bufmax[0] + Llu->bufmax[2]) * (num_look_aheads + 1) * iword +
 		  (Llu->bufmax[1] + Llu->bufmax[3]) * (num_look_aheads + 1) * dword),
 		stat );
-    
+
     SUPERLU_FREE (Lsub_buf_2);
     SUPERLU_FREE (Lval_buf_2);
     SUPERLU_FREE (Usub_buf_2);
@@ -1914,7 +1915,7 @@ pzgstrf(superlu_dist_options_t * options
     SUPERLU_FREE(Remain_info);
     SUPERLU_FREE(lookAhead_L_buff);
     SUPERLU_FREE(Remain_L_buff);
-    log_memory( -(3 * mrb * iword + mrb * sizeof(Remain_info_t) + 
+    log_memory( -(3 * mrb * iword + mrb * sizeof(Remain_info_t) +
 		  ldt * ldt * (num_look_aheads + 1) * dword +
 		  Llu->bufmax[1] * dword), stat );
 
@@ -1966,7 +1967,7 @@ pzgstrf(superlu_dist_options_t * options
 	    for (i = 0; i < gemm_count; ++i)
 		fprintf(fgemm, "%8d%8d%8d\t %20.16e\t%8d\n", gemm_stats[i].m, gemm_stats[i].n,
 			gemm_stats[i].k, gemm_stats[i].microseconds, prof_sendR[i]);
-	    
+
 	    fclose(fgemm);
         }
 	SUPERLU_FREE(gemm_stats);
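Besides whitespace cleanup and the new version-history line, the functional change in pzgstrf.c above replaces the deprecated MPI_Attr_get with MPI_Comm_get_attr when querying the tag upper bound. A standalone sketch of the modern idiom, using only standard MPI (the function and variable names here are illustrative):

    #include <mpi.h>
    #include <stdio.h>

    /* Query the largest usable message tag on MPI_COMM_WORLD.
       MPI_Comm_get_attr hands back a pointer to the attribute value;
       flag is zero if the attribute is not set. */
    static int query_tag_ub(void)
    {
        void *attr_val;
        int flag;

        MPI_Comm_get_attr(MPI_COMM_WORLD, MPI_TAG_UB, &attr_val, &flag);
        if (!flag) {
            fprintf(stderr, "Could not get TAG_UB\n");
            return -1;
        }
        return *(int *) attr_val;
    }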
diff -pruN 6.1.0+dfsg1-1/SRC/pzgstrs1.c 6.1.1+dfsg1-1/SRC/pzgstrs1.c
--- 6.1.0+dfsg1-1/SRC/pzgstrs1.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/pzgstrs1.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,15 +1,15 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
-/*! @file 
+/*! @file
  * \brief Solves a system of distributed linear equations
  *
  * <pre>
@@ -34,7 +34,7 @@ at the top-level directory.
 #ifdef _CRAY
 fortran void CTRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, doublecomplex*,
 		   doublecomplex*, int*, doublecomplex*, int*);
-fortran void SGEMM(_fcd, _fcd, int*, int*, int*, doublecomplex*, doublecomplex*, 
+fortran void SGEMM(_fcd, _fcd, int*, int*, int*, doublecomplex*, doublecomplex*,
 		   int*, doublecomplex*, int*, doublecomplex*, doublecomplex*, int*);
 _fcd ftcs1;
 _fcd ftcs2;
@@ -58,7 +58,7 @@ _fcd ftcs3;
  * This routine is used only in the iterative refinement routine
  * pzgsrfs_ABXglobal, assuming that the right-hand side is already
  * distributed in the diagonal processes.
- * 
+ *
  * Arguments
  * =========
  *
@@ -84,13 +84,13 @@ _fcd ftcs3;
  *        Number of right-hand sides.
  *
  * stat   (output) SuperLUStat_t*
- *        Record the statistics about the triangular solves; 
+ *        Record the statistics about the triangular solves;
  *        See SuperLUStat_t structure defined in util.h.
  *
  * info   (output) int*
  * 	   = 0: successful exit
  *	   < 0: if info = -i, the i-th argument had an illegal value
- * </pre>      
+ * </pre>
  */
 
 void pzgstrs1(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid,
@@ -157,7 +157,7 @@ void pzgstrs1(int_t n, LUstruct_t *LUstr
 	pxerr_dist("PZGSTRS1", grid, -*info);
 	return;
     }
-	
+
     /*
      * Initialization.
      */
@@ -205,7 +205,7 @@ void pzgstrs1(int_t n, LUstruct_t *LUstr
 
     /* Allocate working storage. */
     knsupc = sp_ienv_dist(3);
-    if ( !(lsum = doublecomplexCalloc_dist(((size_t)ldalsum) * nrhs 
+    if ( !(lsum = doublecomplexCalloc_dist(((size_t)ldalsum) * nrhs
         + nlb * LSUM_H)) )
 	ABORT("Calloc fails for lsum[].");
     maxrecvsz = knsupc * nrhs + SUPERLU_MAX(XK_H, LSUM_H);
@@ -214,7 +214,7 @@ void pzgstrs1(int_t n, LUstruct_t *LUstr
     if ( !(rtemp = doublecomplexCalloc_dist(maxrecvsz)) )
 	ABORT("Malloc fails for rtemp[].");
 
-    
+
     /*---------------------------------------------------
      * Forward solve Ly = b.
      *---------------------------------------------------*/
@@ -228,8 +228,8 @@ void pzgstrs1(int_t n, LUstruct_t *LUstr
 	if ( myrow == krow ) {
 	    lk = LBi( k, grid );   /* Local block number. */
 	    il = LSUM_BLK( lk );
-	    lsum[il - LSUM_H].r = k; 
-	    lsum[il - LSUM_H].i = 0; 
+	    lsum[il - LSUM_H].r = k;
+	    lsum[il - LSUM_H].i = 0;
 	}
     }
 
@@ -251,7 +251,7 @@ void pzgstrs1(int_t n, LUstruct_t *LUstr
 	    }
 	}
 	/*PrintInt10("mod_bit", nlb, mod_bit);*/
-	
+
 	/* Every process receives the count, but it is only useful on the
 	   diagonal processes.  */
 	MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, scp->comm );
@@ -316,10 +316,10 @@ void pzgstrs1(int_t n, LUstruct_t *LUstr
 		CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
 		      lusup, &nsupr, &x[ii], &knsupc);
 #elif defined (USE_VENDOR_BLAS)
-		ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+		ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha,
 		       lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
 #else
-		ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+		ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha,
 		       lusup, &nsupr, &x[ii], &knsupc);
 #endif
 		/*stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;*/
@@ -327,7 +327,7 @@ void pzgstrs1(int_t n, LUstruct_t *LUstr
 #if ( DEBUGlevel>=2 )
 		printf("(%2d) Solve X[%2d]\n", iam, k);
 #endif
-		
+
 		/*
 		 * Send Xk to process column Pc[k].
 		 */
@@ -340,7 +340,7 @@ void pzgstrs1(int_t n, LUstruct_t *LUstr
                                    &send_req[Llu->SolveMsgSent++]);
 #else
 			MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
-				 SuperLU_MPI_DOUBLE_COMPLEX, 
+				 SuperLU_MPI_DOUBLE_COMPLEX,
                                  pi, Xk, grid->comm );
 #endif
 #if ( DEBUGlevel>=2 )
@@ -348,14 +348,14 @@ void pzgstrs1(int_t n, LUstruct_t *LUstr
 			       iam, x[ii-XK_H], pi);
 #endif
 		    }
-		
+
 		/*
 		 * Perform local block modifications: lsum[i] -= L_i,k * X[k]
 		 */
 		nb = lsub[0] - 1;
 		lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
 		luptr = knsupc; /* Skip diagonal block L(k,k). */
-		
+
 		zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
 			   fmod, nb, lptr, luptr, xsup, grid, Llu,
 			   send_req, stat);
@@ -389,7 +389,7 @@ void pzgstrs1(int_t n, LUstruct_t *LUstr
 #if ( DEBUGlevel>=2 )
 	printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
 #endif
-	
+
 	switch ( status.MPI_TAG ) {
 	  case Xk:
 	      --nfrecvx;
@@ -433,17 +433,17 @@ void pzgstrs1(int_t n, LUstruct_t *LUstr
 		  CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
 			lusup, &nsupr, &x[ii], &knsupc);
 #elif defined (USE_VENDOR_BLAS)
-		  ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+		  ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha,
 			 lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
 #else
-		  ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+		  ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha,
 			 lusup, &nsupr, &x[ii], &knsupc);
 #endif
 		  /*stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;*/
 #if ( DEBUGlevel>=2 )
 		  printf("(%2d) Solve X[%2d]\n", iam, k);
 #endif
-		
+
 		  /*
 		   * Send Xk to process column Pc[k].
 		   */
@@ -625,7 +625,7 @@ void pzgstrs1(int_t n, LUstruct_t *LUstr
     if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) )
         ABORT("Malloc fails for Ucb_valptr[]");
 
-    /* Count number of row blocks in a block column. 
+    /* Count number of row blocks in a block column.
        One pass of the skeleton graph of U. */
     for (lk = 0; lk < nlb; ++lk) {
 	usub = Ufstnz_br_ptr[lk];
@@ -682,7 +682,7 @@ void pzgstrs1(int_t n, LUstruct_t *LUstr
 		    for (i = 0; i < Urbs[lb]; ++i)
 			printf("(%2d) .. row blk %2d:\
                                lbnum %d, indpos %d, valpos %d\n",
-			       iam, i, 
+			       iam, i,
 			       Ucb_indptr[lb][i].lbnum,
 			       Ucb_indptr[lb][i].indpos,
 			       Ucb_valptr[lb][i]);
@@ -735,10 +735,10 @@ void pzgstrs1(int_t n, LUstruct_t *LUstr
 		CTRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
 		      lusup, &nsupr, &x[ii], &knsupc);
 #elif defined (USE_VENDOR_BLAS)
-		ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+		ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha,
 		       lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
 #else
-		ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+		ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha,
 		       lusup, &nsupr, &x[ii], &knsupc);
 #endif
 		/*stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs;*/
@@ -765,11 +765,11 @@ void pzgstrs1(int_t n, LUstruct_t *LUstr
 			       iam, x[ii-XK_H], pi);
 #endif
 		    }
-		
+
 		/*
 		 * Perform local block modifications: lsum[i] -= U_i,k * X[k]
 		 */
-		if ( Urbs[lk] ) 
+		if ( Urbs[lk] )
 		    zlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs,
 			       Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
 			       send_req, stat);
@@ -784,7 +784,7 @@ void pzgstrs1(int_t n, LUstruct_t *LUstr
     while ( nbrecvx || nbrecvmod ) { /* While not finished. */
 
 	/* Receive a message. */
-	MPI_Recv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX, 
+	MPI_Recv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX,
                  MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status );
 	k = (*recvbuf).r;
 
@@ -827,10 +827,10 @@ void pzgstrs1(int_t n, LUstruct_t *LUstr
 		    CTRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
 			  lusup, &nsupr, &x[ii], &knsupc);
 #elif defined (USE_VENDOR_BLAS)
-		    ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+		    ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha,
 			   lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
 #else
-		    ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+		    ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha,
 			   lusup, &nsupr, &x[ii], &knsupc);
 #endif
 		    /*stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs;*/
@@ -857,9 +857,9 @@ void pzgstrs1(int_t n, LUstruct_t *LUstr
 				   iam, x[ii - XK_H], pi);
 #endif
 			}
-		
+
 		    /*
-		     * Perform local block modifications: 
+		     * Perform local block modifications:
 		     *         lsum[i] -= U_i,k * X[k]
 		     */
 		    if ( Urbs[lk] )
@@ -867,14 +867,14 @@ void pzgstrs1(int_t n, LUstruct_t *LUstr
 				   Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
 				   send_req, stat);
 		} /* if becomes solvable */
-		
+
 		break;
 
 #if ( DEBUGlevel>=2 )
 	      default:
 		printf("(%2d) Recv'd wrong message tag %4d\n", iam, status.MPI_TAG);
 		break;
-#endif		
+#endif
 
 	} /* switch */
 
@@ -905,7 +905,7 @@ void pzgstrs1(int_t n, LUstruct_t *LUstr
     for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]);
     SUPERLU_FREE(send_req);
 #endif
-    
+
 #if ( DEBUGlevel>=1 )
     CHECK_MALLOC(iam, "Exit pzgstrs1()");
 #endif
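The triangular solves patched above rely on two BLAS ztrsm configurations: "L","L","N","U" for the forward solve with the unit lower-triangular diagonal block, and "L","U","N","N" for back substitution with the non-unit upper-triangular block. A self-contained sketch of that convention, with an assumed C prototype for the Fortran routine (the trailing 1,1,1,1 arguments in the USE_VENDOR_BLAS branch are hidden string-length arguments and are omitted here):

    typedef struct { double r, i; } zcplx;   /* stand-in for doublecomplex */

    extern void ztrsm_(const char *side, const char *uplo, const char *transa,
                       const char *diag, const int *m, const int *n,
                       const zcplx *alpha, const zcplx *a, const int *lda,
                       zcplx *b, const int *ldb);

    /* x := U^{-1} L^{-1} x for dense n-by-n factors L (unit lower triangular)
       and U (upper triangular), each stored column-major with leading dimension n. */
    static void forward_then_backward(zcplx *L, zcplx *U, zcplx *x,
                                      int n, int nrhs)
    {
        const zcplx one = {1.0, 0.0};
        ztrsm_("L", "L", "N", "U", &n, &nrhs, &one, L, &n, x, &n);  /* solve L*y = x */
        ztrsm_("L", "U", "N", "N", &n, &nrhs, &one, U, &n, x, &n);  /* solve U*x = y */
    }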
diff -pruN 6.1.0+dfsg1-1/SRC/pzgstrs_Bglobal.c 6.1.1+dfsg1-1/SRC/pzgstrs_Bglobal.c
--- 6.1.0+dfsg1-1/SRC/pzgstrs_Bglobal.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/pzgstrs_Bglobal.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,15 +1,15 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
-/*! @file 
+/*! @file
  * \brief Solves a system of distributed linear equations A*X = B with a general N-by-N matrix A using the LU factorization
  *
  * <pre>
@@ -34,7 +34,7 @@ at the top-level directory.
 #ifdef _CRAY
 fortran void CTRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, doublecomplex*,
 		   doublecomplex*, int*, doublecomplex*, int*);
-fortran void CGEMM(_fcd, _fcd, int*, int*, int*, doublecomplex*, doublecomplex*, 
+fortran void CGEMM(_fcd, _fcd, int*, int*, int*, doublecomplex*, doublecomplex*,
 		   int*, doublecomplex*, int*, doublecomplex*, doublecomplex*, int*);
 _fcd ftcs1;
 _fcd ftcs2;
@@ -53,7 +53,7 @@ static void gather_diag_to_all(int_t, in
  * pzgstrs_Bglobal solves a system of distributed linear equations
  * A*X = B with a general N-by-N matrix A using the LU factorization
  * computed by pzgstrf.
- * 
+ *
  * Arguments
  * =========
  *
@@ -80,7 +80,7 @@ static void gather_diag_to_all(int_t, in
  *        On exit, the solution matrix of the possibly equilibrated
  *        and row permuted system if info = 0;
  *
- *        NOTE: Currently, the N-by-NRHS  matrix B must reside on all 
+ *        NOTE: Currently, the N-by-NRHS  matrix B must reside on all
  *              processes when calling this routine.
  *
  * ldb    (input) int (global)
@@ -96,12 +96,12 @@ static void gather_diag_to_all(int_t, in
  * info   (output) int*
  * 	   = 0: successful exit
  *	   < 0: if info = -i, the i-th argument had an illegal value
- * </pre>    
+ * </pre>
  */
 
 void
-pzgstrs_Bglobal(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid, 
-                doublecomplex *B, int_t ldb, int nrhs, 
+pzgstrs_Bglobal(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid,
+                doublecomplex *B, int_t ldb, int nrhs,
                 SuperLUStat_t *stat, int *info)
 {
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
@@ -155,7 +155,7 @@ pzgstrs_Bglobal(int_t n, LUstruct_t *LUs
 #endif
 
     int_t *mod_bit = Llu->mod_bit; /* flag contribution from each row block */
- 
+
     t = SuperLU_timer_();
 
     /* Test input parameters. */
@@ -166,7 +166,7 @@ pzgstrs_Bglobal(int_t n, LUstruct_t *LUs
 	pxerr_dist("PZGSTRS_BGLOBAL", grid, -*info);
 	return;
     }
-	
+
     /*
      * Initialization.
      */
@@ -216,10 +216,10 @@ pzgstrs_Bglobal(int_t n, LUstruct_t *LUs
     /* Allocate working storage. */
     knsupc = sp_ienv_dist(3);
     maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H );
-    if ( !(lsum = doublecomplexCalloc_dist(((size_t)ldalsum) * nrhs 
+    if ( !(lsum = doublecomplexCalloc_dist(((size_t)ldalsum) * nrhs
         + nlb * LSUM_H)) )
 	ABORT("Calloc fails for lsum[].");
-    if ( !(x = doublecomplexMalloc_dist(((size_t)ldalsum) * nrhs 
+    if ( !(x = doublecomplexMalloc_dist(((size_t)ldalsum) * nrhs
         + nlb * XK_H)) )
 	ABORT("Malloc fails for x[].");
     if ( !(recvbuf = doublecomplexMalloc_dist(maxrecvsz)) )
@@ -227,7 +227,7 @@ pzgstrs_Bglobal(int_t n, LUstruct_t *LUs
     if ( !(rtemp = doublecomplexCalloc_dist(maxrecvsz)) )
 	ABORT("Malloc fails for rtemp[].");
 
-    
+
     /*---------------------------------------------------
      * Forward solve Ly = b.
      *---------------------------------------------------*/
@@ -274,7 +274,7 @@ pzgstrs_Bglobal(int_t n, LUstruct_t *LUs
 		    mod_bit[lk] = 1;  /* contribution from off-diagonal */
 	    }
 	}
-	
+
 	/* Every process receives the count, but it is only useful on the
 	   diagonal processes.  */
 	MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, scp->comm );
@@ -339,10 +339,10 @@ pzgstrs_Bglobal(int_t n, LUstruct_t *LUs
 		CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
 		      lusup, &nsupr, &x[ii], &knsupc);
 #elif defined (USE_VENDOR_BLAS)
-		ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+		ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha,
 		       lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
 #else
-		ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+		ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha,
 		       lusup, &nsupr, &x[ii], &knsupc);
 #endif
 		stat->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs
@@ -351,7 +351,7 @@ pzgstrs_Bglobal(int_t n, LUstruct_t *LUs
 #if ( DEBUGlevel>=2 )
 		printf("(%2d) Solve X[%2d]\n", iam, k);
 #endif
-		
+
 		/*
 		 * Send Xk to process column Pc[k].
 		 */
@@ -369,7 +369,7 @@ pzgstrs_Bglobal(int_t n, LUstruct_t *LUs
 #else
 
 			MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
-				  SuperLU_MPI_DOUBLE_COMPLEX, 
+				  SuperLU_MPI_DOUBLE_COMPLEX,
                                   pi, Xk, grid->comm );
 #endif
 #endif
@@ -385,9 +385,9 @@ pzgstrs_Bglobal(int_t n, LUstruct_t *LUs
 		nb = lsub[0] - 1;
 		lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
 		luptr = knsupc; /* Skip diagonal block L(k,k). */
-		
+
 		zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
-			   fmod, nb, lptr, luptr, xsup, grid, Llu, 
+			   fmod, nb, lptr, luptr, xsup, grid, Llu,
 			   send_req,stat);
 	    }
 	} /* if diagonal process ... */
@@ -421,7 +421,7 @@ pzgstrs_Bglobal(int_t n, LUstruct_t *LUs
 #if ( DEBUGlevel>=2 )
 	printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
 #endif
-	
+
 	switch ( status.MPI_TAG ) {
 	  case Xk:
 	      --nfrecvx;
@@ -438,7 +438,7 @@ pzgstrs_Bglobal(int_t n, LUstruct_t *LUs
 		   * Perform local block modifications: lsum[i] -= L_i,k * X[k]
 		   */
 		  zlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k,
-			     fmod, nb, lptr, luptr, xsup, grid, Llu, 
+			     fmod, nb, lptr, luptr, xsup, grid, Llu,
 			     send_req, stat);
 	      } /* if lsub */
 
@@ -466,10 +466,10 @@ pzgstrs_Bglobal(int_t n, LUstruct_t *LUs
 		  CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
 			lusup, &nsupr, &x[ii], &knsupc);
 #elif defined (USE_VENDOR_BLAS)
-		  ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+		  ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha,
 			 lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
 #else
-		  ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+		  ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha,
 			 lusup, &nsupr, &x[ii], &knsupc);
 #endif
 		  stat->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs
@@ -478,7 +478,7 @@ pzgstrs_Bglobal(int_t n, LUstruct_t *LUs
 #if ( DEBUGlevel>=2 )
 		  printf("(%2d) Solve X[%2d]\n", iam, k);
 #endif
-		
+
 		  /*
 		   * Send Xk to process column Pc[k].
 		   */
@@ -488,7 +488,7 @@ pzgstrs_Bglobal(int_t n, LUstruct_t *LUs
 			  pi = PNUM( p, kcol, grid );
 #ifdef ISEND_IRECV
 			  MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
-				     SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm, 
+				     SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm,
 				     &send_req[Llu->SolveMsgSent++]);
 #else
 #ifdef BSEND
@@ -519,7 +519,7 @@ pzgstrs_Bglobal(int_t n, LUstruct_t *LUs
 
 	      break;
 
-#if ( DEBUGlevel>=2 )	      
+#if ( DEBUGlevel>=2 )
 	    default:
 	      printf("(%2d) Recv'd wrong message tag %4d\n", iam, status.MPI_TAG);
 	      break;
@@ -665,7 +665,7 @@ pzgstrs_Bglobal(int_t n, LUstruct_t *LUs
     if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) )
         ABORT("Malloc fails for Ucb_valptr[]");
 
-    /* Count number of row blocks in a block column. 
+    /* Count number of row blocks in a block column.
        One pass of the skeleton graph of U. */
     for (lk = 0; lk < nlb; ++lk) {
 	usub = Ufstnz_br_ptr[lk];
@@ -723,7 +723,7 @@ pzgstrs_Bglobal(int_t n, LUstruct_t *LUs
 		    for (i = 0; i < Urbs[lb]; ++i)
 			printf("(%2d) .. row blk %2d:\
                                lbnum %d, indpos %d, valpos %d\n",
-			       iam, i, 
+			       iam, i,
 			       Ucb_indptr[lb][i].lbnum,
 			       Ucb_indptr[lb][i].indpos,
 			       Ucb_valptr[lb][i]);
@@ -776,10 +776,10 @@ pzgstrs_Bglobal(int_t n, LUstruct_t *LUs
 		CTRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
 		      lusup, &nsupr, &x[ii], &knsupc);
 #elif defined (USE_VENDOR_BLAS)
-		ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+		ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha,
 		       lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
 #else
-		ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+		ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha,
 		       lusup, &nsupr, &x[ii], &knsupc);
 #endif
 		stat->ops[SOLVE] += 4 * knsupc * (knsupc + 1) * nrhs
@@ -816,7 +816,7 @@ pzgstrs_Bglobal(int_t n, LUstruct_t *LUs
 		/*
 		 * Perform local block modifications: lsum[i] -= U_i,k * X[k]
 		 */
-		if ( Urbs[lk] ) 
+		if ( Urbs[lk] )
 		    zlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs,
 			       Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
 			       send_req, stat);
@@ -833,7 +833,7 @@ pzgstrs_Bglobal(int_t n, LUstruct_t *LUs
 	/* Receive a message. */
 	MPI_Recv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX, MPI_ANY_SOURCE,
 		 MPI_ANY_TAG, grid->comm, &status );
-	
+
 	k = (*recvbuf).r;
 
 #if ( DEBUGlevel>=2 )
@@ -849,7 +849,7 @@ pzgstrs_Bglobal(int_t n, LUstruct_t *LUs
 		 *         lsum[i] -= U_i,k * X[k]
 		 */
 		zlsum_bmod(lsum, x, &recvbuf[XK_H], nrhs, k, bmod, Urbs,
-			   Ucb_indptr, Ucb_valptr, xsup, grid, Llu, 
+			   Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
 			   send_req, stat);
 
 	        break;
@@ -876,10 +876,10 @@ pzgstrs_Bglobal(int_t n, LUstruct_t *LUs
 		    CTRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
 			  lusup, &nsupr, &x[ii], &knsupc);
 #elif defined (USE_VENDOR_BLAS)
-		    ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+		    ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha,
 			   lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
 #else
-		    ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+		    ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha,
 			   lusup, &nsupr, &x[ii], &knsupc);
 #endif
 		    stat->ops[SOLVE] += 4 * knsupc * (knsupc + 1) * nrhs
@@ -914,7 +914,7 @@ pzgstrs_Bglobal(int_t n, LUstruct_t *LUs
 			}
 		    }
 		    /*
-		     * Perform local block modifications: 
+		     * Perform local block modifications:
 		     *         lsum[i] -= U_i,k * X[k]
 		     */
 		    if ( Urbs[lk] )
@@ -922,14 +922,14 @@ pzgstrs_Bglobal(int_t n, LUstruct_t *LUs
 				   Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
 				   send_req, stat);
 		} /* if becomes solvable */
-		
+
 		break;
 
 #if ( DEBUGlevel>=2 )
 	      default:
 		printf("(%2d) Recv'd wrong message tag %4d\n", iam, status.MPI_TAG);
 		break;
-#endif		
+#endif
 
 	} /* switch */
 
@@ -1007,7 +1007,7 @@ gather_diag_to_all(int_t n, int_t nrhs,
     int_t *ilsum, *xsup;
     int iam, knsupc, pkk;
     doublecomplex *x_col, *y_col;
-    
+
     iam = grid->iam;
     nsupers = Glu_persist->supno[n-1] + 1;
     xsup = Glu_persist->xsup;
diff -pruN 6.1.0+dfsg1-1/SRC/pzgstrs.c 6.1.1+dfsg1-1/SRC/pzgstrs.c
--- 6.1.0+dfsg1-1/SRC/pzgstrs.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/pzgstrs.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,23 +1,24 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
-/*! @file 
+/*! @file
  * \brief Solves a system of distributed linear equations A*X = B with a
  * general N-by-N matrix A using the LU factors computed previously.
  *
  * <pre>
- * -- Distributed SuperLU routine (version 6.0) --
+ * -- Distributed SuperLU routine (version 6.1) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * October 15, 2008
  * September 18, 2018  version 6.0
+ * February 8, 2019  version 6.1.1
  * </pre>
  */
 #include <math.h>
@@ -35,13 +36,13 @@ at the top-level directory.
  *   while ( not finished ) { .. use message counter to control
  *
  *      reveive a message;
- * 	
+ *
  * 	if ( message is Xk ) {
  * 	    perform local block modifications into lsum[];
  *                 lsum[i] -= L_i,k * X[k]
  *          if all local updates done, Isend lsum[] to diagonal process;
  *
- *      } else if ( message is LSUM ) { .. this must be a diagonal process 
+ *      } else if ( message is LSUM ) { .. this must be a diagonal process
  *          accumulate LSUM;
  *          if ( all LSUM are received ) {
  *              perform triangular solve for Xi;
@@ -51,7 +52,7 @@ at the top-level directory.
  *      }
  *   }
  *
- * 
+ *
  * Auxiliary data structures: lsum[] / ilsum (pointer to lsum array)
  * =======================
  *
@@ -66,7 +67,7 @@ at the top-level directory.
  *         | | |  <- header of size 2     ---
  *         --------- <--------------------| |
  *         | | | | |			  ---
- * 	   | | | | |	      |-----------| |		
+ * 	   | | | | |	      |-----------| |
  *         | | | | | 	      |           ---
  *	   ---------          |   |-------| |
  *         | | |  <- header   |   |       ---
@@ -82,7 +83,7 @@ at the top-level directory.
  *         | | | | |                 |
  *	   --------- <---------------|
  */
-  
+
 /*#define ISEND_IRECV*/
 
 /*
@@ -102,7 +103,7 @@ _fcd ftcs3;
  * Purpose
  * =======
  *   Re-distribute B on the diagonal processes of the 2D process mesh.
- * 
+ *
  * Note
  * ====
  *   This routine can only be called after the routine pxgstrs_init(),
@@ -110,7 +111,7 @@ _fcd ftcs3;
  *
  * Arguments
  * =========
- * 
+ *
  * B      (input) doublecomplex*
  *        The distributed right-hand side matrix of the possibly
  *        equilibrated system.
@@ -197,8 +198,8 @@ pzReDistribute_B_to_X(doublecomplex *B,
        NOW COMMUNICATE THE ACTUAL DATA.
        ------------------------------------------------------------*/
 
-	if(procs==1){ // faster memory copy when procs=1 
-	
+	if(procs==1){ // faster memory copy when procs=1
+
 #ifdef _OPENMP
 #pragma omp parallel default (shared)
 #endif
@@ -206,21 +207,21 @@ pzReDistribute_B_to_X(doublecomplex *B,
 #ifdef _OPENMP
 #pragma omp master
 #endif
-	{	
+	{
 		// t = SuperLU_timer_();
 #ifdef _OPENMP
-#pragma	omp	taskloop private (i,l,irow,k,j,knsupc) untied 
+#pragma	omp	taskloop private (i,l,irow,k,j,knsupc) untied
 #endif
 		for (i = 0; i < m_loc; ++i) {
 			irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*B */
-	   
+
 			k = BlockNum( irow );
 			knsupc = SuperSize( k );
 			l = X_BLK( k );
-			
+
 			x[l - XK_H].r = k; /* Block number prepended in the header. */
 			x[l - XK_H].i = 0;
-			
+
 			irow = irow - FstBlockC(k); /* Relative row number in X-block */
 			RHS_ITERATE(j) {
 			x[l + irow + j*knsupc] = B[i + j*ldb];
@@ -238,19 +239,19 @@ pzReDistribute_B_to_X(doublecomplex *B,
 			ABORT("Malloc fails for send_dbuf[].");
 		recv_dbuf = send_dbuf + k * nrhs;
 		if ( !(req_send = (MPI_Request*) SUPERLU_MALLOC(procs*sizeof(MPI_Request))) )
-			ABORT("Malloc fails for req_send[].");	
+			ABORT("Malloc fails for req_send[].");
 		if ( !(req_recv = (MPI_Request*) SUPERLU_MALLOC(procs*sizeof(MPI_Request))) )
 			ABORT("Malloc fails for req_recv[].");
 		if ( !(status_send = (MPI_Status*) SUPERLU_MALLOC(procs*sizeof(MPI_Status))) )
 			ABORT("Malloc fails for status_send[].");
 		if ( !(status_recv = (MPI_Status*) SUPERLU_MALLOC(procs*sizeof(MPI_Status))) )
 			ABORT("Malloc fails for status_recv[].");
-		
+
 		for (p = 0; p < procs; ++p) {
 			ptr_to_ibuf[p] = sdispls[p];
 			ptr_to_dbuf[p] = sdispls[p] * nrhs;
 		}
-		
+
 		/* Copy the row indices and values to the send buffer. */
 		// t = SuperLU_timer_();
 		for (i = 0, l = fst_row; i < m_loc; ++i, ++l) {
@@ -260,18 +261,18 @@ pzReDistribute_B_to_X(doublecomplex *B,
 		k = ptr_to_ibuf[p];
 		send_ibuf[k] = irow;
 		++ptr_to_ibuf[p];
-		
+
 		k = ptr_to_dbuf[p];
 		RHS_ITERATE(j) { /* RHS is stored in row major in the buffer. */
 			send_dbuf[k++] = B[i + j*ldb];
 		}
 		ptr_to_dbuf[p] += nrhs;
 		}
-		
+
 		// t = SuperLU_timer_() - t;
-		// printf(".. copy to send buffer time\t%8.4f\n", t);	
+		// printf(".. copy to send buffer time\t%8.4f\n", t);
 
-#if 0	
+#if 0
 	#if 1
 		/* Communicate the (permuted) row indices. */
 		MPI_Alltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t,
@@ -280,17 +281,17 @@ pzReDistribute_B_to_X(doublecomplex *B,
 		MPI_Alltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX,
 			  recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX,
 			  grid->comm);
-	#else	
+	#else
  		/* Communicate the (permuted) row indices. */
 		MPI_Ialltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t,
 				recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm, &req_i);
  		/* Communicate the numerical values. */
 		MPI_Ialltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX,
 				recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX,
-				grid->comm, &req_d);	
+				grid->comm, &req_d);
 		MPI_Wait(&req_i,&status);
 		MPI_Wait(&req_d,&status);
- 	#endif	 
+ 	#endif
 #endif
 	MPI_Barrier( grid->comm );
 
@@ -304,7 +305,7 @@ pzReDistribute_B_to_X(doublecomplex *B,
 		ppr = grid->iam-1+pp;
 		if(ppr>=procs)ppr-=procs;
 		if(ppr<0)ppr+=procs;
-		
+
 		if(SendCnt[pps]>0){
 			MPI_Isend(&send_ibuf[sdispls[pps]], SendCnt[pps], mpi_int_t, pps, 0, grid->comm,
 			&req_send[Nreq_send] );
@@ -314,7 +315,7 @@ pzReDistribute_B_to_X(doublecomplex *B,
 			MPI_Irecv(&recv_ibuf[rdispls[ppr]], RecvCnt[ppr], mpi_int_t, ppr, 0, grid->comm,
 			&req_recv[Nreq_recv] );
 			Nreq_recv++;
-		}		
+		}
 	}
 
 
@@ -323,7 +324,7 @@ pzReDistribute_B_to_X(doublecomplex *B,
 
 
 	Nreq_send=0;
-	Nreq_recv=0;	
+	Nreq_recv=0;
 	for (pp=0;pp<procs;pp++){
 		pps = grid->iam+1+pp;
 		if(pps>=procs)pps-=procs;
@@ -340,17 +341,17 @@ pzReDistribute_B_to_X(doublecomplex *B,
 			MPI_Irecv(&recv_dbuf[rdispls_nrhs[ppr]], RecvCnt_nrhs[ppr], SuperLU_MPI_DOUBLE_COMPLEX, ppr, 1, grid->comm,
 			&req_recv[Nreq_recv] );
 			Nreq_recv++;
-		}		
+		}
 	}
 
 	if(Nreq_send>0)MPI_Waitall(Nreq_send,req_send,status_send);
 	if(Nreq_recv>0)MPI_Waitall(Nreq_recv,req_recv,status_recv);
 
-	
+
 		/* ------------------------------------------------------------
 		   Copy buffer into X on the diagonal processes.
 		   ------------------------------------------------------------*/
-		
+
 		// t = SuperLU_timer_();
 		ii = 0;
 		for (p = 0; p < procs; ++p) {
@@ -365,7 +366,7 @@ pzReDistribute_B_to_X(doublecomplex *B,
 			l = X_BLK( lk );
 			x[l - XK_H].r = k; /* Block number prepended in the header. */
 			x[l - XK_H].i = 0;
-			
+
 			irow = irow - FstBlockC(k); /* Relative row number in X-block */
 			RHS_ITERATE(j) {
 				x[l + irow + j*knsupc] = recv_dbuf[jj++];
@@ -375,17 +376,17 @@ pzReDistribute_B_to_X(doublecomplex *B,
 		}
 
 		// t = SuperLU_timer_() - t;
-		// printf(".. copy to x time\t%8.4f\n", t);	
-		
+		// printf(".. copy to x time\t%8.4f\n", t);
+
 		SUPERLU_FREE(send_ibuf);
 		SUPERLU_FREE(send_dbuf);
 		SUPERLU_FREE(req_send);
 		SUPERLU_FREE(req_recv);
 		SUPERLU_FREE(status_send);
-		SUPERLU_FREE(status_recv);	
-	}  
+		SUPERLU_FREE(status_recv);
+	}
+
 
-    
 #if ( DEBUGlevel>=1 )
     CHECK_MALLOC(grid->iam, "Exit pzReDistribute_B_to_X()");
 #endif
@@ -428,7 +429,7 @@ pzReDistribute_X_to_B(int_t n, doublecom
 	MPI_Request req_i, req_d, *req_send, *req_recv;
 	MPI_Status status, *status_send, *status_recv;
 	int Nreq_recv, Nreq_send, pp,pps,ppr;
-	
+
 #if ( DEBUGlevel>=1 )
     CHECK_MALLOC(grid->iam, "Enter pzReDistribute_X_to_B()");
 #endif
@@ -441,7 +442,7 @@ pzReDistribute_X_to_B(int_t n, doublecom
     nsupers = Glu_persist->supno[n-1] + 1;
     iam = grid->iam;
     procs = grid->nprow * grid->npcol;
- 
+
     SendCnt      = gstrs_comm->X_to_B_SendCnt;
     SendCnt_nrhs = gstrs_comm->X_to_B_SendCnt +   procs;
     RecvCnt      = gstrs_comm->X_to_B_SendCnt + 2*procs;
@@ -453,9 +454,9 @@ pzReDistribute_X_to_B(int_t n, doublecom
     ptr_to_ibuf  = gstrs_comm->ptr_to_ibuf;
     ptr_to_dbuf  = gstrs_comm->ptr_to_dbuf;
 
-	
+
 	if(procs==1){ //faster memory copy when procs=1
-		
+
 #ifdef _OPENMP
 #pragma omp parallel default (shared)
 #endif
@@ -463,12 +464,12 @@ pzReDistribute_X_to_B(int_t n, doublecom
 #ifdef _OPENMP
 #pragma omp master
 #endif
-	{	
+	{
 		// t = SuperLU_timer_();
 #ifdef _OPENMP
-#pragma	omp	taskloop private (k,knsupc,lk,irow,l,i,j) untied 
-#endif		
-		for (k = 0; k < nsupers; k++) { 
+#pragma	omp	taskloop private (k,knsupc,lk,irow,l,i,j) untied
+#endif
+		for (k = 0; k < nsupers; k++) {
 		knsupc = SuperSize( k );
 		lk = LBi( k, grid ); /* Local block number */
 		irow = FstBlockC( k );
@@ -480,7 +481,7 @@ pzReDistribute_X_to_B(int_t n, doublecom
 			}
 		}
 	}
-	}	
+	}
 	}else{
 		k = sdispls[procs-1] + SendCnt[procs-1]; /* Total number of sends */
 		l = rdispls[procs-1] + RecvCnt[procs-1]; /* Total number of receives */
@@ -490,13 +491,13 @@ pzReDistribute_X_to_B(int_t n, doublecom
 		if ( !(send_dbuf = doublecomplexMalloc_dist((k + l)*nrhs)) )
 			ABORT("Malloc fails for send_dbuf[].");
 		if ( !(req_send = (MPI_Request*) SUPERLU_MALLOC(procs*sizeof(MPI_Request))) )
-			ABORT("Malloc fails for req_send[].");	
+			ABORT("Malloc fails for req_send[].");
 		if ( !(req_recv = (MPI_Request*) SUPERLU_MALLOC(procs*sizeof(MPI_Request))) )
 			ABORT("Malloc fails for req_recv[].");
 		if ( !(status_send = (MPI_Status*) SUPERLU_MALLOC(procs*sizeof(MPI_Status))) )
 			ABORT("Malloc fails for status_send[].");
 		if ( !(status_recv = (MPI_Status*) SUPERLU_MALLOC(procs*sizeof(MPI_Status))) )
-			ABORT("Malloc fails for status_recv[].");	    
+			ABORT("Malloc fails for status_recv[].");
 		recv_dbuf = send_dbuf + k * nrhs;
 		for (p = 0; p < procs; ++p) {
 			ptr_to_ibuf[p] = sdispls[p];
@@ -532,26 +533,26 @@ pzReDistribute_X_to_B(int_t n, doublecom
 			}
 		}
 		}
-		
+
 		/* ------------------------------------------------------------
 			COMMUNICATE THE (PERMUTED) ROW INDICES AND NUMERICAL VALUES.
 		   ------------------------------------------------------------*/
-#if 0	
+#if 0
 	#if 1
 		MPI_Alltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t,
 			  recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm);
-		MPI_Alltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs,SuperLU_MPI_DOUBLE_COMPLEX, 
+		MPI_Alltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs,SuperLU_MPI_DOUBLE_COMPLEX,
 			  recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX,
 			  grid->comm);
 	#else
 		MPI_Ialltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t,
 				recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm,&req_i);
-		MPI_Ialltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX, 
+		MPI_Ialltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX,
 				recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX,
 				grid->comm,&req_d);
  		MPI_Wait(&req_i,&status);
-		MPI_Wait(&req_d,&status);		 
-	#endif	
+		MPI_Wait(&req_d,&status);
+	#endif
 #endif
 
 	MPI_Barrier( grid->comm );
@@ -573,7 +574,7 @@ pzReDistribute_X_to_B(int_t n, doublecom
 			MPI_Irecv(&recv_ibuf[rdispls[ppr]], RecvCnt[ppr], mpi_int_t, ppr, 0, grid->comm,
 			&req_recv[Nreq_recv] );
 			Nreq_recv++;
-		}	
+		}
 	}
 
 
@@ -599,15 +600,15 @@ pzReDistribute_X_to_B(int_t n, doublecom
 			MPI_Irecv(&recv_dbuf[rdispls_nrhs[ppr]], RecvCnt_nrhs[ppr], SuperLU_MPI_DOUBLE_COMPLEX, ppr, 1, grid->comm,
 			&req_recv[Nreq_recv] );
 			Nreq_recv++;
-		}	
+		}
 	}
 
 
 	if(Nreq_send>0)MPI_Waitall(Nreq_send,req_send,status_send);
 	if(Nreq_recv>0)MPI_Waitall(Nreq_recv,req_recv,status_recv);
 	// MPI_Barrier( grid->comm );
-		
-	
+
+
 		/* ------------------------------------------------------------
 		   COPY THE BUFFER INTO B.
 		   ------------------------------------------------------------*/
@@ -624,7 +625,7 @@ pzReDistribute_X_to_B(int_t n, doublecom
 	SUPERLU_FREE(req_send);
 	SUPERLU_FREE(req_recv);
 	SUPERLU_FREE(status_send);
-	SUPERLU_FREE(status_recv);	
+	SUPERLU_FREE(status_recv);
 }
 #if ( DEBUGlevel>=1 )
     CHECK_MALLOC(grid->iam, "Exit pzReDistribute_X_to_B()");
@@ -676,18 +677,18 @@ pzCompute_Diag_Inv(int_t n, LUstruct_t *
 
     doublecomplex one = {1.0, 0.0};
     doublecomplex zero = {0.0, 0.0};
-	
+
 #if ( PROFlevel>=1 )
     t = SuperLU_timer_();
-#endif 
+#endif
 
-#if ( PRNTlevel>=1 )
+#if ( PRNTlevel>=2 )
     if ( grid->iam==0 ) {
 	printf("computing inverse of diagonal blocks...\n");
 	fflush(stdout);
     }
 #endif
-	
+
     /*
      * Initialization.
      */
@@ -704,7 +705,7 @@ pzCompute_Diag_Inv(int_t n, LUstruct_t *
     Uinv_bc_ptr = Llu->Uinv_bc_ptr;
     Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
     nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */
-    
+
     Llu->inv = 1;
 
     /*---------------------------------------------------
@@ -723,19 +724,19 @@ pzCompute_Diag_Inv(int_t n, LUstruct_t *
 		  lusup = Lnzval_bc_ptr[lk];
 		  Linv = Linv_bc_ptr[lk];
 		  Uinv = Uinv_bc_ptr[lk];
-		  nsupr = lsub[1];	
+		  nsupr = lsub[1];
 		  knsupc = SuperSize( k );
 
 		  for (j=0 ; j<knsupc; j++){
 		      for (i=0 ; i<knsupc; i++){
-		  	  Linv[j*knsupc+i] = zero;	
-			  Uinv[j*knsupc+i] = zero;	
+		  	  Linv[j*knsupc+i] = zero;
+			  Uinv[j*knsupc+i] = zero;
 		      }
 	          }
-				
+
 	   	  for (j=0 ; j<knsupc; j++){
 		      Linv[j*knsupc+j] = one;
-		      for (i=j+1 ; i<knsupc; i++){	
+		      for (i=j+1 ; i<knsupc; i++){
 		  	  z_copy(&Linv[j*knsupc+i],&lusup[j*nsupr+i]);
 		      }
 
@@ -759,7 +760,7 @@ pzCompute_Diag_Inv(int_t n, LUstruct_t *
 	printf(".. L-diag_inv time\t%10.5f\n", t);
 	fflush(stdout);
     }
-#endif	
+#endif
 
     return;
 #endif /* SLU_HAVE_LAPACK */
@@ -781,7 +782,7 @@ pzCompute_Diag_Inv(int_t n, LUstruct_t *
  * and the linear system solved is
  *     A1 * Y = Pc*Pr*B1, where B was overwritten by B1 = diag(R)*B, and
  * the permutation to B1 by Pc*Pr is applied internally in this routine.
- * 
+ *
  * Arguments
  * =========
  *
@@ -822,7 +823,7 @@ pzCompute_Diag_Inv(int_t n, LUstruct_t *
  *
  * nrhs   (input) int (global)
  *        Number of right-hand sides.
- * 
+ *
  * SOLVEstruct (input) SOLVEstruct_t* (global)
  *        Contains the information for the communication during the
  *        solution phase.
@@ -834,11 +835,11 @@ pzCompute_Diag_Inv(int_t n, LUstruct_t *
  * info   (output) int*
  * 	   = 0: successful exit
  *	   < 0: if info = -i, the i-th argument had an illegal value
- * </pre>       
+ * </pre>
  */
 
 void
-pzgstrs(int_t n, LUstruct_t *LUstruct, 
+pzgstrs(int_t n, LUstruct_t *LUstruct,
 	ScalePermstruct_t *ScalePermstruct,
 	gridinfo_t *grid, doublecomplex *B,
 	int_t m_loc, int_t fst_row, int_t ldb, int nrhs,
@@ -859,7 +860,7 @@ pzgstrs(int_t n, LUstruct_t *LUstruct,
     doublecomplex *rtemp, *rtemp_loc; /* Result of full matrix-vector multiply. */
     doublecomplex *Linv; /* Inverse of diagonal block */
     doublecomplex *Uinv; /* Inverse of diagonal block */
-    int *ipiv; 
+    int *ipiv;
     int_t *leaf_send;
     int_t nleaf_send, nleaf_send_tmp;
     int_t *root_send;
@@ -869,8 +870,8 @@ pzgstrs(int_t n, LUstruct_t *LUstruct,
     BcTree  *LBtree_ptr = Llu->LBtree_ptr;
     RdTree  *LRtree_ptr = Llu->LRtree_ptr;
     BcTree  *UBtree_ptr = Llu->UBtree_ptr;
-    RdTree  *URtree_ptr = Llu->URtree_ptr;	
-    int_t  *Urbs1, *Urbs2; /* Number of row blocks in each block column of U. */
+    RdTree  *URtree_ptr = Llu->URtree_ptr;
+    int_t  *Urbs1; /* Number of row blocks in each block column of U. */
     int_t  *Urbs = Llu->Urbs; /* Number of row blocks in each block column of U. */
     Ucb_indptr_t **Ucb_indptr = Llu->Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
     int_t  **Ucb_valptr = Llu->Ucb_valptr;      /* Vertical linked list pointing to Unzval[] */
@@ -890,7 +891,6 @@ pzgstrs(int_t n, LUstruct_t *LUstruct,
     doublecomplex **Uinv_bc_ptr;
     doublecomplex sum;
     MPI_Status status,status_on,statusx,statuslsum;
-    MPI_Request *send_req, recv_req, req;
     pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm;
     SuperLUStat_t **stat_loc;
 
@@ -902,9 +902,9 @@ pzgstrs(int_t n, LUstruct_t *LUstruct,
     int_t fmod_tmp;
     int_t  **fsendx_plist = Llu->fsendx_plist;
     int_t  nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */
-    int_t  nfrecvx_buf=0;						 	    			 
+    int_t  nfrecvx_buf=0;
     int_t  *frecv;        /* Count of lsum[lk] contributions to be received
-    			 from processes in this row. 
+    			 from processes in this row.
     			 It is only valid on the diagonal processes. */
     int_t  frecv_tmp;
     int_t  nfrecvmod = 0; /* Count of total modifications to be recv'd. */
@@ -918,12 +918,12 @@ pzgstrs(int_t n, LUstruct_t *LUstruct,
     int_t  bmod_tmp;
     int_t  **bsendx_plist = Llu->bsendx_plist;
     int_t  nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */
-    int_t  nbrecvx_buf=0;		
+    int_t  nbrecvx_buf=0;
     int_t  *brecv;        /* Count of modifications to be recv'd from
     			 processes in this row. */
     int_t  nbrecvmod = 0; /* Count of total modifications to be recv'd. */
     int_t flagx,flaglsum,flag;
-    int_t *LBTree_active, *LRTree_active, *LBTree_finish, *LRTree_finish, *leafsups, *rootsups; 
+    int_t *LBTree_active, *LRTree_active, *LBTree_finish, *LRTree_finish, *leafsups, *rootsups;
     int_t TAG;
     double t1_sol, t2_sol, t;
 #if ( DEBUGlevel>=2 )
@@ -931,7 +931,7 @@ pzgstrs(int_t n, LUstruct_t *LUstruct,
 #endif
 
     int_t gik,iklrow,fnz;
-    
+
     int_t *mod_bit = Llu->mod_bit; /* flag contribution from each row block */
     int INFO, pad;
     int_t tmpresult;
@@ -939,7 +939,7 @@ pzgstrs(int_t n, LUstruct_t *LUstruct,
     // #if ( PROFlevel>=1 )
     double t1, t2;
     float msg_vol = 0, msg_cnt = 0;
-    // #endif 
+    // #endif
 
     int_t msgcnt[4]; /* Count the size of the message xfer'd in each buffer:
 		      *     0 : transferred in Lsub_buf[]
@@ -948,14 +948,14 @@ pzgstrs(int_t n, LUstruct_t *LUstruct,
 		      *     3 : transferred in Uval_buf[]
 		      */
     int iword = sizeof (int_t);
-    int dword = sizeof (double);	
+    int dword = sizeof (double);
     int Nwork;
 	int_t procs = grid->nprow * grid->npcol;
     	yes_no_t done;
     yes_no_t startforward;
     	int nbrow;
     int_t  ik, rel, idx_r, jb, nrbl, irow, pc,iknsupc;
-    int_t  lptr1_tmp, idx_i, idx_v,m; 
+    int_t  lptr1_tmp, idx_i, idx_v,m;
     	int_t ready;
     	static int thread_id;
     yes_no_t empty;
@@ -963,10 +963,10 @@ pzgstrs(int_t n, LUstruct_t *LUstruct,
     aln_d = ceil(CACHELINE/(double)dword);
     aln_i = ceil(CACHELINE/(double)iword);
     int num_thread = 1;
-	
+
 	maxsuper = sp_ienv_dist(3);
-	
-#ifdef _OPENMP	
+
+#ifdef _OPENMP
 	#pragma omp threadprivate(thread_id)
 #endif
 
@@ -986,7 +986,7 @@ pzgstrs(int_t n, LUstruct_t *LUstruct,
 	fflush(stdout);
     }
 #endif
-	
+
     MPI_Barrier( grid->comm );
     t1_sol = SuperLU_timer_();
     t = SuperLU_timer_();
@@ -999,7 +999,7 @@ pzgstrs(int_t n, LUstruct_t *LUstruct,
 	pxerr_dist("PZGSTRS", grid, -*info);
 	return;
     }
-	
+
     /*
      * Initialization.
      */
@@ -1014,14 +1014,14 @@ pzgstrs(int_t n, LUstruct_t *LUstruct,
     Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
     Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
     Linv_bc_ptr = Llu->Linv_bc_ptr;
-    Uinv_bc_ptr = Llu->Uinv_bc_ptr;	
+    Uinv_bc_ptr = Llu->Uinv_bc_ptr;
     nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */
 
     stat->utime[SOL_COMM] = 0.0;
     stat->utime[SOL_GEMM] = 0.0;
     stat->utime[SOL_TRSM] = 0.0;
-    stat->utime[SOL_TOT] = 0.0;	
-	
+    stat->utime[SOL_TOT] = 0.0;
+
 #if ( DEBUGlevel>=1 )
     CHECK_MALLOC(iam, "Enter pzgstrs()");
 #endif
@@ -1061,39 +1061,39 @@ pzgstrs(int_t n, LUstruct_t *LUstruct,
     maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H );
     sizelsum = (((size_t)ldalsum)*nrhs + nlb*LSUM_H);
     sizelsum = ((sizelsum + (aln_d - 1)) / aln_d) * aln_d;
-	
+
 #ifdef _OPENMP
     if ( !(lsum = (doublecomplex*)SUPERLU_MALLOC(sizelsum*num_thread * sizeof(doublecomplex))))
-	ABORT("Malloc fails for lsum[].");	
+	ABORT("Malloc fails for lsum[].");
 #pragma omp parallel default(shared) private(ii)
     {
 	for (ii=0; ii<sizelsum; ii++)
     	lsum[thread_id*sizelsum+ii]=zero;
     }
-#else	
+#else
     if ( !(lsum = (doublecomplex*)SUPERLU_MALLOC(sizelsum*num_thread * sizeof(doublecomplex))))
   	    ABORT("Malloc fails for lsum[].");
     for ( ii=0; ii < sizelsum*num_thread; ii++ )
-	lsum[ii]=zero;		
-#endif	
-    if ( !(x = (doublecomplex*)SUPERLU_MALLOC((ldalsum * nrhs + nlb * XK_H) * sizeof(doublecomplex))) ) 	
+	lsum[ii]=zero;
+#endif
+    if ( !(x = (doublecomplex*)SUPERLU_MALLOC((ldalsum * nrhs + nlb * XK_H) * sizeof(doublecomplex))) )
 	ABORT("Calloc fails for x[].");
-    
-	
+
+
     sizertemp=ldalsum * nrhs;
     sizertemp = ((sizertemp + (aln_d - 1)) / aln_d) * aln_d;
     if ( !(rtemp = (doublecomplex*)SUPERLU_MALLOC((sizertemp*num_thread + 1) * sizeof(doublecomplex))) )
-	ABORT("Malloc fails for rtemp[].");		
+	ABORT("Malloc fails for rtemp[].");
 #ifdef _OPENMP
 #pragma omp parallel default(shared) private(ii)
     {
 	for ( ii=0; ii<sizertemp; ii++ )
-		rtemp[thread_id*sizertemp+ii]=zero;			
+		rtemp[thread_id*sizertemp+ii]=zero;
     }
-#else	
+#else
     for ( ii=0; ii<sizertemp*num_thread; ii++ )
-	rtemp[ii]=zero;			
-#endif	
+	rtemp[ii]=zero;
+#endif
 
     if ( !(stat_loc = (SuperLUStat_t**) SUPERLU_MALLOC(num_thread*sizeof(SuperLUStat_t*))) )
 	ABORT("Malloc fails for stat_loc[].");
@@ -1103,7 +1103,7 @@ pzgstrs(int_t n, LUstruct_t *LUstruct,
 	PStatInit(stat_loc[i]);
     }
 
-#if ( DEBUGlevel>=2 )   
+#if ( DEBUGlevel>=2 )
     /* Dump the L factor using matlab triple-let format. */
     zDumpLblocks(iam, nsupers, grid, Glu_persist, Llu);
 #endif
@@ -1112,7 +1112,7 @@ pzgstrs(int_t n, LUstruct_t *LUstruct,
      * Forward solve Ly = b.
      *---------------------------------------------------*/
     /* Redistribute B into X on the diagonal processes. */
-    pzReDistribute_B_to_X(B, m_loc, nrhs, ldb, fst_row, ilsum, x, 
+    pzReDistribute_B_to_X(B, m_loc, nrhs, ldb, fst_row, ilsum, x,
 			  ScalePermstruct, Glu_persist, grid, SOLVEstruct);
 
 #if ( PRNTlevel>=2 )
@@ -1120,12 +1120,12 @@ pzgstrs(int_t n, LUstruct_t *LUstruct,
     if ( !iam) printf(".. B to X redistribute time\t%8.4f\n", t);
     fflush(stdout);
     t = SuperLU_timer_();
-#endif	
+#endif
 
     /* Set up the headers in lsum[]. */
-#ifdef _OPENMP	
+#ifdef _OPENMP
 	#pragma omp simd lastprivate(krow,lk,il)
-#endif		
+#endif
     for (k = 0; k < nsupers; ++k) {
 	krow = PROW( k, grid );
 	if ( myrow == krow ) {
@@ -1138,16 +1138,16 @@ pzgstrs(int_t n, LUstruct_t *LUstruct,
 
 	/* ---------------------------------------------------------
 	   Initialize the async Bcast trees on all processes.
-	   --------------------------------------------------------- */		
+	   --------------------------------------------------------- */
 	nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */
 
 	nbtree = 0;
 	for (lk=0;lk<nsupers_j;++lk){
 		if(LBtree_ptr[lk]!=NULL){
-			// printf("LBtree_ptr lk %5d\n",lk); 
-			if(BcTree_IsRoot(LBtree_ptr[lk],'z')==NO){			
+			// printf("LBtree_ptr lk %5d\n",lk);
+			if(BcTree_IsRoot(LBtree_ptr[lk],'z')==NO){
 				nbtree++;
-				if(BcTree_getDestCount(LBtree_ptr[lk],'z')>0)nfrecvx_buf++;				  
+				if(BcTree_getDestCount(LBtree_ptr[lk],'z')>0)nfrecvx_buf++;
 			}
 			BcTree_allocateRequest(LBtree_ptr[lk],'z');
 		}
@@ -1160,24 +1160,24 @@ pzgstrs(int_t n, LUstruct_t *LUstruct,
 	nrtree = 0;
 	nleaf=0;
 	nfrecvmod=0;
-	
-	
-	
+
+
+
 if(procs==1){
 	for (lk=0;lk<nsupers_i;++lk){
 		gb = myrow+lk*grid->nprow;  /* not sure */
 		if(gb<nsupers){
 			if (fmod[lk*aln_i]==0){
-				leafsups[nleaf]=gb;				
+				leafsups[nleaf]=gb;
 				++nleaf;
 			}
 		}
 	}
-}else{	
+}else{
 	for (lk=0;lk<nsupers_i;++lk){
 		if(LRtree_ptr[lk]!=NULL){
 			nrtree++;
-			RdTree_allocateRequest(LRtree_ptr[lk],'z');			
+			RdTree_allocateRequest(LRtree_ptr[lk],'z');
 			frecv[lk] = RdTree_GetDestCount(LRtree_ptr[lk],'z');
 			nfrecvmod += frecv[lk];
 		}else{
@@ -1186,27 +1186,29 @@ if(procs==1){
 				kcol = PCOL( gb, grid );
 				if(mycol==kcol) { /* Diagonal process */
 					if (fmod[lk*aln_i]==0){
-						leafsups[nleaf]=gb;				
+						leafsups[nleaf]=gb;
 						++nleaf;
 					}
 				}
 			}
 		}
-	}	
-}	
-	
-	
-#ifdef _OPENMP	
+	}
+}
+
+
+#ifdef _OPENMP
 #pragma omp simd
 #endif
 	for (i = 0; i < nlb; ++i) fmod[i*aln_i] += frecv[i];
 
 	if ( !(recvbuf_BC_fwd = (doublecomplex*)SUPERLU_MALLOC(maxrecvsz*(nfrecvx+1) * sizeof(doublecomplex))) )  // this needs to be optimized for 1D row mapping
-		ABORT("Malloc fails for recvbuf_BC_fwd[].");	
-	nfrecvx_buf=0;			
+		ABORT("Malloc fails for recvbuf_BC_fwd[].");
+	nfrecvx_buf=0;
+
+	log_memory(nlb*aln_i*iword+nlb*iword+(CEILING( nsupers, Pr )+CEILING( nsupers, Pc ))*aln_i*2.0*iword+ nsupers_i*iword + sizelsum*num_thread * dword*2.0 + (ldalsum * nrhs + nlb * XK_H) *dword*2.0 + (sizertemp*num_thread + 1)*dword*2.0+maxrecvsz*(nfrecvx+1)*dword*2.0, stat);	//account for fmod, frecv, leaf_send, root_send, leafsups, recvbuf_BC_fwd	, lsum, x, rtemp
+
+
 
-	log_memory(nlb*aln_i*iword+nlb*iword+(CEILING( nsupers, Pr )+CEILING( nsupers, Pc ))*aln_i*2.0*iword+ nsupers_i*iword + sizelsum*num_thread * dword + (ldalsum * nrhs + nlb * XK_H) *dword + (sizertemp*num_thread + 1)*dword+maxrecvsz*(nfrecvx+1)*dword, stat);	//account for fmod, frecv, leaf_send, root_send, leafsups, recvbuf_BC_fwd	, lsum, x, rtemp
-	
 #if ( DEBUGlevel>=2 )
 	printf("(%2d) nfrecvx %4d,  nfrecvmod %4d,  nleaf %4d\n,  nbtree %4d\n,  nrtree %4d\n",
 			iam, nfrecvx, nfrecvmod, nleaf, nbtree, nrtree);
@@ -1217,13 +1219,13 @@ if(procs==1){
 	t = SuperLU_timer_() - t;
 	if ( !iam) printf(".. Setup L-solve time\t%8.4f\n", t);
 	fflush(stdout);
-	MPI_Barrier( grid->comm );	
+	MPI_Barrier( grid->comm );
 	t = SuperLU_timer_();
 #endif
 
 #if ( VAMPIR>=1 )
-	// VT_initialize(); 
-	VT_traceon();	
+	// VT_initialize();
+	VT_traceon();
 #endif
 
 #ifdef USE_VTUNE
@@ -1241,27 +1243,27 @@ if(procs==1){
 
 
 #ifdef _OPENMP
-#pragma omp parallel default (shared) 
+#pragma omp parallel default (shared)
 #endif
-	{	
+	{
 		{
-		
+
             if (Llu->inv == 1) { /* Diagonal is inverted. */
 
 #ifdef _OPENMP
-#pragma	omp	for firstprivate(nrhs,beta,alpha,x,rtemp,ldalsum) private (ii,k,knsupc,lk,luptr,lsub,nsupr,lusup,t1,t2,Linv,i,lib,rtemp_loc,nleaf_send_tmp) nowait	
+#pragma	omp	for firstprivate(nrhs,beta,alpha,x,rtemp,ldalsum) private (ii,k,knsupc,lk,luptr,lsub,nsupr,lusup,t1,t2,Linv,i,lib,rtemp_loc,nleaf_send_tmp) nowait
 #endif
 			for (jj=0;jj<nleaf;jj++){
 				k=leafsups[jj];
 
 				// #ifdef _OPENMP
-				// #pragma	omp	task firstprivate (k,nrhs,beta,alpha,x,rtemp,ldalsum) private (ii,knsupc,lk,luptr,lsub,nsupr,lusup,thread_id,t1,t2,Linv,i,lib,rtemp_loc)	 	
+				// #pragma	omp	task firstprivate (k,nrhs,beta,alpha,x,rtemp,ldalsum) private (ii,knsupc,lk,luptr,lsub,nsupr,lusup,thread_id,t1,t2,Linv,i,lib,rtemp_loc)
 				// #endif
 				{
 
 #if ( PROFlevel>=1 )
 					TIC(t1);
-#endif	 
+#endif
 					rtemp_loc = &rtemp[sizertemp* thread_id];
 
 
@@ -1288,15 +1290,15 @@ if(procs==1){
 					zgemm_( "N", "N", &knsupc, &nrhs, &knsupc,
 							&alpha, Linv, &knsupc, &x[ii],
 							&knsupc, &beta, rtemp_loc, &knsupc );
-#endif	
+#endif
 
 				#ifdef _OPENMP
 					#pragma omp simd
-				#endif		   
+				#endif
 					for (i=0 ; i<knsupc*nrhs ; i++){
 						z_copy(&x[ii+i],&rtemp_loc[i]);
-					}		
-					
+					}
+
 					// for (i=0 ; i<knsupc*nrhs ; i++){
 					// printf("x_l: %f %f\n",x[ii+i].r,x[ii+i].i);
 					// fflush(stdout);
@@ -1307,12 +1309,12 @@ if(procs==1){
 					TOC(t2, t1);
 					stat_loc[thread_id]->utime[SOL_TRSM] += t2;
 
-#endif	
+#endif
 
 					stat_loc[thread_id]->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs
 					+ 10 * knsupc * nrhs; /* complex division */
-			
-					
+
+
 					// --nleaf;
 #if ( DEBUGlevel>=2 )
 					printf("(%2d) Solve X[%2d]\n", iam, k);
@@ -1322,9 +1324,9 @@ if(procs==1){
 					 * Send Xk to process column Pc[k].
 					 */
 
-					if(LBtree_ptr[lk]!=NULL){ 
+					if(LBtree_ptr[lk]!=NULL){
 						lib = LBi( k, grid ); /* Local block number, row-wise. */
-						ii = X_BLK( lib );	
+						ii = X_BLK( lib );
 
 #ifdef _OPENMP
 #pragma omp atomic capture
@@ -1333,11 +1335,11 @@ if(procs==1){
 						leaf_send[(nleaf_send_tmp-1)*aln_i] = lk;
 						// BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],'z');
 					}
-				}		
+				}
 			}
 	} else { /* Diagonal is not inverted. */
 #ifdef _OPENMP
-#pragma	omp	for firstprivate (nrhs,beta,alpha,x,rtemp,ldalsum) private (ii,k,knsupc,lk,luptr,lsub,nsupr,lusup,t1,t2,Linv,i,lib,rtemp_loc,nleaf_send_tmp) nowait	
+#pragma	omp	for firstprivate (nrhs,beta,alpha,x,rtemp,ldalsum) private (ii,k,knsupc,lk,luptr,lsub,nsupr,lusup,t1,t2,Linv,i,lib,rtemp_loc,nleaf_send_tmp) nowait
 #endif
 	    for (jj=0;jj<nleaf;jj++) {
 		k=leafsups[jj];
@@ -1345,7 +1347,7 @@ if(procs==1){
 
 #if ( PROFlevel>=1 )
 		    TIC(t1);
-#endif	 
+#endif
 		    rtemp_loc = &rtemp[sizertemp* thread_id];
 
 		    knsupc = SuperSize( k );
@@ -1362,13 +1364,13 @@ if(procs==1){
    		    CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
 				lusup, &nsupr, &x[ii], &knsupc);
 #elif defined (USE_VENDOR_BLAS)
-		    ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
-				lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);	
+		    ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha,
+				lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
 #else
- 		    ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+ 		    ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha,
 					lusup, &nsupr, &x[ii], &knsupc);
 #endif
-				
+
 		// for (i=0 ; i<knsupc*nrhs ; i++){
 		// printf("x_l: %f %f\n",x[ii+i].r,x[ii+i].i);
 		// fflush(stdout);
@@ -1379,11 +1381,11 @@ if(procs==1){
 		    TOC(t2, t1);
 		    stat_loc[thread_id]->utime[SOL_TRSM] += t2;
 
-#endif	
+#endif
 
 		    stat_loc[thread_id]->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs
 				+ 10 * knsupc * nrhs; /* complex division */
-			
+
 		    // --nleaf;
 #if ( DEBUGlevel>=2 )
 		    printf("(%2d) Solve X[%2d]\n", iam, k);
@@ -1393,9 +1395,9 @@ if(procs==1){
 		     * Send Xk to process column Pc[k].
 		     */
 
-		    if (LBtree_ptr[lk]!=NULL) { 
+		    if (LBtree_ptr[lk]!=NULL) {
 			lib = LBi( k, grid ); /* Local block number, row-wise. */
-			ii = X_BLK( lib );	
+			ii = X_BLK( lib );
 
 #ifdef _OPENMP
 #pragma omp atomic capture
@@ -1403,10 +1405,10 @@ if(procs==1){
 			nleaf_send_tmp = ++nleaf_send;
 			leaf_send[(nleaf_send_tmp-1)*aln_i] = lk;
 		    }
-		    } /* end a block */		
+		    } /* end a block */
 		} /* end for jj ... */
 	    } /* end else ... diagonal is not invedted */
-	  }	
+	  }
 	}
 
 	jj=0;
@@ -1427,7 +1429,7 @@ if(procs==1){
 #endif
 
 					for (jj=0;jj<nleaf;jj++){
-						k=leafsups[jj];		
+						k=leafsups[jj];
 
 						{
 							/* Diagonal process */
@@ -1437,8 +1439,8 @@ if(procs==1){
 							 * Perform local block modifications: lsum[i] -= L_i,k * X[k]
 							 */
 							zlsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, k,
-									fmod, xsup, grid, Llu, 
-									stat_loc, leaf_send, &nleaf_send,sizelsum,sizertemp,0,maxsuper,thread_id,num_thread);	
+									fmod, xsup, grid, Llu,
+									stat_loc, leaf_send, &nleaf_send,sizelsum,sizertemp,0,maxsuper,thread_id,num_thread);
 						}
 
 						// } /* if diagonal process ... */
@@ -1452,7 +1454,7 @@ if(procs==1){
 				if(lk>=0){ // this is a bcast forwarding
 					gb = mycol+lk*grid->npcol;  /* not sure */
 					lib = LBi( gb, grid ); /* Local block number, row-wise. */
-					ii = X_BLK( lib );			
+					ii = X_BLK( lib );
 					BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],BcTree_GetMsgSize(LBtree_ptr[lk],'z')*nrhs+XK_H,'z');
 				}else{ // this is a reduce forwarding
 					lk = -lk - 1;
@@ -1473,25 +1475,25 @@ if(procs==1){
 			   ----------------------------------------------------------- */
 
 #ifdef _OPENMP
-#pragma omp parallel default (shared) 
+#pragma omp parallel default (shared)
 #endif
-			{	
+			{
 #ifdef _OPENMP
-#pragma omp master 
+#pragma omp master
 #endif
-				{									 
+				{
 					for ( nfrecv =0; nfrecv<nfrecvx+nfrecvmod;nfrecv++) { /* While not finished. */
 						thread_id = 0;
 #if ( PROFlevel>=1 )
 						TIC(t1);
 						// msgcnt[1] = maxrecvsz;
-#endif	
+#endif
 
 						recvbuf0 = &recvbuf_BC_fwd[nfrecvx_buf*maxrecvsz];
 
 						/* Receive a message. */
 						MPI_Recv( recvbuf0, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX,
-								MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status );	 	
+								MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status );
 						// MPI_Irecv(recvbuf0,maxrecvsz,SuperLU_MPI_DOUBLE_COMPLEX,MPI_ANY_SOURCE,MPI_ANY_TAG,grid->comm,&req);
 						// ready=0;
 						// while(ready==0){
@@ -1499,18 +1501,18 @@ if(procs==1){
 						// #pragma omp taskyield
 						// }
 
-#if ( PROFlevel>=1 )		 
+#if ( PROFlevel>=1 )
 						TOC(t2, t1);
 						stat_loc[thread_id]->utime[SOL_COMM] += t2;
 
 						msg_cnt += 1;
-						msg_vol += maxrecvsz * dword;			
-#endif					  
+						msg_vol += maxrecvsz * dword;
+#endif
+
+						{
 
-						{  
-							
 							k = (*recvbuf0).r;
-		
+
 #if ( DEBUGlevel>=2 )
 							printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
 #endif
@@ -1523,13 +1525,13 @@ if(procs==1){
 
 									if(BcTree_getDestCount(LBtree_ptr[lk],'z')>0){
 
-										BcTree_forwardMessageSimple(LBtree_ptr[lk],recvbuf0,BcTree_GetMsgSize(LBtree_ptr[lk],'z')*nrhs+XK_H,'z');	
+										BcTree_forwardMessageSimple(LBtree_ptr[lk],recvbuf0,BcTree_GetMsgSize(LBtree_ptr[lk],'z')*nrhs+XK_H,'z');
 										// nfrecvx_buf++;
 									}
 
 									/*
 									 * Perform local block modifications: lsum[i] -= L_i,k * X[k]
-									 */	  
+									 */
 
 									lk = LBj( k, grid ); /* Local block number, column-wise. */
 									lsub = Lrowind_bc_ptr[lk];
@@ -1544,30 +1546,30 @@ if(procs==1){
 										}else{
 											nb   = lsub[0];
 											knsupc = SuperSize( k );
-											xin = &recvbuf0[XK_H] ;					
+											xin = &recvbuf0[XK_H] ;
 										}
 
 										zlsum_fmod_inv_master(lsum, x, xin, rtemp, nrhs, knsupc, k,
 												fmod, nb, xsup, grid, Llu,
-												stat_loc,sizelsum,sizertemp,0,maxsuper,thread_id,num_thread);	
+												stat_loc,sizelsum,sizertemp,0,maxsuper,thread_id,num_thread);
 
 									} /* if lsub */
 								}
 
 							}else if(status.MPI_TAG==RD_L){
-								// --nfrecvmod;		  
+								// --nfrecvmod;
 								lk = LBi( k, grid ); /* Local block number, row-wise. */
 
 								knsupc = SuperSize( k );
 								tempv = &recvbuf0[LSUM_H];
-								il = LSUM_BLK( lk );		  
+								il = LSUM_BLK( lk );
 								RHS_ITERATE(j) {
 									for (i = 0; i < knsupc; ++i)
 										z_add(&lsum[i + il + j*knsupc + thread_id*sizelsum],
 											  &lsum[i + il + j*knsupc + thread_id*sizelsum],
 											  &tempv[i + j*knsupc]);
-										
-								}			
+
+								}
 
 								// #ifdef _OPENMP
 								// #pragma omp atomic capture
@@ -1576,14 +1578,14 @@ if(procs==1){
 								{
 									thread_id = 0;
 									rtemp_loc = &rtemp[sizertemp* thread_id];
-									if ( fmod_tmp==0 ) {	  
+									if ( fmod_tmp==0 ) {
 										if(RdTree_IsRoot(LRtree_ptr[lk],'z')==YES){
 											// ii = X_BLK( lk );
 											knsupc = SuperSize( k );
 											for (ii=1;ii<num_thread;ii++)
 											#ifdef _OPENMP
 												#pragma omp simd
-											#endif	
+											#endif
 												for (jj=0;jj<knsupc*nrhs;jj++)
 													z_add(&lsum[il + jj ],
 														  &lsum[il + jj ],
@@ -1593,8 +1595,8 @@ if(procs==1){
 											RHS_ITERATE(j)
 												#ifdef _OPENMP
 													#pragma omp simd
-												#endif												
-												for (i = 0; i < knsupc; ++i)	
+												#endif
+												for (i = 0; i < knsupc; ++i)
 													z_add(&x[i + ii + j*knsupc],
 														  &x[i + ii + j*knsupc],
 														  &lsum[i + il + j*knsupc] );
@@ -1607,10 +1609,10 @@ if(procs==1){
 
 #if ( PROFlevel>=1 )
 											TIC(t1);
-#endif			  
+#endif
 
 											if(Llu->inv == 1){
-												Linv = Linv_bc_ptr[lk];		  
+												Linv = Linv_bc_ptr[lk];
 #ifdef _CRAY
 												CGEMM( ftcs2, ftcs2, &knsupc, &nrhs, &knsupc,
 														&alpha, Linv, &knsupc, &x[ii],
@@ -1623,23 +1625,23 @@ if(procs==1){
 												zgemm_( "N", "N", &knsupc, &nrhs, &knsupc,
 														&alpha, Linv, &knsupc, &x[ii],
 														&knsupc, &beta, rtemp_loc, &knsupc );
-#endif			   
+#endif
 												#ifdef _OPENMP
 													#pragma omp simd
 												#endif
 												for (i=0 ; i<knsupc*nrhs ; i++){
 													z_copy(&x[ii+i],&rtemp_loc[i]);
-												}		
+												}
 											}
 											else{
 #ifdef _CRAY
 												CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
 														lusup, &nsupr, &x[ii], &knsupc);
 #elif defined (USE_VENDOR_BLAS)
-												ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
-														lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);		
+												ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha,
+														lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
 #else
-												ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
+												ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha,
 														lusup, &nsupr, &x[ii], &knsupc);
 #endif
 											}
@@ -1647,7 +1649,7 @@ if(procs==1){
 #if ( PROFlevel>=1 )
 											TOC(t2, t1);
 											stat_loc[thread_id]->utime[SOL_TRSM] += t2;
-#endif	
+#endif
 
 											stat_loc[thread_id]->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs
 											+ 10 * knsupc * nrhs; /* complex division */
@@ -1657,10 +1659,10 @@ if(procs==1){
 
 											/*
 											 * Send Xk to process column Pc[k].
-											 */						  
-											if(LBtree_ptr[lk]!=NULL){ 
+											 */
+											if(LBtree_ptr[lk]!=NULL){
 												BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],BcTree_GetMsgSize(LBtree_ptr[lk],'z')*nrhs+XK_H,'z');
-											}		  
+											}
 
 
 											/*
@@ -1674,33 +1676,33 @@ if(procs==1){
 												nb = lsub[0] - 1;
 												knsupc = SuperSize( k );
 												ii = X_BLK( LBi( k, grid ) );
-												xin = &x[ii];		
+												xin = &x[ii];
 												zlsum_fmod_inv_master(lsum, x, xin, rtemp, nrhs, knsupc, k,
 														fmod, nb, xsup, grid, Llu,
-														stat_loc,sizelsum,sizertemp,0,maxsuper,thread_id,num_thread);	
+														stat_loc,sizelsum,sizertemp,0,maxsuper,thread_id,num_thread);
 											} /* if lsub */
 											// }
 
 									}else{
 
-										il = LSUM_BLK( lk );		  
+										il = LSUM_BLK( lk );
 										knsupc = SuperSize( k );
 
 										for (ii=1;ii<num_thread;ii++)
 											#ifdef _OPENMP
 												#pragma omp simd
-											#endif										
+											#endif
 											for (jj=0;jj<knsupc*nrhs;jj++)
 												z_add(&lsum[il + jj ],
 													  &lsum[il + jj ],
 													  &lsum[il + jj + ii*sizelsum]);
-										RdTree_forwardMessageSimple(LRtree_ptr[lk],&lsum[il-LSUM_H],RdTree_GetMsgSize(LRtree_ptr[lk],'z')*nrhs+LSUM_H,'z'); 
-									}  
+										RdTree_forwardMessageSimple(LRtree_ptr[lk],&lsum[il-LSUM_H],RdTree_GetMsgSize(LRtree_ptr[lk],'z')*nrhs+LSUM_H,'z');
+									}
 
 								}
 
-							}					
-						} /* check Tag */		  
+							}
+						} /* check Tag */
 					}
 
 				} /* while not finished ... */
@@ -1720,9 +1722,9 @@ if(procs==1){
 		MPI_Reduce (&t, &tmax, 1, MPI_DOUBLE,
 				MPI_MAX, 0, grid->comm);
 		if ( !iam ) {
-			printf(".. L-solve time (MAX) \t%8.4f\n", tmax);	
+			printf(".. L-solve time (MAX) \t%8.4f\n", tmax);
 			fflush(stdout);
-		}	
+		}
 
 
 		t = SuperLU_timer_();
@@ -1753,29 +1755,28 @@ if(procs==1){
 		SUPERLU_FREE(leaf_send);
 		SUPERLU_FREE(leafsups);
 		SUPERLU_FREE(recvbuf_BC_fwd);
+		log_memory(-nlb*aln_i*iword-nlb*iword-(CEILING( nsupers, Pr )+CEILING( nsupers, Pc ))*aln_i*iword- nsupers_i*iword -maxrecvsz*(nfrecvx+1)*dword*2.0, stat);	//account for fmod, frecv, leaf_send, leafsups, recvbuf_BC_fwd
 
-		log_memory(-nlb*aln_i*iword-nlb*iword-(CEILING( nsupers, Pr )-CEILING( nsupers, Pc ))*aln_i*iword- nsupers_i*iword -maxrecvsz*(nfrecvx+1)*dword, stat);	//account for fmod, frecv, leaf_send, leafsups, recvbuf_BC_fwd				
-		
 		for (lk=0;lk<nsupers_j;++lk){
 			if(LBtree_ptr[lk]!=NULL){
-				// if(BcTree_IsRoot(LBtree_ptr[lk],'z')==YES){			
-				BcTree_waitSendRequest(LBtree_ptr[lk],'z');		
+				// if(BcTree_IsRoot(LBtree_ptr[lk],'z')==YES){
+				BcTree_waitSendRequest(LBtree_ptr[lk],'z');
 				// }
 				// deallocate requests here
 			}
 		}
 
 		for (lk=0;lk<nsupers_i;++lk){
-			if(LRtree_ptr[lk]!=NULL){		
-				RdTree_waitSendRequest(LRtree_ptr[lk],'z');		
+			if(LRtree_ptr[lk]!=NULL){
+				RdTree_waitSendRequest(LRtree_ptr[lk],'z');
 				// deallocate requests here
 			}
-		}		
+		}
 		MPI_Barrier( grid->comm );
 
-#if ( VAMPIR>=1 )	
-		VT_traceoff();	
-		VT_finalize(); 
+#if ( VAMPIR>=1 )
+		VT_traceoff();
+		VT_finalize();
 #endif
 
 
@@ -1785,8 +1786,8 @@ if(procs==1){
 		 * The Y components from the forward solve is already
 		 * on the diagonal processes.
 	 *---------------------------------------------------*/
-		 
-		 
+
+
 		/* Save the count to be altered so it can be used by
 		   subsequent call to PDGSTRS. */
 		if ( !(bmod = intMalloc_dist(nlb*aln_i)) )
@@ -1799,7 +1800,7 @@ if(procs==1){
 		k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb;
 
 		/* Re-initialize lsum to zero. Each block header is already in place. */
-		
+
 #ifdef _OPENMP
 
 #pragma omp parallel default(shared) private(ii)
@@ -1808,9 +1809,9 @@ if(procs==1){
 			lsum[thread_id*sizelsum+ii]=zero;
 	}
     /* Set up the headers in lsum[]. */
-#ifdef _OPENMP	
+#ifdef _OPENMP
 	#pragma omp simd lastprivate(krow,lk,il)
-#endif		
+#endif
     for (k = 0; k < nsupers; ++k) {
 	krow = PROW( k, grid );
 	if ( myrow == krow ) {
@@ -1819,9 +1820,9 @@ if(procs==1){
 	    lsum[il - LSUM_H].r = k;/* Block number prepended in the header.*/
 	    lsum[il - LSUM_H].i = 0;
 	}
-    }	
+    }
 
-#else	
+#else
 	for (k = 0; k < nsupers; ++k) {
 		krow = PROW( k, grid );
 		if ( myrow == krow ) {
@@ -1829,15 +1830,15 @@ if(procs==1){
 			lk = LBi( k, grid );
 			il = LSUM_BLK( lk );
 			dest = &lsum[il];
-			
-			for (jj = 0; jj < num_thread; ++jj) {						
+
+			for (jj = 0; jj < num_thread; ++jj) {
 				RHS_ITERATE(j) {
 					for (i = 0; i < knsupc; ++i) dest[i + j*knsupc + jj*sizelsum] = zero;
-				}	
-			}	
+				}
+			}
 		}
 	}
-#endif		
+#endif
 
 #if ( DEBUGlevel>=2 )
 		for (p = 0; p < Pr*Pc; ++p) {
@@ -1850,7 +1851,7 @@ if(procs==1){
 						for (i = 0; i < Urbs[lb]; ++i)
 							printf("(%2d) .. row blk %2d:\
 									lbnum %d, indpos %d, valpos %d\n",
-									iam, i, 
+									iam, i,
 									Ucb_indptr[lb][i].lbnum,
 									Ucb_indptr[lb][i].indpos,
 									Ucb_valptr[lb][i]);
@@ -1878,16 +1879,16 @@ if(procs==1){
 
 	/* ---------------------------------------------------------
 	   Initialize the async Bcast trees on all processes.
-	   --------------------------------------------------------- */		
+	   --------------------------------------------------------- */
 	nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */
 
 	nbtree = 0;
 	for (lk=0;lk<nsupers_j;++lk){
 		if(UBtree_ptr[lk]!=NULL){
-			// printf("UBtree_ptr lk %5d\n",lk); 
-			if(BcTree_IsRoot(UBtree_ptr[lk],'z')==NO){			
+			// printf("UBtree_ptr lk %5d\n",lk);
+			if(BcTree_IsRoot(UBtree_ptr[lk],'z')==NO){
 				nbtree++;
-				if(BcTree_getDestCount(UBtree_ptr[lk],'z')>0)nbrecvx_buf++;				  
+				if(BcTree_getDestCount(UBtree_ptr[lk],'z')>0)nbrecvx_buf++;
 			}
 			BcTree_allocateRequest(UBtree_ptr[lk],'z');
 		}
@@ -1904,7 +1905,7 @@ if(procs==1){
 			// printf("here lk %5d myid %5d\n",lk,iam);
 			// fflush(stdout);
 			nrtree++;
-			RdTree_allocateRequest(URtree_ptr[lk],'z');			
+			RdTree_allocateRequest(URtree_ptr[lk],'z');
 			brecv[lk] = RdTree_GetDestCount(URtree_ptr[lk],'z');
 			nbrecvmod += brecv[lk];
 		}else{
@@ -1913,27 +1914,27 @@ if(procs==1){
 				kcol = PCOL( gb, grid );
 				if(mycol==kcol) { /* Diagonal process */
 					if (bmod[lk*aln_i]==0){
-						rootsups[nroot]=gb;				
+						rootsups[nroot]=gb;
 						++nroot;
 					}
 				}
 			}
 		}
-	}	
+	}
 
-	#ifdef _OPENMP	
+	#ifdef _OPENMP
 	#pragma omp simd
 	#endif
 	for (i = 0; i < nlb; ++i) bmod[i*aln_i] += brecv[i];
 	// for (i = 0; i < nlb; ++i)printf("bmod[i]: %5d\n",bmod[i]);
-	
+
 
 	if ( !(recvbuf_BC_fwd = (doublecomplex*)SUPERLU_MALLOC(maxrecvsz*(nbrecvx+1) * sizeof(doublecomplex))) )  // this needs to be optimized for 1D row mapping
-		ABORT("Malloc fails for recvbuf_BC_fwd[].");	
-	nbrecvx_buf=0;			
+		ABORT("Malloc fails for recvbuf_BC_fwd[].");
+	nbrecvx_buf=0;
+
+	log_memory(nlb*aln_i*iword+nlb*iword + nsupers_i*iword + maxrecvsz*(nbrecvx+1)*dword*2.0, stat);	//account for bmod, brecv, rootsups, recvbuf_BC_fwd
 
-	log_memory(nlb*aln_i*iword+nlb*iword + nsupers_i*iword + maxrecvsz*(nbrecvx+1)*dword, stat);	//account for bmod, brecv, rootsups, recvbuf_BC_fwd	
-	
 #if ( DEBUGlevel>=2 )
 	printf("(%2d) nbrecvx %4d,  nbrecvmod %4d,  nroot %4d\n,  nbtree %4d\n,  nrtree %4d\n",
 			iam, nbrecvx, nbrecvmod, nroot, nbtree, nrtree);
@@ -1945,7 +1946,7 @@ if(procs==1){
 	t = SuperLU_timer_() - t;
 	if ( !iam) printf(".. Setup U-solve time\t%8.4f\n", t);
 	fflush(stdout);
-	MPI_Barrier( grid->comm );	
+	MPI_Barrier( grid->comm );
 	t = SuperLU_timer_();
 #endif
 
@@ -1954,35 +1955,35 @@ if(procs==1){
 		 */
 #if ( DEBUGlevel>=2 )
 		printf("(%2d) nroot %4d\n", iam, nroot);
-		fflush(stdout);				
+		fflush(stdout);
 #endif
-		
-		
+
+
 
 #ifdef _OPENMP
-#pragma omp parallel default (shared) 
+#pragma omp parallel default (shared)
 #endif
-	{	
+	{
 #ifdef _OPENMP
 #pragma omp master
 #endif
 		{
 #ifdef _OPENMP
-#pragma	omp	taskloop firstprivate (nrhs,beta,alpha,x,rtemp,ldalsum) private (ii,jj,k,knsupc,lk,luptr,lsub,nsupr,lusup,t1,t2,Uinv,i,lib,rtemp_loc,nroot_send_tmp) nogroup		
-#endif		
+#pragma	omp	taskloop firstprivate (nrhs,beta,alpha,x,rtemp,ldalsum) private (ii,jj,k,knsupc,lk,luptr,lsub,nsupr,lusup,t1,t2,Uinv,i,lib,rtemp_loc,nroot_send_tmp) nogroup
+#endif
 		for (jj=0;jj<nroot;jj++){
-			k=rootsups[jj];	
+			k=rootsups[jj];
 
 #if ( PROFlevel>=1 )
 			TIC(t1);
-#endif	
+#endif
 
 			rtemp_loc = &rtemp[sizertemp* thread_id];
 
 
-			
+
 			knsupc = SuperSize( k );
-			lk = LBi( k, grid ); /* Local block number, row-wise. */		
+			lk = LBi( k, grid ); /* Local block number, row-wise. */
 
 			// bmod[lk] = -1;       /* Do not solve X[k] in the future. */
 			ii = X_BLK( lk );
@@ -2007,22 +2008,22 @@ if(procs==1){
 				zgemm_( "N", "N", &knsupc, &nrhs, &knsupc,
 						&alpha, Uinv, &knsupc, &x[ii],
 						&knsupc, &beta, rtemp_loc, &knsupc );
-#endif			   
+#endif
 				#ifdef _OPENMP
 					#pragma omp simd
 				#endif
 				for (i=0 ; i<knsupc*nrhs ; i++){
 					z_copy(&x[ii+i],&rtemp_loc[i]);
-				}		
+				}
 			}else{
 #ifdef _CRAY
 				CTRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
 						lusup, &nsupr, &x[ii], &knsupc);
 #elif defined (USE_VENDOR_BLAS)
-				ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
-						lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);	
+				ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha,
+						lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
 #else
-				ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+				ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha,
 						lusup, &nsupr, &x[ii], &knsupc);
 #endif
 			}
@@ -2039,7 +2040,7 @@ if(procs==1){
 #if ( PROFlevel>=1 )
 			TOC(t2, t1);
 			stat_loc[thread_id]->utime[SOL_TRSM] += t2;
-#endif	
+#endif
 			stat_loc[thread_id]->ops[SOLVE] += 4 * knsupc * (knsupc + 1) * nrhs
 			+ 10 * knsupc * nrhs; /* complex division */
 
@@ -2051,46 +2052,46 @@ if(procs==1){
 			 * Send Xk to process column Pc[k].
 			 */
 
-			if(UBtree_ptr[lk]!=NULL){ 
+			if(UBtree_ptr[lk]!=NULL){
 #ifdef _OPENMP
 #pragma omp atomic capture
 #endif
 				nroot_send_tmp = ++nroot_send;
 				root_send[(nroot_send_tmp-1)*aln_i] = lk;
-				
+
 			}
 		} /* for k ... */
 	}
 }
 
-		
+
 #ifdef _OPENMP
-#pragma omp parallel default (shared) 
+#pragma omp parallel default (shared)
 #endif
-	{			
+	{
 #ifdef _OPENMP
 #pragma omp master
 #endif
 		{
 #ifdef _OPENMP
-#pragma	omp	taskloop private (ii,jj,k,lk) nogroup		
-#endif		
+#pragma	omp	taskloop private (ii,jj,k,lk) nogroup
+#endif
 		for (jj=0;jj<nroot;jj++){
-			k=rootsups[jj];	
-			lk = LBi( k, grid ); /* Local block number, row-wise. */		
+			k=rootsups[jj];
+			lk = LBi( k, grid ); /* Local block number, row-wise. */
 			ii = X_BLK( lk );
 			lk = LBj( k, grid ); /* Local block number, column-wise */
 
 			/*
 			 * Perform local block modifications: lsum[i] -= U_i,k * X[k]
 			 */
-			if ( Urbs[lk] ) 
-				zlsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, k, bmod, Urbs,Urbs2, 
+			if ( Urbs[lk] )
+				zlsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, k, bmod, Urbs,
 						Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
-						send_req, stat_loc, root_send, &nroot_send, sizelsum,sizertemp,thread_id,num_thread);
-									
+						stat_loc, root_send, &nroot_send, sizelsum,sizertemp,thread_id,num_thread);
+
 		} /* for k ... */
-		
+
 	}
 }
 
@@ -2099,7 +2100,7 @@ for (i=0;i<nroot_send;i++){
 	if(lk>=0){ // this is a bcast forwarding
 		gb = mycol+lk*grid->npcol;  /* not sure */
 		lib = LBi( gb, grid ); /* Local block number, row-wise. */
-		ii = X_BLK( lib );			
+		ii = X_BLK( lib );
 		BcTree_forwardMessageSimple(UBtree_ptr[lk],&x[ii - XK_H],BcTree_GetMsgSize(UBtree_ptr[lk],'z')*nrhs+XK_H,'z');
 	}else{ // this is a reduce forwarding
 		lk = -lk - 1;
@@ -2114,38 +2115,38 @@ for (i=0;i<nroot_send;i++){
 		 */
 
 #ifdef _OPENMP
-#pragma omp parallel default (shared) 
+#pragma omp parallel default (shared)
 #endif
-	{	
+	{
 #ifdef _OPENMP
-#pragma omp master 
-#endif		 
+#pragma omp master
+#endif
 		for ( nbrecv =0; nbrecv<nbrecvx+nbrecvmod;nbrecv++) { /* While not finished. */
 
 			// printf("iam %4d nbrecv %4d nbrecvx %4d nbrecvmod %4d\n", iam, nbrecv, nbrecvxnbrecvmod);
-			// fflush(stdout);			
-			
-			
-			
+			// fflush(stdout);
+
+
+
 			thread_id = 0;
 #if ( PROFlevel>=1 )
 			TIC(t1);
-#endif	
+#endif
 
 			recvbuf0 = &recvbuf_BC_fwd[nbrecvx_buf*maxrecvsz];
 
 			/* Receive a message. */
 			MPI_Recv( recvbuf0, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX,
-					MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status );	 	
+					MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status );
 
-#if ( PROFlevel>=1 )		 
+#if ( PROFlevel>=1 )
 			TOC(t2, t1);
 			stat_loc[thread_id]->utime[SOL_COMM] += t2;
 
 			msg_cnt += 1;
-			msg_vol += maxrecvsz * dword;			
-#endif	
-		 
+			msg_vol += maxrecvsz * dword;
+#endif
+
 			k = (*recvbuf0).r;
 #if ( DEBUGlevel>=2 )
 			printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
@@ -2155,69 +2156,69 @@ for (i=0;i<nroot_send;i++){
 			if(status.MPI_TAG==BC_U){
 				// --nfrecvx;
 				nbrecvx_buf++;
-				
+
 				lk = LBj( k, grid );    /* local block number */
 
 				if(BcTree_getDestCount(UBtree_ptr[lk],'z')>0){
 
-					BcTree_forwardMessageSimple(UBtree_ptr[lk],recvbuf0,BcTree_GetMsgSize(UBtree_ptr[lk],'z')*nrhs+XK_H,'z');	
+					BcTree_forwardMessageSimple(UBtree_ptr[lk],recvbuf0,BcTree_GetMsgSize(UBtree_ptr[lk],'z')*nrhs+XK_H,'z');
 					// nfrecvx_buf++;
 				}
 
 				/*
 				 * Perform local block modifications: lsum[i] -= L_i,k * X[k]
-				 */	  
+				 */
 
 				lk = LBj( k, grid ); /* Local block number, column-wise. */
-				zlsum_bmod_inv_master(lsum, x, &recvbuf0[XK_H], rtemp, nrhs, k, bmod, Urbs,Urbs2,
-						Ucb_indptr, Ucb_valptr, xsup, grid, Llu, 
-						send_req, stat_loc, sizelsum,sizertemp,thread_id,num_thread);
+				zlsum_bmod_inv_master(lsum, x, &recvbuf0[XK_H], rtemp, nrhs, k, bmod, Urbs,
+						Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
+						stat_loc, sizelsum,sizertemp,thread_id,num_thread);
 			}else if(status.MPI_TAG==RD_U){
 
 				lk = LBi( k, grid ); /* Local block number, row-wise. */
-				
+
 				knsupc = SuperSize( k );
 				tempv = &recvbuf0[LSUM_H];
-				il = LSUM_BLK( lk );		  
+				il = LSUM_BLK( lk );
 				RHS_ITERATE(j) {
 					#ifdef _OPENMP
 						#pragma omp simd
-					#endif				
+					#endif
 					for (i = 0; i < knsupc; ++i)
 						z_add(&lsum[i + il + j*knsupc + thread_id*sizelsum],
 							  &lsum[i + il + j*knsupc + thread_id*sizelsum],
 							  &tempv[i + j*knsupc]);
-							
-				}					
+
+				}
 			// #ifdef _OPENMP
 			// #pragma omp atomic capture
 			// #endif
 				bmod_tmp=--bmod[lk*aln_i];
-				thread_id = 0;									
+				thread_id = 0;
 				rtemp_loc = &rtemp[sizertemp* thread_id];
 				if ( bmod_tmp==0 ) {
-					if(RdTree_IsRoot(URtree_ptr[lk],'z')==YES){							
-						
+					if(RdTree_IsRoot(URtree_ptr[lk],'z')==YES){
+
 						knsupc = SuperSize( k );
 						for (ii=1;ii<num_thread;ii++)
 							#ifdef _OPENMP
 								#pragma omp simd
-							#endif							
+							#endif
 							for (jj=0;jj<knsupc*nrhs;jj++)
 								z_add(&lsum[il+ jj ],
 									  &lsum[il+ jj ],
 									  &lsum[il + jj + ii*sizelsum]);
-								
+
 						ii = X_BLK( lk );
 						RHS_ITERATE(j)
 							#ifdef _OPENMP
 								#pragma omp simd
-							#endif							
-							for (i = 0; i < knsupc; ++i)	
+							#endif
+							for (i = 0; i < knsupc; ++i)
 								z_add(&x[i + ii + j*knsupc],
 									  &x[i + ii + j*knsupc],
 									  &lsum[i + il + j*knsupc] );
-					
+
 						lk = LBj( k, grid ); /* Local block number, column-wise. */
 						lsub = Lrowind_bc_ptr[lk];
 						lusup = Lnzval_bc_ptr[lk];
@@ -2239,23 +2240,23 @@ for (i=0;i<nroot_send;i++){
 							zgemm_( "N", "N", &knsupc, &nrhs, &knsupc,
 									&alpha, Uinv, &knsupc, &x[ii],
 									&knsupc, &beta, rtemp_loc, &knsupc );
-#endif		
+#endif
 
 							#ifdef _OPENMP
 								#pragma omp simd
 							#endif
 							for (i=0 ; i<knsupc*nrhs ; i++){
 								z_copy(&x[ii+i],&rtemp_loc[i]);
-							}		
+							}
 						}else{
 #ifdef _CRAY
 							CTRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
 									lusup, &nsupr, &x[ii], &knsupc);
 #elif defined (USE_VENDOR_BLAS)
-							ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
-									lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);		
+							ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha,
+									lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
 #else
-							ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
+							ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha,
 									lusup, &nsupr, &x[ii], &knsupc);
 #endif
 						}
@@ -2263,47 +2264,47 @@ for (i=0;i<nroot_send;i++){
 #if ( PROFlevel>=1 )
 							TOC(t2, t1);
 							stat_loc[thread_id]->utime[SOL_TRSM] += t2;
-#endif	
+#endif
 							stat_loc[thread_id]->ops[SOLVE] += 4 * knsupc * (knsupc + 1) * nrhs
 							+ 10 * knsupc * nrhs; /* complex division */
-		
+
 #if ( DEBUGlevel>=2 )
 						printf("(%2d) Solve X[%2d]\n", iam, k);
 #endif
 
 						/*
 						 * Send Xk to process column Pc[k].
-						 */						
-						if(UBtree_ptr[lk]!=NULL){ 
+						 */
+						if(UBtree_ptr[lk]!=NULL){
 							BcTree_forwardMessageSimple(UBtree_ptr[lk],&x[ii - XK_H],BcTree_GetMsgSize(UBtree_ptr[lk],'z')*nrhs+XK_H,'z');
-						}							
-						
+						}
+
 
 						/*
-						 * Perform local block modifications: 
+						 * Perform local block modifications:
 						 *         lsum[i] -= U_i,k * X[k]
 						 */
 						if ( Urbs[lk] )
-							zlsum_bmod_inv_master(lsum, x, &x[ii], rtemp, nrhs, k, bmod, Urbs,Urbs2,
+							zlsum_bmod_inv_master(lsum, x, &x[ii], rtemp, nrhs, k, bmod, Urbs,
 									Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
-									send_req, stat_loc, sizelsum,sizertemp,thread_id,num_thread);
+									stat_loc, sizelsum,sizertemp,thread_id,num_thread);
 
 					}else{
-						il = LSUM_BLK( lk );		  
+						il = LSUM_BLK( lk );
 						knsupc = SuperSize( k );
 
 						for (ii=1;ii<num_thread;ii++)
 							#ifdef _OPENMP
 								#pragma omp simd
-							#endif						
+							#endif
 							for (jj=0;jj<knsupc*nrhs;jj++)
 								z_add(&lsum[il+ jj ],
 									  &lsum[il+ jj ],
 									  &lsum[il + jj + ii*sizelsum]);
-												
-						RdTree_forwardMessageSimple(URtree_ptr[lk],&lsum[il-LSUM_H],RdTree_GetMsgSize(URtree_ptr[lk],'z')*nrhs+LSUM_H,'z'); 
-					}						
-				
+
+						RdTree_forwardMessageSimple(URtree_ptr[lk],&lsum[il-LSUM_H],RdTree_GetMsgSize(URtree_ptr[lk],'z')*nrhs+LSUM_H,'z');
+					}
+
 				}
 			}
 		} /* while not finished ... */
@@ -2315,10 +2316,10 @@ for (i=0;i<nroot_send;i++){
 		MPI_Reduce (&t, &tmax, 1, MPI_DOUBLE,
 				MPI_MAX, 0, grid->comm);
 		if ( !iam ) {
-			printf(".. U-solve time (MAX) \t%8.4f\n", tmax);	
+			printf(".. U-solve time (MAX) \t%8.4f\n", tmax);
 			fflush(stdout);
-		}			
-		t = SuperLU_timer_();			
+		}
+		t = SuperLU_timer_();
 #endif
 
 
@@ -2354,14 +2355,14 @@ for (i=0;i<nroot_send;i++){
 				ScalePermstruct, Glu_persist, grid, SOLVEstruct);
 
 
-#if ( PRNTlevel>=1 )
+#if ( PRNTlevel>=2 )
 		t = SuperLU_timer_() - t;
 		if ( !iam) printf(".. X to B redistribute time\t%8.4f\n", t);
 		t = SuperLU_timer_();
-#endif	
+#endif
 
 
-		double tmp1=0; 
+		double tmp1=0;
 		double tmp2=0;
 		double tmp3=0;
 		double tmp4=0;
@@ -2372,14 +2373,14 @@ for (i=0;i<nroot_send;i++){
 			tmp4 += stat_loc[i]->ops[SOLVE];
 #if ( PRNTlevel>=2 )
 			if(iam==0)printf("thread %5d gemm %9.5f\n",i,stat_loc[i]->utime[SOL_GEMM]);
-#endif	
+#endif
 		}
 
 
 		stat->utime[SOL_TRSM] += tmp1;
 		stat->utime[SOL_GEMM] += tmp2;
 		stat->utime[SOL_COMM] += tmp3;
-		stat->ops[SOLVE]+= tmp4;	  
+		stat->ops[SOLVE]+= tmp4;
 
 
 		/* Deallocate storage. */
@@ -2387,40 +2388,38 @@ for (i=0;i<nroot_send;i++){
 			PStatFree(stat_loc[i]);
 			SUPERLU_FREE(stat_loc[i]);
 		}
-		SUPERLU_FREE(stat_loc);		
+		SUPERLU_FREE(stat_loc);
 		SUPERLU_FREE(rtemp);
 		SUPERLU_FREE(lsum);
 		SUPERLU_FREE(x);
-		
-		
+
+
 		SUPERLU_FREE(bmod);
 		SUPERLU_FREE(brecv);
 		SUPERLU_FREE(root_send);
-		
+
 		SUPERLU_FREE(rootsups);
-		SUPERLU_FREE(recvbuf_BC_fwd);		
+		SUPERLU_FREE(recvbuf_BC_fwd);
+
+		log_memory(-nlb*aln_i*iword-nlb*iword - nsupers_i*iword - (CEILING( nsupers, Pr )+CEILING( nsupers, Pc ))*aln_i*iword - maxrecvsz*(nbrecvx+1)*dword*2.0 - sizelsum*num_thread * dword*2.0 - (ldalsum * nrhs + nlb * XK_H) *dword*2.0 - (sizertemp*num_thread + 1)*dword*2.0, stat);	//account for bmod, brecv, root_send, rootsups, recvbuf_BC_fwd,rtemp,lsum,x
 
-		log_memory(-nlb*aln_i*iword-nlb*iword - nsupers_i*iword - (CEILING( nsupers, Pr )+CEILING( nsupers, Pc ))*aln_i*iword - maxrecvsz*(nbrecvx+1)*dword - sizelsum*num_thread * dword - (ldalsum * nrhs + nlb * XK_H) *dword + (sizertemp*num_thread - 1)*dword, stat);	//account for bmod, brecv, root_send, rootsups, recvbuf_BC_fwd,rtemp,lsum,x			
-		
 		for (lk=0;lk<nsupers_j;++lk){
 			if(UBtree_ptr[lk]!=NULL){
-				// if(BcTree_IsRoot(LBtree_ptr[lk],'z')==YES){			
-				BcTree_waitSendRequest(UBtree_ptr[lk],'z');		
+				// if(BcTree_IsRoot(LBtree_ptr[lk],'z')==YES){
+				BcTree_waitSendRequest(UBtree_ptr[lk],'z');
 				// }
 				// deallocate requests here
 			}
 		}
 
 		for (lk=0;lk<nsupers_i;++lk){
-			if(URtree_ptr[lk]!=NULL){		
-				RdTree_waitSendRequest(URtree_ptr[lk],'z');		
+			if(URtree_ptr[lk]!=NULL){
+				RdTree_waitSendRequest(URtree_ptr[lk],'z');
 				// deallocate requests here
 			}
-		}		
+		}
 		MPI_Barrier( grid->comm );
 
-		/*for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]);*/
-
 
 #if ( PROFlevel>=2 )
 		{
@@ -2442,7 +2441,7 @@ for (i=0;i<nroot_send;i++){
 						msg_vol_sum / Pr / Pc * 1e-6, msg_vol_max * 1e-6);
 			}
 		}
-#endif	
+#endif
 
     stat->utime[SOLVE] = SuperLU_timer_() - t1_sol;
 
@@ -2451,10 +2450,10 @@ for (i=0;i<nroot_send;i++){
 #endif
 
 
-#if ( PRNTlevel>=2 )	
+#if ( PRNTlevel>=2 )
 	    float for_lu, total, max, avg, temp;
 		superlu_dist_mem_usage_t num_mem_usage;
-		
+
 	    dQuerySpace_dist(n, LUstruct, grid, stat, &num_mem_usage);
 	    temp = num_mem_usage.total;
 
@@ -2466,13 +2465,13 @@ for (i=0;i<nroot_send;i++){
 		printf("\n** Memory Usage **********************************\n");
                 printf("** Total highmark (MB):\n"
 		       "    Sum-of-all : %8.2f | Avg : %8.2f  | Max : %8.2f\n",
-		       avg * 1e-6,  
+		       avg * 1e-6,
 		       avg / grid->nprow / grid->npcol * 1e-6,
 		       max * 1e-6);
 		printf("**************************************************\n");
 		fflush(stdout);
             }
-#endif	
+#endif
 
 
     return;
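
Note on the hunks above (the complex U-solve in pzgstrs): besides stripping trailing whitespace, this release drops the Urbs2 and send_req arguments from the zlsum_bmod_inv and zlsum_bmod_inv_master calls (matching the reduced prototypes in SRC/pzgstrs_lsum.c below), raises the "X to B redistribute time" print from PRNTlevel>=1 to >=2, and appends a factor of 2.0 to the log_memory terms for the doublecomplex buffers (recvbuf_BC_fwd, lsum, x, rtemp) -- evidently because each doublecomplex entry occupies two doubles while dword is declared as sizeof(double) in the pzgstrs_lsum.c routines shown below; the rtemp term in the deallocation call also switches from "+" to "-", so it is now subtracted consistently with the other buffers. A minimal sketch of that accounting convention, with hypothetical sizes and not part of the patch:

/* Sketch only: how the "* dword * 2.0" terms above count bytes for a
 * doublecomplex buffer, assuming dword == sizeof(double). */
#include <stdio.h>

typedef struct { double r, i; } doublecomplex;   /* two doubles per entry */

static double complex_buffer_bytes(long nentries)
{
    const double dword = sizeof(double);         /* 8 bytes on typical targets */
    return nentries * dword * 2.0;               /* real + imaginary part */
}

int main(void)
{
    long maxrecvsz = 1024, nbrecvx = 3;          /* hypothetical values */
    printf("recvbuf_BC_fwd accounted as %.0f bytes\n",
           complex_buffer_bytes(maxrecvsz * (nbrecvx + 1)));
    return 0;
}
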
diff -pruN 6.1.0+dfsg1-1/SRC/pzgstrs_lsum.c 6.1.1+dfsg1-1/SRC/pzgstrs_lsum.c
--- 6.1.0+dfsg1-1/SRC/pzgstrs_lsum.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/pzgstrs_lsum.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,25 +1,26 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
-/*! @file 
+/*! @file
  * \brief Perform local block modifications: lsum[i] -= L_i,k * X[k]
  *
  * <pre>
- * -- Distributed SuperLU routine (version 2.0) --
+ * -- Distributed SuperLU routine (version 6.1) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * March 15, 2003
  *
  * Modified:
  *     Feburary 7, 2001    use MPI_Isend/MPI_Irecv
  *     October 2, 2001     use MPI_Isend/MPI_Irecv with MPI_Test
+ * February 8, 2019  version 6.1.1
  * </pre>
  */
 
@@ -28,7 +29,7 @@ at the top-level directory.
 
 #ifndef CACHELINE
 #define CACHELINE 64  /* bytes, Xeon Phi KNL, Cori haswell, Edision */
-#endif	
+#endif
 
 #define ISEND_IRECV
 
@@ -38,7 +39,7 @@ at the top-level directory.
 #ifdef _CRAY
 fortran void CTRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, doublecomplex*,
 		   doublecomplex*, int*, doublecomplex*, int*);
-fortran void CGEMM(_fcd, _fcd, int*, int*, int*, doublecomplex*, doublecomplex*, 
+fortran void CGEMM(_fcd, _fcd, int*, int*, int*, doublecomplex*, doublecomplex*,
 		   int*, doublecomplex*, int*, doublecomplex*, doublecomplex*, int*);
 _fcd ftcs1;
 _fcd ftcs2;
@@ -90,11 +91,11 @@ void zlsum_fmod
 #if ( PROFlevel>=1 )
 	double t1, t2;
 	float msg_vol = 0, msg_cnt = 0;
-#endif 
+#endif
 #if ( PROFlevel>=1 )
 	TIC(t1);
-#endif	
-	
+#endif
+
     iam = grid->iam;
     myrow = MYROW( iam, grid );
     lk = LBj( k, grid ); /* Local block number, column-wise. */
@@ -119,7 +120,7 @@ void zlsum_fmod
 	       &knsupc, &beta, rtemp, &nbrow );
 #endif
 	stat->ops[SOLVE] += 8 * nbrow * nrhs * knsupc + 2 * nbrow * nrhs;
-   
+
 	lk = LBi( ik, grid ); /* Local block number, row-wise. */
 	iknsupc = SuperSize( ik );
 	il = LSUM_BLK( lk );
@@ -138,8 +139,8 @@ void zlsum_fmod
 #if ( PROFlevel>=1 )
 		TOC(t2, t1);
 		stat->utime[SOL_GEMM] += t2;
-#endif		
-	
+#endif
+
 	if ( (--fmod[lk])==0 ) { /* Local accumulation done. */
 	    ikcol = PCOL( ik, grid );
 	    p = PNUM( myrow, ikcol, grid );
@@ -176,28 +177,28 @@ void zlsum_fmod
 		    nsupr1 = lsub1[1];
 #if ( PROFlevel>=1 )
 			TIC(t1);
-#endif				
+#endif
 #ifdef _CRAY
 		    CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &iknsupc, &nrhs, &alpha,
 			  lusup1, &nsupr1, &x[ii], &iknsupc);
 #elif defined (USE_VENDOR_BLAS)
-		    ztrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, 
+		    ztrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha,
 			   lusup1, &nsupr1, &x[ii], &iknsupc, 1, 1, 1, 1);
 #else
-		    ztrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, 
+		    ztrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha,
 			   lusup1, &nsupr1, &x[ii], &iknsupc);
 #endif
 #if ( PROFlevel>=1 )
 			TOC(t2, t1);
 			stat->utime[SOL_TRSM] += t2;
-#endif	
+#endif
 
 		    stat->ops[SOLVE] += 4 * iknsupc * (iknsupc - 1) * nrhs
 			+ 10 * knsupc * nrhs; /* complex division */
 #if ( DEBUGlevel>=2 )
 		    printf("(%2d) Solve X[%2d]\n", iam, ik);
 #endif
-		
+
 		    /*
 		     * Send Xk to process column Pc[k].
 		     */
@@ -357,10 +358,10 @@ void zlsum_bmod
 		    CTRSM(ftcs1, ftcs3, ftcs2, ftcs2, &iknsupc, &nrhs, &alpha,
 			  lusup, &nsupr, &x[ii], &iknsupc);
 #elif defined (USE_VENDOR_BLAS)
-		    ztrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha, 
+		    ztrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha,
 			   lusup, &nsupr, &x[ii], &iknsupc, 1, 1, 1, 1);
 #else
-		    ztrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha, 
+		    ztrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha,
 			   lusup, &nsupr, &x[ii], &iknsupc);
 #endif
 		    stat->ops[SOLVE] += 4 * iknsupc * (iknsupc + 1) * nrhs
@@ -441,13 +442,13 @@ void zlsum_fmod_inv
  int_t recurlevel,
  int_t maxsuper,
  int thread_id,
- int num_thread 
+ int num_thread
 )
 {
     doublecomplex alpha = {1.0, 0.0}, beta = {0.0, 0.0},malpha={-1.0, 0.0};
     doublecomplex *lusup, *lusup1;
     doublecomplex *dest;
-	doublecomplex *Linv;/* Inverse of diagonal block */    	
+	doublecomplex *Linv;/* Inverse of diagonal block */
 	int    iam, iknsupc, myrow, krow, nbrow, nbrow1, nbrow_ref, nsupr, nsupr1, p, pi, idx_r,m;
 	int_t  i, ii,jj, ik, il, ikcol, irow, j, lb, lk, rel, lib,lready;
 	int_t  *lsub, *lsub1, nlb1, lptr1, luptr1,*lloc;
@@ -456,7 +457,7 @@ void zlsum_fmod_inv
     int_t  **fsendx_plist = Llu->fsendx_plist;
 	int_t  luptr_tmp,luptr_tmp1,lptr1_tmp,maxrecvsz, idx_i, idx_v,idx_n,  idx_l, fmod_tmp, lbstart,lbend,nn,Nchunk,nlb_loc,remainder;
 	int thread_id1;
-	flops_t ops_loc=0.0;    	
+	flops_t ops_loc=0.0;
     MPI_Status status;
     int test_flag;
 	yes_no_t done;
@@ -468,22 +469,22 @@ void zlsum_fmod_inv
 	int_t nleaf_send_tmp;
 	int_t lptr;      /* Starting position in lsub[*].                      */
 	int_t luptr;     /* Starting position in lusup[*].                     */
-	int_t iword = sizeof(int_t);	
-	int_t dword = sizeof (double);		
+	int_t iword = sizeof(int_t);
+	int_t dword = sizeof (double);
 	int_t aln_d,aln_i;
 	aln_d = ceil(CACHELINE/(double)dword);
 	aln_i = ceil(CACHELINE/(double)iword);
 	int   knsupc;    /* Size of supernode k.                               */
 	int_t nlb;       /* Number of L blocks.                                */
-	
-	
+
+
 	knsupc = SuperSize( k );
-	
+
 	lk = LBj( k, grid ); /* Local block number, column-wise. */
 	lsub = Llu->Lrowind_bc_ptr[lk];
 	nlb = lsub[0] - 1;
-	
-	
+
+
 	ldalsum=Llu->ldalsum;
 
 	rtemp_loc = &rtemp[sizertemp* thread_id];
@@ -491,7 +492,7 @@ void zlsum_fmod_inv
 	// #if ( PROFlevel>=1 )
 	double t1, t2, t3, t4;
 	float msg_vol = 0, msg_cnt = 0;
-	// #endif 
+	// #endif
 
 	if(nlb>0){
 
@@ -522,9 +523,9 @@ void zlsum_fmod_inv
 		}
 
 		assert(m>0);
-				
-		if(m>8*maxsuper){ 
-		// if(0){ 
+
+		if(m>8*maxsuper){
+		// if(0){
 
 			// Nchunk=floor(num_thread/2.0)+1;
 			Nchunk=SUPERLU_MIN(num_thread,nlb);
@@ -533,15 +534,15 @@ void zlsum_fmod_inv
 			remainder = nlb % Nchunk;
 
 #ifdef _OPENMP
-#pragma	omp	taskloop private (lptr1,luptr1,nlb1,thread_id1,lsub1,lusup1,nsupr1,Linv,nn,lbstart,lbend,luptr_tmp1,nbrow,lb,lptr1_tmp,rtemp_loc,nbrow_ref,lptr,nbrow1,ik,rel,lk,iknsupc,il,i,irow,fmod_tmp,ikcol,p,ii,jj,t1,t2,j,nleaf_send_tmp) untied nogroup	
-#endif	
+#pragma	omp	taskloop private (lptr1,luptr1,nlb1,thread_id1,lsub1,lusup1,nsupr1,Linv,nn,lbstart,lbend,luptr_tmp1,nbrow,lb,lptr1_tmp,rtemp_loc,nbrow_ref,lptr,nbrow1,ik,rel,lk,iknsupc,il,i,irow,fmod_tmp,ikcol,p,ii,jj,t1,t2,j,nleaf_send_tmp) untied nogroup
+#endif
 			for (nn=0;nn<Nchunk;++nn){
 
-#ifdef _OPENMP				 
+#ifdef _OPENMP
 				thread_id1 = omp_get_thread_num ();
 #else
 				thread_id1 = 0;
-#endif		
+#endif
 				rtemp_loc = &rtemp[sizertemp* thread_id1];
 
 				if(nn<remainder){
@@ -556,14 +557,14 @@ void zlsum_fmod_inv
 
 #if ( PROFlevel>=1 )
 					TIC(t1);
-#endif				
+#endif
 					luptr_tmp1 = lloc[lbstart+idx_v];
 					nbrow=0;
-					for (lb = lbstart; lb < lbend; ++lb){ 		
-						lptr1_tmp = lloc[lb+idx_i];		
+					for (lb = lbstart; lb < lbend; ++lb){
+						lptr1_tmp = lloc[lb+idx_i];
 						nbrow += lsub[lptr1_tmp+1];
 					}
-					
+
 				#ifdef _CRAY
 					CGEMM( ftcs2, ftcs2, &nbrow, &nrhs, &knsupc,
 						  &alpha, &lusup[luptr_tmp1], &nsupr, xk,
@@ -579,22 +580,22 @@ void zlsum_fmod_inv
 				#endif
 
 					nbrow_ref=0;
-					for (lb = lbstart; lb < lbend; ++lb){ 		
-						lptr1_tmp = lloc[lb+idx_i];	
-						lptr= lptr1_tmp+2;	
+					for (lb = lbstart; lb < lbend; ++lb){
+						lptr1_tmp = lloc[lb+idx_i];
+						lptr= lptr1_tmp+2;
 						nbrow1 = lsub[lptr1_tmp+1];
 						ik = lsub[lptr1_tmp]; /* Global block number, row-wise. */
 						rel = xsup[ik]; /* Global row index of block ik. */
-	
-						lk = LBi( ik, grid ); /* Local block number, row-wise. */	
+
+						lk = LBi( ik, grid ); /* Local block number, row-wise. */
 
 						iknsupc = SuperSize( ik );
 						il = LSUM_BLK( lk );
 
 						RHS_ITERATE(j)
 							#ifdef _OPENMP
-							#pragma omp simd							
-							#endif						
+							#pragma omp simd
+							#endif
 							for (i = 0; i < nbrow1; ++i) {
 								irow = lsub[lptr+i] - rel; /* Relative row. */
 								z_sub(&lsum[il+irow + j*iknsupc+sizelsum*thread_id1],
@@ -607,7 +608,7 @@ void zlsum_fmod_inv
 #if ( PROFlevel>=1 )
 					TOC(t2, t1);
 					stat[thread_id1]->utime[SOL_GEMM] += t2;
-#endif	
+#endif
 
 					for (lb=lbstart;lb<lbend;lb++){
 						lk = lloc[lb+idx_n];
@@ -618,10 +619,10 @@ void zlsum_fmod_inv
 
 						if ( fmod_tmp==0 ) { /* Local accumulation done. */
 
-							lptr1_tmp = lloc[lb+idx_i];	
+							lptr1_tmp = lloc[lb+idx_i];
 
 							ik = lsub[lptr1_tmp]; /* Global block number, row-wise. */
-							lk = LBi( ik, grid ); /* Local block number, row-wise. */	
+							lk = LBi( ik, grid ); /* Local block number, row-wise. */
 
 							iknsupc = SuperSize( ik );
 							il = LSUM_BLK( lk );
@@ -631,28 +632,28 @@ void zlsum_fmod_inv
 							if ( iam != p ) {
 								for (ii=1;ii<num_thread;ii++)
 									#ifdef _OPENMP
-									#pragma omp simd							
+									#pragma omp simd
 									#endif
 									for (jj=0;jj<iknsupc*nrhs;jj++)
 										z_add(&lsum[il + jj ],
 											  &lsum[il + jj ],
 											  &lsum[il + jj + ii*sizelsum]);
-								
+
 #ifdef _OPENMP
 #pragma omp atomic capture
 #endif
 								nleaf_send_tmp = ++nleaf_send[0];
-								leaf_send[(nleaf_send_tmp-1)*aln_i] = -lk-1;	
+								leaf_send[(nleaf_send_tmp-1)*aln_i] = -lk-1;
 								// RdTree_forwardMessageSimple(LRtree_ptr[lk],&lsum[il - LSUM_H ],'z');
 
 							} else { /* Diagonal process: X[i] += lsum[i]. */
 
 #if ( PROFlevel>=1 )
 								TIC(t1);
-#endif		
+#endif
 								for (ii=1;ii<num_thread;ii++)
 									#ifdef _OPENMP
-									#pragma omp simd							
+									#pragma omp simd
 									#endif
 									for (jj=0;jj<iknsupc*nrhs;jj++)
 										z_add(&lsum[il + jj ],
@@ -662,13 +663,13 @@ void zlsum_fmod_inv
 								ii = X_BLK( lk );
 								RHS_ITERATE(j)
 									#ifdef _OPENMP
-									#pragma omp simd							
-									#endif								
-									for (i = 0; i < iknsupc; ++i)	
+									#pragma omp simd
+									#endif
+									for (i = 0; i < iknsupc; ++i)
 										z_add(&x[i + ii + j*iknsupc],
 											  &x[i + ii + j*iknsupc],
 											  &lsum[i + il + j*iknsupc] );
-										
+
 
 								// fmod[lk] = -1; /* Do not solve X[k] in the future. */
 								lk = LBj( ik, grid );/* Local block number, column-wise. */
@@ -678,8 +679,8 @@ void zlsum_fmod_inv
 
 								if(Llu->inv == 1){
 									Linv = Llu->Linv_bc_ptr[lk];
-									
-									
+
+
 #ifdef _CRAY
 									CGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc,
 											&alpha, Linv, &iknsupc, &x[ii],
@@ -692,25 +693,25 @@ void zlsum_fmod_inv
 									zgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc,
 											&alpha, Linv, &iknsupc, &x[ii],
 											&iknsupc, &beta, rtemp_loc, &iknsupc );
-#endif 
+#endif
 									#ifdef _OPENMP
-									#pragma omp simd							
-									#endif 
+									#pragma omp simd
+									#endif
 									for (i=0 ; i<iknsupc*nrhs ; i++){
 										z_copy(&x[ii+i],&rtemp_loc[i]);
 									}
-									
+
 								}else{
 #ifdef _CRAY
 									CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &iknsupc, &nrhs, &alpha,
 											lusup1, &nsupr1, &x[ii], &iknsupc);
 #elif defined (USE_VENDOR_BLAS)
-									ztrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, 
-											lusup1, &nsupr1, &x[ii], &iknsupc, 1, 1, 1, 1);		   
+									ztrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha,
+											lusup1, &nsupr1, &x[ii], &iknsupc, 1, 1, 1, 1);
 #else
-									ztrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, 
+									ztrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha,
 											lusup1, &nsupr1, &x[ii], &iknsupc);
-	  
+
 #endif
 								}
 								// for (i=0 ; i<iknsupc*nrhs ; i++){
@@ -722,14 +723,14 @@ void zlsum_fmod_inv
 								TOC(t2, t1);
 								stat[thread_id1]->utime[SOL_TRSM] += t2;
 
-#endif	
-								
+#endif
+
 								stat[thread_id1]->ops[SOLVE] += 4 * iknsupc * (iknsupc - 1) * nrhs
 								+ 10 * knsupc * nrhs; /* complex division */
-								
+
 #if ( DEBUGlevel>=2 )
 								printf("(%2d) Solve X[%2d]\n", iam, ik);
-													
+
 #endif
 
 								/*
@@ -749,28 +750,28 @@ void zlsum_fmod_inv
 								 */
 
 								// #ifdef _OPENMP
-								// #pragma	omp	task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) 	
+								// #pragma	omp	task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1)
 								// #endif
 								{
-					
+
 									zlsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, ik,
 											fmod, xsup,
 											grid, Llu, stat, leaf_send, nleaf_send ,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id1,num_thread);
-								}		   
+								}
 
 								// } /* if frecv[lk] == 0 */
 						} /* if iam == p */
-					} /* if fmod[lk] == 0 */				
+					} /* if fmod[lk] == 0 */
 				}
 
 			}
 		}
 
-		}else{ 
+		}else{
 
 #if ( PROFlevel>=1 )
 			TIC(t1);
-#endif	
+#endif
 
 #ifdef _CRAY
 			CGEMM( ftcs2, ftcs2, &m, &nrhs, &knsupc,
@@ -784,30 +785,30 @@ void zlsum_fmod_inv
 			zgemm_( "N", "N", &m, &nrhs, &knsupc,
 					&alpha, &lusup[luptr_tmp], &nsupr, xk,
 					&knsupc, &beta, rtemp_loc, &m );
-#endif   	
-			
+#endif
+
 			nbrow=0;
-			for (lb = 0; lb < nlb; ++lb){ 		
-				lptr1_tmp = lloc[lb+idx_i];		
+			for (lb = 0; lb < nlb; ++lb){
+				lptr1_tmp = lloc[lb+idx_i];
 				nbrow += lsub[lptr1_tmp+1];
-			}			
+			}
 			nbrow_ref=0;
-			for (lb = 0; lb < nlb; ++lb){ 		
-				lptr1_tmp = lloc[lb+idx_i];	
-				lptr= lptr1_tmp+2;	
+			for (lb = 0; lb < nlb; ++lb){
+				lptr1_tmp = lloc[lb+idx_i];
+				lptr= lptr1_tmp+2;
 				nbrow1 = lsub[lptr1_tmp+1];
 				ik = lsub[lptr1_tmp]; /* Global block number, row-wise. */
 				rel = xsup[ik]; /* Global row index of block ik. */
 
-				lk = LBi( ik, grid ); /* Local block number, row-wise. */	
+				lk = LBi( ik, grid ); /* Local block number, row-wise. */
 
 				iknsupc = SuperSize( ik );
 				il = LSUM_BLK( lk );
 
 				RHS_ITERATE(j)
 					#ifdef _OPENMP
-					#pragma omp simd							
-					#endif					
+					#pragma omp simd
+					#endif
 					for (i = 0; i < nbrow1; ++i) {
 						irow = lsub[lptr+i] - rel; /* Relative row. */
 
@@ -816,14 +817,14 @@ void zlsum_fmod_inv
 									  &rtemp_loc[nbrow_ref+i + j*nbrow]);
 					}
 				nbrow_ref+=nbrow1;
-			}			
-			
+			}
+
 			// TOC(t3, t1);
 
 #if ( PROFlevel>=1 )
 			TOC(t2, t1);
 			stat[thread_id]->utime[SOL_GEMM] += t2;
-#endif		
+#endif
 
 			for (lb=0;lb<nlb;lb++){
 				lk = lloc[lb+idx_n];
@@ -836,10 +837,10 @@ void zlsum_fmod_inv
 
 				if ( fmod_tmp==0 ) { /* Local accumulation done. */
 
-					lptr1_tmp = lloc[lb+idx_i];	
+					lptr1_tmp = lloc[lb+idx_i];
 
 					ik = lsub[lptr1_tmp]; /* Global block number, row-wise. */
-					lk = LBi( ik, grid ); /* Local block number, row-wise. */	
+					lk = LBi( ik, grid ); /* Local block number, row-wise. */
 
 					iknsupc = SuperSize( ik );
 					il = LSUM_BLK( lk );
@@ -848,43 +849,43 @@ void zlsum_fmod_inv
 					if ( iam != p ) {
 						for (ii=1;ii<num_thread;ii++)
 							#ifdef _OPENMP
-							#pragma omp simd							
+							#pragma omp simd
 							#endif
 							for (jj=0;jj<iknsupc*nrhs;jj++)
 								z_add(&lsum[il + jj ],
 									  &lsum[il + jj ],
 									  &lsum[il + jj + ii*sizelsum]);
-								
+
 #ifdef _OPENMP
 #pragma omp atomic capture
 #endif
 						nleaf_send_tmp = ++nleaf_send[0];
-						leaf_send[(nleaf_send_tmp-1)*aln_i] = -lk-1;						
+						leaf_send[(nleaf_send_tmp-1)*aln_i] = -lk-1;
 
 					} else { /* Diagonal process: X[i] += lsum[i]. */
 
 #if ( PROFlevel>=1 )
 						TIC(t1);
-#endif		
+#endif
 						for (ii=1;ii<num_thread;ii++)
 							#ifdef _OPENMP
-							#pragma omp simd							
+							#pragma omp simd
 							#endif
 							for (jj=0;jj<iknsupc*nrhs;jj++)
 								z_add(&lsum[il + jj ],
 									  &lsum[il + jj ],
 									  &lsum[il + jj + ii*sizelsum]);
-					
+
 						ii = X_BLK( lk );
 						RHS_ITERATE(j)
 							#ifdef _OPENMP
-							#pragma omp simd							
-							#endif	
-							for (i = 0; i < iknsupc; ++i)	
+							#pragma omp simd
+							#endif
+							for (i = 0; i < iknsupc; ++i)
 								z_add(&x[i + ii + j*iknsupc],
 									  &x[i + ii + j*iknsupc],
 									  &lsum[i + il + j*iknsupc] );
-								
+
 
 						lk = LBj( ik, grid );/* Local block number, column-wise. */
 						lsub1 = Llu->Lrowind_bc_ptr[lk];
@@ -907,34 +908,34 @@ void zlsum_fmod_inv
 									&iknsupc, &beta, rtemp_loc, &iknsupc );
 #endif
 							#ifdef _OPENMP
-							#pragma omp simd							
-							#endif   
+							#pragma omp simd
+							#endif
 							for (i=0 ; i<iknsupc*nrhs ; i++){
 								z_copy(&x[ii+i],&rtemp_loc[i]);
-							}		
+							}
 						}else{
 #ifdef _CRAY
 							CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &iknsupc, &nrhs, &alpha,
 									lusup1, &nsupr1, &x[ii], &iknsupc);
 #elif defined (USE_VENDOR_BLAS)
-							ztrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, 
-									lusup1, &nsupr1, &x[ii], &iknsupc, 1, 1, 1, 1);		   
+							ztrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha,
+									lusup1, &nsupr1, &x[ii], &iknsupc, 1, 1, 1, 1);
 #else
-							ztrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, 
+							ztrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha,
 									lusup1, &nsupr1, &x[ii], &iknsupc);
 #endif
 						}
-						
+
 							// for (i=0 ; i<iknsupc*nrhs ; i++){
 							// printf("x_lsum: %f %f\n",x[ii+i].r,x[ii+i].i);
 							// fflush(stdout);
 							// }
-						
+
 
 #if ( PROFlevel>=1 )
 						TOC(t2, t1);
 						stat[thread_id]->utime[SOL_TRSM] += t2;
-#endif	
+#endif
 
 						stat[thread_id]->ops[SOLVE] += 4 * iknsupc * (iknsupc - 1) * nrhs
 						+ 10 * knsupc * nrhs; /* complex division */
@@ -963,25 +964,25 @@ void zlsum_fmod_inv
 						 */
 
 						// #ifdef _OPENMP
-						// #pragma	omp	task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,send_req,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1) untied priority(1) 	
+						// #pragma	omp	task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1) untied priority(1)
 						// #endif
 
 						{
 							zlsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, ik,
 									fmod, xsup,
 									grid, Llu, stat, leaf_send, nleaf_send ,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id,num_thread);
-						}		   
+						}
 
 						// } /* if frecv[lk] == 0 */
 				} /* if iam == p */
-			} /* if fmod[lk] == 0 */				
+			} /* if fmod[lk] == 0 */
 		}
 		// }
 }
 
 	stat[thread_id]->ops[SOLVE] += 8 * m * nrhs * knsupc;
 
-	
+
 
 } /* if nlb>0*/
 } /* zLSUM_FMOD_INV */
@@ -1022,7 +1023,7 @@ void zlsum_fmod_inv_master
     doublecomplex alpha = {1.0, 0.0}, beta = {0.0, 0.0},malpha={-1.0, 0.0};
     doublecomplex *lusup, *lusup1;
     doublecomplex *dest;
-	doublecomplex *Linv;/* Inverse of diagonal block */    	
+	doublecomplex *Linv;/* Inverse of diagonal block */
 	int    iam, iknsupc, myrow, krow, nbrow, nbrow1, nbrow_ref, nsupr, nsupr1, p, pi, idx_r;
 	int_t  i, ii,jj, ik, il, ikcol, irow, j, lb, lk, rel, lib,lready;
 	int_t  *lsub, *lsub1, nlb1, lptr1, luptr1,*lloc;
@@ -1031,8 +1032,8 @@ void zlsum_fmod_inv_master
     int_t  **fsendx_plist = Llu->fsendx_plist;
 	int_t  luptr_tmp,luptr_tmp1,lptr1_tmp,maxrecvsz, idx_i, idx_v,idx_n,  idx_l, fmod_tmp, lbstart,lbend,nn,Nchunk,nlb_loc,remainder;
 	int thread_id1;
-	int m;	
-	flops_t ops_loc=0.0;    	
+	int m;
+	flops_t ops_loc=0.0;
     MPI_Status status;
     int test_flag;
 	yes_no_t done;
@@ -1040,12 +1041,12 @@ void zlsum_fmod_inv_master
 	RdTree  *LRtree_ptr = Llu->LRtree_ptr;
 	int_t* idx_lsum,idx_lsum1;
 	doublecomplex *rtemp_loc;
-	int_t ldalsum;	
+	int_t ldalsum;
 	int_t nleaf_send_tmp;
 	int_t lptr;      /* Starting position in lsub[*].                      */
 	int_t luptr;     /* Starting position in lusup[*].                     */
-	int_t iword = sizeof(int_t);	
-	int_t dword = sizeof (double);		
+	int_t iword = sizeof(int_t);
+	int_t dword = sizeof (double);
 	int_t aln_d,aln_i;
 	aln_d = ceil(CACHELINE/(double)dword);
 	aln_i = ceil(CACHELINE/(double)iword);
@@ -1057,7 +1058,7 @@ void zlsum_fmod_inv_master
 	// #if ( PROFlevel>=1 )
 	double t1, t2, t3, t4;
 	float msg_vol = 0, msg_cnt = 0;
-	// #endif 
+	// #endif
 
 	if(nlb>0){
 
@@ -1066,12 +1067,12 @@ void zlsum_fmod_inv_master
 		lk = LBj( k, grid ); /* Local block number, column-wise. */
 
 		// printf("ya1 %5d k %5d lk %5d\n",thread_id,k,lk);
-		// fflush(stdout);	
+		// fflush(stdout);
 
 		lsub = Llu->Lrowind_bc_ptr[lk];
 
 		// printf("ya2 %5d k %5d lk %5d\n",thread_id,k,lk);
-		// fflush(stdout);	
+		// fflush(stdout);
 
 		lusup = Llu->Lnzval_bc_ptr[lk];
 		lloc = Llu->Lindval_loc_bc_ptr[lk];
@@ -1098,8 +1099,8 @@ void zlsum_fmod_inv_master
 		}
 
 		assert(m>0);
-				
-		if(m>4*maxsuper || nrhs>10){ 
+
+		if(m>4*maxsuper || nrhs>10){
 			// if(m<1){
 			// TIC(t1);
 			Nchunk=num_thread;
@@ -1108,14 +1109,14 @@ void zlsum_fmod_inv_master
 
 #ifdef _OPENMP
 #pragma	omp	taskloop private (lptr1,luptr1,nlb1,thread_id1,lsub1,lusup1,nsupr1,Linv,nn,lbstart,lbend,luptr_tmp1,nbrow,lb,lptr1_tmp,rtemp_loc,nbrow_ref,lptr,nbrow1,ik,rel,lk,iknsupc,il,i,irow,fmod_tmp,ikcol,p,ii,jj,t1,t2,j) untied
-#endif	
+#endif
 			for (nn=0;nn<Nchunk;++nn){
 
-#ifdef _OPENMP				 
+#ifdef _OPENMP
 				thread_id1 = omp_get_thread_num ();
 #else
 				thread_id1 = 0;
-#endif		
+#endif
 				rtemp_loc = &rtemp[sizertemp* thread_id1];
 
 				if(nn<remainder){
@@ -1130,14 +1131,14 @@ void zlsum_fmod_inv_master
 
 #if ( PROFlevel>=1 )
 					TIC(t1);
-#endif				
+#endif
 					luptr_tmp1 = lloc[lbstart+idx_v];
 					nbrow=0;
-					for (lb = lbstart; lb < lbend; ++lb){ 		
-						lptr1_tmp = lloc[lb+idx_i];		
+					for (lb = lbstart; lb < lbend; ++lb){
+						lptr1_tmp = lloc[lb+idx_i];
 						nbrow += lsub[lptr1_tmp+1];
 					}
-					
+
 				#ifdef _CRAY
 					CGEMM( ftcs2, ftcs2, &nbrow, &nrhs, &knsupc,
 						  &alpha, &lusup[luptr_tmp1], &nsupr, xk,
@@ -1153,22 +1154,22 @@ void zlsum_fmod_inv_master
 				#endif
 
 					nbrow_ref=0;
-					for (lb = lbstart; lb < lbend; ++lb){ 		
-						lptr1_tmp = lloc[lb+idx_i];	
-						lptr= lptr1_tmp+2;	
+					for (lb = lbstart; lb < lbend; ++lb){
+						lptr1_tmp = lloc[lb+idx_i];
+						lptr= lptr1_tmp+2;
 						nbrow1 = lsub[lptr1_tmp+1];
 						ik = lsub[lptr1_tmp]; /* Global block number, row-wise. */
 						rel = xsup[ik]; /* Global row index of block ik. */
-	
-						lk = LBi( ik, grid ); /* Local block number, row-wise. */	
+
+						lk = LBi( ik, grid ); /* Local block number, row-wise. */
 
 						iknsupc = SuperSize( ik );
 						il = LSUM_BLK( lk );
 
-						RHS_ITERATE(j)	
-							#ifdef _OPENMP	
+						RHS_ITERATE(j)
+							#ifdef _OPENMP
 								#pragma omp simd lastprivate(irow)
-							#endif							
+							#endif
 							for (i = 0; i < nbrow1; ++i) {
 								irow = lsub[lptr+i] - rel; /* Relative row. */
 								z_sub(&lsum[il+irow + j*iknsupc],
@@ -1181,15 +1182,15 @@ void zlsum_fmod_inv_master
 #if ( PROFlevel>=1 )
 					TOC(t2, t1);
 					stat[thread_id1]->utime[SOL_GEMM] += t2;
-#endif	
+#endif
 			}
 		}
 
-		}else{ 
+		}else{
 
 #if ( PROFlevel>=1 )
 			TIC(t1);
-#endif	
+#endif
 
 #ifdef _CRAY
 			CGEMM( ftcs2, ftcs2, &m, &nrhs, &knsupc,
@@ -1203,30 +1204,30 @@ void zlsum_fmod_inv_master
 			zgemm_( "N", "N", &m, &nrhs, &knsupc,
 					&alpha, &lusup[luptr_tmp], &nsupr, xk,
 					&knsupc, &beta, rtemp_loc, &m );
-#endif   	
-			
+#endif
+
 			nbrow=0;
-			for (lb = 0; lb < nlb; ++lb){ 		
-				lptr1_tmp = lloc[lb+idx_i];		
+			for (lb = 0; lb < nlb; ++lb){
+				lptr1_tmp = lloc[lb+idx_i];
 				nbrow += lsub[lptr1_tmp+1];
-			}			
+			}
 			nbrow_ref=0;
-			for (lb = 0; lb < nlb; ++lb){ 		
-				lptr1_tmp = lloc[lb+idx_i];	
-				lptr= lptr1_tmp+2;	
+			for (lb = 0; lb < nlb; ++lb){
+				lptr1_tmp = lloc[lb+idx_i];
+				lptr= lptr1_tmp+2;
 				nbrow1 = lsub[lptr1_tmp+1];
 				ik = lsub[lptr1_tmp]; /* Global block number, row-wise. */
 				rel = xsup[ik]; /* Global row index of block ik. */
 
-				lk = LBi( ik, grid ); /* Local block number, row-wise. */	
+				lk = LBi( ik, grid ); /* Local block number, row-wise. */
 
 				iknsupc = SuperSize( ik );
 				il = LSUM_BLK( lk );
 
 				RHS_ITERATE(j)
-					#ifdef _OPENMP	
+					#ifdef _OPENMP
 						#pragma omp simd lastprivate(irow)
-					#endif					
+					#endif
 					for (i = 0; i < nbrow1; ++i) {
 						irow = lsub[lptr+i] - rel; /* Relative row. */
 
@@ -1235,12 +1236,12 @@ void zlsum_fmod_inv_master
 									  &rtemp_loc[nbrow_ref+i + j*nbrow]);
 					}
 				nbrow_ref+=nbrow1;
-			}			
+			}
 #if ( PROFlevel>=1 )
 			TOC(t2, t1);
 			stat[thread_id]->utime[SOL_GEMM] += t2;
-#endif	
-		}	
+#endif
+		}
 			// TOC(t3, t1);
 		rtemp_loc = &rtemp[sizertemp* thread_id];
 
@@ -1257,11 +1258,11 @@ void zlsum_fmod_inv_master
 				// --fmod[lk];
 
 
-				lptr1_tmp = lloc[lb+idx_i];	
+				lptr1_tmp = lloc[lb+idx_i];
 				// luptr_tmp = lloc[lb+idx_v];
 
 				ik = lsub[lptr1_tmp]; /* Global block number, row-wise. */
-				lk = LBi( ik, grid ); /* Local block number, row-wise. */	
+				lk = LBi( ik, grid ); /* Local block number, row-wise. */
 
 				iknsupc = SuperSize( ik );
 				il = LSUM_BLK( lk );
@@ -1276,9 +1277,9 @@ void zlsum_fmod_inv_master
 
 					for (ii=1;ii<num_thread;ii++)
 						// if(ii!=thread_id)
-						#ifdef _OPENMP	
+						#ifdef _OPENMP
 							#pragma omp simd
-						#endif							
+						#endif
 						for (jj=0;jj<iknsupc*nrhs;jj++)
 							z_add(&lsum[il + jj ],
 								  &lsum[il + jj ],
@@ -1296,12 +1297,12 @@ void zlsum_fmod_inv_master
 
 #if ( PROFlevel>=1 )
 					TIC(t1);
-#endif		
+#endif
 					for (ii=1;ii<num_thread;ii++)
 						// if(ii!=thread_id)
-						#ifdef _OPENMP	
+						#ifdef _OPENMP
 							#pragma omp simd
-						#endif						
+						#endif
 						for (jj=0;jj<iknsupc*nrhs;jj++)
 							z_add(&lsum[il + jj ],
 								  &lsum[il + jj ],
@@ -1310,10 +1311,10 @@ void zlsum_fmod_inv_master
 					ii = X_BLK( lk );
 					// for (jj=0;jj<num_thread;jj++)
 					RHS_ITERATE(j)
-						#ifdef _OPENMP	
-							#pragma omp simd 
-						#endif						
-						for (i = 0; i < iknsupc; ++i)	
+						#ifdef _OPENMP
+							#pragma omp simd
+						#endif
+						for (i = 0; i < iknsupc; ++i)
 							z_add(&x[i + ii + j*iknsupc],
 								  &x[i + ii + j*iknsupc],
 								  &lsum[i + il + j*iknsupc] );
@@ -1339,21 +1340,21 @@ void zlsum_fmod_inv_master
 								&alpha, Linv, &iknsupc, &x[ii],
 								&iknsupc, &beta, rtemp_loc, &iknsupc );
 #endif
-						#ifdef _OPENMP	
-							#pragma omp simd 
-						#endif	   
+						#ifdef _OPENMP
+							#pragma omp simd
+						#endif
 						for (i=0 ; i<iknsupc*nrhs ; i++){
 										z_copy(&x[ii+i],&rtemp_loc[i]);
-						}		
+						}
 					}else{
 #ifdef _CRAY
 						CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &iknsupc, &nrhs, &alpha,
 								lusup1, &nsupr1, &x[ii], &iknsupc);
 #elif defined (USE_VENDOR_BLAS)
-						ztrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, 
-								lusup1, &nsupr1, &x[ii], &iknsupc, 1, 1, 1, 1);		   
+						ztrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha,
+								lusup1, &nsupr1, &x[ii], &iknsupc, 1, 1, 1, 1);
 #else
-						ztrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, 
+						ztrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha,
 								lusup1, &nsupr1, &x[ii], &iknsupc);
 #endif
 					}
@@ -1366,11 +1367,11 @@ void zlsum_fmod_inv_master
 					TOC(t2, t1);
 					stat[thread_id]->utime[SOL_TRSM] += t2;
 
-#endif	
+#endif
 
 					stat[thread_id]->ops[SOLVE] += 4 * iknsupc * (iknsupc - 1) * nrhs
 					+ 10 * knsupc * nrhs; /* complex division */
-					
+
 #if ( DEBUGlevel>=2 )
 					printf("(%2d) Solve X[%2d]\n", iam, ik);
 #endif
@@ -1387,7 +1388,7 @@ void zlsum_fmod_inv_master
 					 */
 
 					// #ifdef _OPENMP
-					// #pragma	omp	task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,send_req,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) 	
+					// #pragma	omp	task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1)
 					// #endif
 					{
 						nlb1 = lsub1[0] - 1;
@@ -1396,11 +1397,11 @@ void zlsum_fmod_inv_master
 						zlsum_fmod_inv_master(lsum, x, &x[ii], rtemp, nrhs, iknsupc, ik,
 								fmod, nlb1, xsup,
 								grid, Llu, stat,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id,num_thread);
-					}		   
+					}
 
 					// } /* if frecv[lk] == 0 */
 				} /* if iam == p */
-			} /* if fmod[lk] == 0 */				
+			} /* if fmod[lk] == 0 */
 		}
 		// }
 		stat[thread_id]->ops[SOLVE] += 8 * m * nrhs * knsupc;
@@ -1421,16 +1422,14 @@ void zlsum_bmod_inv
  int_t  k,            /* The k-th component of X.                       */
  int_t  *bmod,        /* Modification count for L-solve.                */
  int_t  *Urbs,        /* Number of row blocks in each block column of U.*/
- int_t  *Urbs2,
  Ucb_indptr_t **Ucb_indptr,/* Vertical linked list pointing to Uindex[].*/
  int_t  **Ucb_valptr, /* Vertical linked list pointing to Unzval[].     */
  int_t  *xsup,
  gridinfo_t *grid,
  LocalLU_t *Llu,
- MPI_Request send_req[], /* input/output */
  SuperLUStat_t **stat,
- int_t* root_send, 
- int_t* nroot_send, 
+ int_t* root_send,
+ int_t* nroot_send,
  int_t sizelsum,
  int_t sizertemp,
  int thread_id,
@@ -1454,34 +1453,34 @@ void zlsum_bmod_inv
 	int_t  *brecv = Llu->brecv;
 	int_t  **bsendx_plist = Llu->bsendx_plist;
 	BcTree  *UBtree_ptr = Llu->UBtree_ptr;
-	RdTree  *URtree_ptr = Llu->URtree_ptr;	
+	RdTree  *URtree_ptr = Llu->URtree_ptr;
 	MPI_Status status;
 	int test_flag;
 	int_t bmod_tmp;
 	int thread_id1;
 	doublecomplex *rtemp_loc;
-	int_t nroot_send_tmp;	
-	doublecomplex *Uinv;/* Inverse of diagonal block */    
+	int_t nroot_send_tmp;
+	doublecomplex *Uinv;/* Inverse of diagonal block */
 	doublecomplex temp;
 	double t1, t2;
 	float msg_vol = 0, msg_cnt = 0;
-	int_t Nchunk, nub_loc,remainder,nn,lbstart,lbend;  
-	int_t iword = sizeof(int_t);	
-	int_t dword = sizeof (double);		
+	int_t Nchunk, nub_loc,remainder,nn,lbstart,lbend;
+	int_t iword = sizeof(int_t);
+	int_t dword = sizeof (double);
 	int_t aln_d,aln_i;
 	aln_d = ceil(CACHELINE/(double)dword);
-	aln_i = ceil(CACHELINE/(double)iword);	
+	aln_i = ceil(CACHELINE/(double)iword);
+
 
-	
 	iam = grid->iam;
 	myrow = MYROW( iam, grid );
 	knsupc = SuperSize( k );
 	lk = LBj( k, grid ); /* Local block number, column-wise. */
-	nub = Urbs[lk];      /* Number of U blocks in block column lk */	
-	
+	nub = Urbs[lk];      /* Number of U blocks in block column lk */
+
 	if(Llu->Unnz[lk]>knsupc*64 || nub>16){
 	// if(nub>num_thread){
-	// if(nub>16){ 
+	// if(nub>16){
 	// // // // if(Urbs2[lk]>num_thread){
 	// if(Urbs2[lk]>0){
 		Nchunk=SUPERLU_MIN(num_thread,nub);
@@ -1489,15 +1488,15 @@ void zlsum_bmod_inv
 		remainder = nub % Nchunk;
 		// printf("Unnz: %5d nub: %5d knsupc: %5d\n",Llu->Unnz[lk],nub,knsupc);
 #ifdef _OPENMP
-#pragma	omp	taskloop firstprivate (send_req,stat) private (thread_id1,Uinv,nn,lbstart,lbend,ub,temp,rtemp_loc,ik,lk1,gik,gikcol,usub,uval,lsub,lusup,iknsupc,il,i,irow,bmod_tmp,p,ii,jj,t1,t2,j,ikfrow,iklrow,dest,y,uptr,fnz,nsupr) untied nogroup	
-#endif	
+#pragma	omp	taskloop firstprivate (stat) private (thread_id1,Uinv,nn,lbstart,lbend,ub,temp,rtemp_loc,ik,lk1,gik,gikcol,usub,uval,lsub,lusup,iknsupc,il,i,irow,bmod_tmp,p,ii,jj,t1,t2,j,ikfrow,iklrow,dest,y,uptr,fnz,nsupr) untied nogroup
+#endif
 		for (nn=0;nn<Nchunk;++nn){
 
-#ifdef _OPENMP				 
+#ifdef _OPENMP
 			thread_id1 = omp_get_thread_num ();
 #else
 			thread_id1 = 0;
-#endif		
+#endif
 			rtemp_loc = &rtemp[sizertemp* thread_id1];
 
 			if(nn<remainder){
@@ -1506,7 +1505,7 @@ void zlsum_bmod_inv
 			}else{
 				lbstart = remainder+nn*nub_loc;
 				lbend = remainder + (nn+1)*nub_loc;
-			}			
+			}
 			for (ub = lbstart; ub < lbend; ++ub){
 				ik = Ucb_indptr[lk][ub].lbnum; /* Local block number, row-wise. */
 				usub = Llu->Ufstnz_br_ptr[ik];
@@ -1521,8 +1520,8 @@ void zlsum_bmod_inv
 
 #if ( PROFlevel>=1 )
 				TIC(t1);
-#endif					
-				
+#endif
+
 				RHS_ITERATE(j) {
 					dest = &lsum[il + j*iknsupc+sizelsum*thread_id1];
 					y = &xk[j*knsupc];
@@ -1532,8 +1531,8 @@ void zlsum_bmod_inv
 						if ( fnz < iklrow ) { /* Nonzero segment. */
 							/* AXPY */
 							#ifdef _OPENMP
-							#pragma omp simd							
-							#endif	
+							#pragma omp simd
+							#endif
 							for (irow = fnz; irow < iklrow; ++irow)
 								{
 								zz_mult(&temp, &uval[uptr], &y[jj]);
@@ -1542,22 +1541,22 @@ void zlsum_bmod_inv
 								++uptr;
 								}
 								stat[thread_id1]->ops[SOLVE] += 8 * (iklrow - fnz);
-							
+
 						}
 					} /* for jj ... */
 				}
-				
+
 #if ( PROFlevel>=1 )
 				TOC(t2, t1);
 				stat[thread_id1]->utime[SOL_GEMM] += t2;
-#endif					
-				
+#endif
+
 
 		#ifdef _OPENMP
 		#pragma omp atomic capture
-		#endif		
+		#endif
 				bmod_tmp=--bmod[ik*aln_i];
-				
+
 				if ( bmod_tmp == 0 ) { /* Local accumulation done. */
 					gikcol = PCOL( gik, grid );
 					p = PNUM( myrow, gikcol, grid );
@@ -1565,18 +1564,18 @@ void zlsum_bmod_inv
 						for (ii=1;ii<num_thread;ii++)
 							// if(ii!=thread_id1)
 							#ifdef _OPENMP
-							#pragma omp simd							
-							#endif								
+							#pragma omp simd
+							#endif
 							for (jj=0;jj<iknsupc*nrhs;jj++)
 								z_add(&lsum[il + jj ],
 									  &lsum[il + jj ],
 									  &lsum[il + jj + ii*sizelsum]);
-								
+
 #ifdef _OPENMP
 #pragma omp atomic capture
 #endif
 						nroot_send_tmp = ++nroot_send[0];
-						root_send[(nroot_send_tmp-1)*aln_i] = -ik-1;						
+						root_send[(nroot_send_tmp-1)*aln_i] = -ik-1;
 						// RdTree_forwardMessageSimple(URtree_ptr[ik],&lsum[il - LSUM_H ],'z');
 
 		#if ( DEBUGlevel>=2 )
@@ -1584,16 +1583,16 @@ void zlsum_bmod_inv
 								iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p);
 		#endif
 					} else { /* Diagonal process: X[i] += lsum[i]. */
-						
+
 #if ( PROFlevel>=1 )
 						TIC(t1);
-#endif								
-						
+#endif
+
 						for (ii=1;ii<num_thread;ii++)
 							// if(ii!=thread_id1)
 							#ifdef _OPENMP
-							#pragma omp simd							
-							#endif								
+							#pragma omp simd
+							#endif
 							for (jj=0;jj<iknsupc*nrhs;jj++)
 								z_add(&lsum[il + jj ],
 									  &lsum[il + jj ],
@@ -1601,16 +1600,16 @@ void zlsum_bmod_inv
 
 						ii = X_BLK( ik );
 						dest = &x[ii];
-								
+
 						RHS_ITERATE(j)
 							#ifdef _OPENMP
-							#pragma omp simd							
-							#endif							
+							#pragma omp simd
+							#endif
 							for (i = 0; i < iknsupc; ++i)
 								z_add(&dest[i + j*iknsupc],
 									  &dest[i + j*iknsupc],
 									  &lsum[i + il + j*iknsupc]);
-								
+
 						// if ( !brecv[ik] ) { /* Becomes a leaf node. */
 							// bmod[ik] = -1; /* Do not solve X[k] in the future. */
 							lk1 = LBj( gik, grid ); /* Local block number. */
@@ -1619,7 +1618,7 @@ void zlsum_bmod_inv
 							nsupr = lsub[1];
 
 							if(Llu->inv == 1){
-								Uinv = Llu->Uinv_bc_ptr[lk1];  
+								Uinv = Llu->Uinv_bc_ptr[lk1];
 		#ifdef _CRAY
 								CGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc,
 										&alpha, Uinv, &iknsupc, &x[ii],
@@ -1634,20 +1633,20 @@ void zlsum_bmod_inv
 										&iknsupc, &beta, rtemp_loc, &iknsupc );
 		#endif
 								#ifdef _OPENMP
-								#pragma omp simd							
-								#endif			
+								#pragma omp simd
+								#endif
 								for (i=0 ; i<iknsupc*nrhs ; i++){
 									z_copy(&x[ii+i],&rtemp_loc[i]);
-								}		
+								}
 							}else{
 		#ifdef _CRAY
 								CTRSM(ftcs1, ftcs3, ftcs2, ftcs2, &iknsupc, &nrhs, &alpha,
 										lusup, &nsupr, &x[ii], &iknsupc);
 		#elif defined (USE_VENDOR_BLAS)
-								ztrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha, 
-										lusup, &nsupr, &x[ii], &iknsupc, 1, 1, 1, 1);	
+								ztrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha,
+										lusup, &nsupr, &x[ii], &iknsupc, 1, 1, 1, 1);
 		#else
-								ztrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha, 
+								ztrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha,
 										lusup, &nsupr, &x[ii], &iknsupc);
 		#endif
 							}
@@ -1655,14 +1654,14 @@ void zlsum_bmod_inv
 								// printf("x_usum: %f %f\n",x[ii+i].r,x[ii+i].i);
 								// fflush(stdout);
 								// }
-					
+
 		#if ( PROFlevel>=1 )
 							TOC(t2, t1);
 							stat[thread_id1]->utime[SOL_TRSM] += t2;
-		#endif		
+		#endif
 							stat[thread_id1]->ops[SOLVE] += 4 * iknsupc * (iknsupc + 1) * nrhs
 							+ 10 * knsupc * nrhs; /* complex division */
-							
+
 		#if ( DEBUGlevel>=2 )
 							printf("(%2d) Solve X[%2d]\n", iam, gik);
 		#endif
@@ -1675,35 +1674,35 @@ void zlsum_bmod_inv
 								// printf("xre: %f\n",x[ii+i]);
 								// fflush(stdout);
 							// }
-							if(UBtree_ptr[lk1]!=NULL){							
+							if(UBtree_ptr[lk1]!=NULL){
 #ifdef _OPENMP
 #pragma omp atomic capture
 #endif
 							nroot_send_tmp = ++nroot_send[0];
-							root_send[(nroot_send_tmp-1)*aln_i] = lk1;						
-							// BcTree_forwardMessageSimple(UBtree_ptr[lk1],&x[ii - XK_H],'z'); 
-							} 
+							root_send[(nroot_send_tmp-1)*aln_i] = lk1;
+							// BcTree_forwardMessageSimple(UBtree_ptr[lk1],&x[ii - XK_H],'z');
+							}
 
 							/*
 							 * Perform local block modifications.
 							 */
 							if ( Urbs[lk1] ){
 								// #ifdef _OPENMP
-								// #pragma	omp	task firstprivate (Ucb_indptr,Ucb_valptr,Llu,sizelsum,ii,gik,x,rtemp,bmod,Urbs,Urbs2,lsum,stat,nrhs,grid,xsup) untied 
+								// #pragma	omp	task firstprivate (Ucb_indptr,Ucb_valptr,Llu,sizelsum,ii,gik,x,rtemp,bmod,Urbs,lsum,stat,nrhs,grid,xsup) untied
 								// #endif
 								{
-								zlsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs,Urbs2,
+								zlsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs,
 										Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
-										send_req, stat, root_send, nroot_send, sizelsum,sizertemp,thread_id1,num_thread);
+										stat, root_send, nroot_send, sizelsum,sizertemp,thread_id1,num_thread);
 								}
 							}
 						// } /* if brecv[ik] == 0 */
 					}
-				} /* if bmod[ik] == 0 */				
-			}				
+				} /* if bmod[ik] == 0 */
+			}
 		}
 
-	} else { 
+	} else {
 
 		rtemp_loc = &rtemp[sizertemp* thread_id];
 
@@ -1721,7 +1720,7 @@ void zlsum_bmod_inv
 
 #if ( PROFlevel>=1 )
 		TIC(t1);
-#endif					
+#endif
 			RHS_ITERATE(j) {
 				dest = &lsum[il + j*iknsupc+sizelsum*thread_id];
 				y = &xk[j*knsupc];
@@ -1731,10 +1730,10 @@ void zlsum_bmod_inv
 					if ( fnz < iklrow ) { /* Nonzero segment. */
 						/* AXPY */
 						#ifdef _OPENMP
-						#pragma omp simd							
-						#endif							
+						#pragma omp simd
+						#endif
 						for (irow = fnz; irow < iklrow; ++irow)
-						
+
 								{
 								zz_mult(&temp, &uval[uptr], &y[jj]);
 								z_sub(&dest[irow - ikfrow], &dest[irow - ikfrow],
@@ -1749,11 +1748,11 @@ void zlsum_bmod_inv
 #if ( PROFlevel>=1 )
 		TOC(t2, t1);
 		stat[thread_id]->utime[SOL_GEMM] += t2;
-#endif				
-			
+#endif
+
 	#ifdef _OPENMP
 	#pragma omp atomic capture
-	#endif		
+	#endif
 			bmod_tmp=--bmod[ik*aln_i];
 
 			if ( bmod_tmp == 0 ) { /* Local accumulation done. */
@@ -1763,9 +1762,9 @@ void zlsum_bmod_inv
 					for (ii=1;ii<num_thread;ii++)
 						// if(ii!=thread_id)
 						#ifdef _OPENMP
-						#pragma omp simd							
-						#endif						
-						for (jj=0;jj<iknsupc*nrhs;jj++)		
+						#pragma omp simd
+						#endif
+						for (jj=0;jj<iknsupc*nrhs;jj++)
 							z_add(&lsum[il + jj ],
 								  &lsum[il + jj ],
 								  &lsum[il + jj + ii*sizelsum]);
@@ -1773,7 +1772,7 @@ void zlsum_bmod_inv
 #pragma omp atomic capture
 #endif
 					nroot_send_tmp = ++nroot_send[0];
-					root_send[(nroot_send_tmp-1)*aln_i] = -ik-1;					
+					root_send[(nroot_send_tmp-1)*aln_i] = -ik-1;
 					// RdTree_forwardMessageSimple(URtree_ptr[ik],&lsum[il - LSUM_H ],'z');
 
 	#if ( DEBUGlevel>=2 )
@@ -1781,16 +1780,16 @@ void zlsum_bmod_inv
 							iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p);
 	#endif
 				} else { /* Diagonal process: X[i] += lsum[i]. */
-					
+
 #if ( PROFlevel>=1 )
 					TIC(t1);
-#endif							
-					
+#endif
+
 					for (ii=1;ii<num_thread;ii++)
 						// if(ii!=thread_id)
 						#ifdef _OPENMP
-						#pragma omp simd							
-						#endif						
+						#pragma omp simd
+						#endif
 						for (jj=0;jj<iknsupc*nrhs;jj++)
 								z_add(&lsum[il + jj ],
 									  &lsum[il + jj ],
@@ -1798,16 +1797,16 @@ void zlsum_bmod_inv
 
 					ii = X_BLK( ik );
 					dest = &x[ii];
-							
+
 					RHS_ITERATE(j)
 						#ifdef _OPENMP
-						#pragma omp simd							
-						#endif					
+						#pragma omp simd
+						#endif
 						for (i = 0; i < iknsupc; ++i)
 							z_add(&dest[i + j*iknsupc],
 								  &dest[i + j*iknsupc],
 								  &lsum[i + il + j*iknsupc]);
-					
+
 					// if ( !brecv[ik] ) { /* Becomes a leaf node. */
 						// bmod[ik] = -1; /* Do not solve X[k] in the future. */
 						lk1 = LBj( gik, grid ); /* Local block number. */
@@ -1816,7 +1815,7 @@ void zlsum_bmod_inv
 						nsupr = lsub[1];
 
 						if(Llu->inv == 1){
-							Uinv = Llu->Uinv_bc_ptr[lk1];  
+							Uinv = Llu->Uinv_bc_ptr[lk1];
 	#ifdef _CRAY
 							CGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc,
 									&alpha, Uinv, &iknsupc, &x[ii],
@@ -1829,30 +1828,30 @@ void zlsum_bmod_inv
 							zgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc,
 									&alpha, Uinv, &iknsupc, &x[ii],
 									&iknsupc, &beta, rtemp_loc, &iknsupc );
-	#endif	
+	#endif
 							#ifdef _OPENMP
-							#pragma omp simd							
-							#endif	
+							#pragma omp simd
+							#endif
 							for (i=0 ; i<iknsupc*nrhs ; i++){
 								z_copy(&x[ii+i],&rtemp_loc[i]);
-							}		
+							}
 						}else{
 	#ifdef _CRAY
 							CTRSM(ftcs1, ftcs3, ftcs2, ftcs2, &iknsupc, &nrhs, &alpha,
 									lusup, &nsupr, &x[ii], &iknsupc);
 	#elif defined (USE_VENDOR_BLAS)
-							ztrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha, 
-									lusup, &nsupr, &x[ii], &iknsupc, 1, 1, 1, 1);	
+							ztrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha,
+									lusup, &nsupr, &x[ii], &iknsupc, 1, 1, 1, 1);
 	#else
-							ztrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha, 
+							ztrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha,
 									lusup, &nsupr, &x[ii], &iknsupc);
 	#endif
 						}
-				
+
 	#if ( PROFlevel>=1 )
 						TOC(t2, t1);
 						stat[thread_id]->utime[SOL_TRSM] += t2;
-	#endif	
+	#endif
 						stat[thread_id]->ops[SOLVE] += 4 * iknsupc * (iknsupc + 1) * nrhs
 						+ 10 * knsupc * nrhs; /* complex division */
 	#if ( DEBUGlevel>=2 )
@@ -1872,28 +1871,28 @@ void zlsum_bmod_inv
 #pragma omp atomic capture
 #endif
 						nroot_send_tmp = ++nroot_send[0];
-						root_send[(nroot_send_tmp-1)*aln_i] = lk1;						
-						// BcTree_forwardMessageSimple(UBtree_ptr[lk1],&x[ii - XK_H],'z'); 
-						} 
+						root_send[(nroot_send_tmp-1)*aln_i] = lk1;
+						// BcTree_forwardMessageSimple(UBtree_ptr[lk1],&x[ii - XK_H],'z');
+						}
 
 						/*
 						 * Perform local block modifications.
 						 */
 						if ( Urbs[lk1] )
-						
+
 							// if(Urbs[lk1]>16){
 							// #ifdef _OPENMP
-							// #pragma	omp	task firstprivate (Ucb_indptr,Ucb_valptr,Llu,sizelsum,ii,gik,x,rtemp,bmod,Urbs,Urbs2,lsum,stat,nrhs,grid,xsup) untied 
-							// #endif						
-							// 	zlsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs,Urbs2,
+							// #pragma	omp	task firstprivate (Ucb_indptr,Ucb_valptr,Llu,sizelsum,ii,gik,x,rtemp,bmod,Urbs,lsum,stat,nrhs,grid,xsup) untied
+							// #endif
+							// 	zlsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs,
 									//	Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
-									//	send_req, stat, root_send, nroot_send, sizelsum,sizertemp);
+									//	stat, root_send, nroot_send, sizelsum,sizertemp);
 							//}else{
-								zlsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs,Urbs2,
+								zlsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs,
 										Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
-										send_req, stat, root_send, nroot_send, sizelsum,sizertemp,thread_id,num_thread);					
-							//}		
-									
+										stat, root_send, nroot_send, sizelsum,sizertemp,thread_id,num_thread);
+							//}
+
 					// } /* if brecv[ik] == 0 */
 				}
 			} /* if bmod[ik] == 0 */
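The diagonal-block update retained in the hunk above has two paths: when Llu->inv == 1 the solver multiplies by a precomputed inverse of the diagonal block with a GEMM and copies the scratch result back, otherwise it solves the triangular system in place with a TRSM. A condensed sketch of that branch (declarations such as Uinv, lusup, rtemp_loc, x, ii, lk1, iknsupc, nrhs, nsupr, alpha and beta are assumed from the surrounding routine; only the generic BLAS variant is shown):

if (Llu->inv == 1) {
    /* X(ii) := Uinv * X(ii): dense GEMM into scratch, then copy back. */
    Uinv = Llu->Uinv_bc_ptr[lk1];
    zgemm_("N", "N", &iknsupc, &nrhs, &iknsupc,
           &alpha, Uinv, &iknsupc, &x[ii], &iknsupc,
           &beta, rtemp_loc, &iknsupc);
    for (i = 0; i < iknsupc * nrhs; ++i)
        z_copy(&x[ii + i], &rtemp_loc[i]);
} else {
    /* In-place solve with the upper-triangular diagonal block. */
    ztrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha,
           lusup, &nsupr, &x[ii], &iknsupc);
}

The usual motivation for the inverse path is that a GEMM with the precomputed inverse vectorizes and threads better than a small TRSM when nrhs is large.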
@@ -1917,13 +1916,11 @@ void zlsum_bmod_inv_master
  int_t  k,            /* The k-th component of X.                       */
  int_t  *bmod,        /* Modification count for L-solve.                */
  int_t  *Urbs,        /* Number of row blocks in each block column of U.*/
- int_t  *Urbs2,
  Ucb_indptr_t **Ucb_indptr,/* Vertical linked list pointing to Uindex[].*/
  int_t  **Ucb_valptr, /* Vertical linked list pointing to Unzval[].     */
  int_t  *xsup,
  gridinfo_t *grid,
  LocalLU_t *Llu,
- MPI_Request send_req[], /* input/output */
  SuperLUStat_t **stat,
  int_t sizelsum,
  int_t sizertemp,
@@ -1948,39 +1945,37 @@ void zlsum_bmod_inv_master
 	int_t  *brecv = Llu->brecv;
 	int_t  **bsendx_plist = Llu->bsendx_plist;
 	BcTree  *UBtree_ptr = Llu->UBtree_ptr;
-	RdTree  *URtree_ptr = Llu->URtree_ptr;	
+	RdTree  *URtree_ptr = Llu->URtree_ptr;
 	MPI_Status status;
 	int test_flag;
 	int_t bmod_tmp;
 	int thread_id1;
 	doublecomplex *rtemp_loc;
-	doublecomplex temp;	
-	doublecomplex *Uinv;/* Inverse of diagonal block */    
+	doublecomplex temp;
+	doublecomplex *Uinv;/* Inverse of diagonal block */
 
 	double t1, t2;
 	float msg_vol = 0, msg_cnt = 0;
-	int_t Nchunk, nub_loc,remainder,nn,lbstart,lbend; 
-	int_t iword = sizeof(int_t);	
-	int_t dword = sizeof (double);		
+	int_t Nchunk, nub_loc,remainder,nn,lbstart,lbend;
+	int_t iword = sizeof(int_t);
+	int_t dword = sizeof (double);
 	int_t aln_d,aln_i;
 	aln_d = ceil(CACHELINE/(double)dword);
 	aln_i = ceil(CACHELINE/(double)iword);
-		
-	
+
+
 	rtemp_loc = &rtemp[sizertemp* thread_id];
-	
-	
+
+
 	iam = grid->iam;
 	myrow = MYROW( iam, grid );
 	knsupc = SuperSize( k );
 	lk = LBj( k, grid ); /* Local block number, column-wise. */
-	nub = Urbs[lk];      /* Number of U blocks in block column lk */	
+	nub = Urbs[lk];      /* Number of U blocks in block column lk */
 
-	
-	 
 	// printf("Urbs2[lk] %5d lk %5d nub %5d\n",Urbs2[lk],lk,nub);
 	// fflush(stdout);
-	
+
 	if(nub>num_thread){
 	// if(nub>0){
 		Nchunk=num_thread;
@@ -1988,28 +1983,28 @@ void zlsum_bmod_inv_master
 		remainder = nub % Nchunk;
 
 //#ifdef _OPENMP
-//#pragma	omp	taskloop firstprivate (send_req,stat) private (thread_id1,nn,lbstart,lbend,ub,temp,rtemp_loc,ik,gik,usub,uval,iknsupc,il,i,irow,jj,t1,t2,j,ikfrow,iklrow,dest,y,uptr,fnz) untied	
-//#endif	
+//#pragma	omp	taskloop firstprivate (stat) private (thread_id1,nn,lbstart,lbend,ub,temp,rtemp_loc,ik,gik,usub,uval,iknsupc,il,i,irow,jj,t1,t2,j,ikfrow,iklrow,dest,y,uptr,fnz) untied
+//#endif
 		for (nn=0;nn<Nchunk;++nn){
 
-#ifdef _OPENMP				 
+#ifdef _OPENMP
 			thread_id1 = omp_get_thread_num ();
 #else
 			thread_id1 = 0;
-#endif		
+#endif
 			rtemp_loc = &rtemp[sizertemp* thread_id1];
 
 #if ( PROFlevel>=1 )
 			TIC(t1);
-#endif				
-			
+#endif
+
 			if(nn<remainder){
 				lbstart = nn*(nub_loc+1);
 				lbend = (nn+1)*(nub_loc+1);
 			}else{
 				lbstart = remainder+nn*nub_loc;
 				lbend = remainder + (nn+1)*nub_loc;
-			}			
+			}
 			for (ub = lbstart; ub < lbend; ++ub){
 				ik = Ucb_indptr[lk][ub].lbnum; /* Local block number, row-wise. */
 				usub = Llu->Ufstnz_br_ptr[ik];
@@ -2020,8 +2015,8 @@ void zlsum_bmod_inv_master
 				gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */
 				iknsupc = SuperSize( gik );
 				ikfrow = FstBlockC( gik );
-				iklrow = FstBlockC( gik+1 );				
-				
+				iklrow = FstBlockC( gik+1 );
+
 				RHS_ITERATE(j) {
 					dest = &lsum[il + j*iknsupc+sizelsum*thread_id1];
 					y = &xk[j*knsupc];
@@ -2031,8 +2026,8 @@ void zlsum_bmod_inv_master
 						if ( fnz < iklrow ) { /* Nonzero segment. */
 							/* AXPY */
 							#ifdef _OPENMP
-							#pragma omp simd							
-							#endif							
+							#pragma omp simd
+							#endif
 							for (irow = fnz; irow < iklrow; ++irow)
 								{
 								zz_mult(&temp, &uval[uptr], &y[jj]);
@@ -2041,7 +2036,7 @@ void zlsum_bmod_inv_master
 								++uptr;
 								}
 							stat[thread_id1]->ops[SOLVE] += 8 * (iklrow - fnz);
-							
+
 						}
 					} /* for jj ... */
 				}
@@ -2049,14 +2044,14 @@ void zlsum_bmod_inv_master
 #if ( PROFlevel>=1 )
 			TOC(t2, t1);
 			stat[thread_id1]->utime[SOL_GEMM] += t2;
-#endif	
+#endif
 		}
-				
-	}else{	
+
+	}else{
 		rtemp_loc = &rtemp[sizertemp* thread_id];
 #if ( PROFlevel>=1 )
 		TIC(t1);
-#endif	
+#endif
 		for (ub = 0; ub < nub; ++ub) {
 			ik = Ucb_indptr[lk][ub].lbnum; /* Local block number, row-wise. */
 			usub = Llu->Ufstnz_br_ptr[ik];
@@ -2068,7 +2063,7 @@ void zlsum_bmod_inv_master
 			iknsupc = SuperSize( gik );
 			ikfrow = FstBlockC( gik );
 			iklrow = FstBlockC( gik+1 );
-				
+
 			RHS_ITERATE(j) {
 				dest = &lsum[il + j*iknsupc+sizelsum*thread_id];
 				y = &xk[j*knsupc];
@@ -2078,8 +2073,8 @@ void zlsum_bmod_inv_master
 					if ( fnz < iklrow ) { /* Nonzero segment. */
 						/* AXPY */
 						#ifdef _OPENMP
-						#pragma omp simd							
-						#endif						
+						#pragma omp simd
+						#endif
 						for (irow = fnz; irow < iklrow; ++irow)
 							{
 							zz_mult(&temp, &uval[uptr], &y[jj]);
@@ -2088,19 +2083,19 @@ void zlsum_bmod_inv_master
 							++uptr;
 							}
 						stat[thread_id]->ops[SOLVE] += 8 * (iklrow - fnz);
-						
+
 					}
 				} /* for jj ... */
-			}			
-		}	
+			}
+		}
 #if ( PROFlevel>=1 )
 		TOC(t2, t1);
 		stat[thread_id]->utime[SOL_GEMM] += t2;
-#endif				
+#endif
 	}
 
-	
-	rtemp_loc = &rtemp[sizertemp* thread_id];	
+
+	rtemp_loc = &rtemp[sizertemp* thread_id];
 	for (ub = 0; ub < nub; ++ub){
 		ik = Ucb_indptr[lk][ub].lbnum; /* Local block number, row-wise. */
 		il = LSUM_BLK( ik );
@@ -2109,9 +2104,9 @@ void zlsum_bmod_inv_master
 
 	// #ifdef _OPENMP
 	// #pragma omp atomic capture
-	// #endif		
+	// #endif
 		bmod_tmp=--bmod[ik*aln_i];
-		
+
 		if ( bmod_tmp == 0 ) { /* Local accumulation done. */
 			gikcol = PCOL( gik, grid );
 			p = PNUM( myrow, gikcol, grid );
@@ -2119,8 +2114,8 @@ void zlsum_bmod_inv_master
 				for (ii=1;ii<num_thread;ii++)
 					// if(ii!=thread_id)
 					#ifdef _OPENMP
-					#pragma omp simd							
-					#endif					
+					#pragma omp simd
+					#endif
 					for (jj=0;jj<iknsupc*nrhs;jj++)
 						z_add(&lsum[il + jj ],
 							  &lsum[il + jj ],
@@ -2132,15 +2127,15 @@ void zlsum_bmod_inv_master
 						iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p);
 #endif
 			} else { /* Diagonal process: X[i] += lsum[i]. */
-				
+
 #if ( PROFlevel>=1 )
 				TIC(t1);
-#endif								
+#endif
 				for (ii=1;ii<num_thread;ii++)
 					// if(ii!=thread_id)
 					#ifdef _OPENMP
-					#pragma omp simd							
-					#endif							
+					#pragma omp simd
+					#endif
 					for (jj=0;jj<iknsupc*nrhs;jj++)
 						z_add(&lsum[il + jj ],
 							  &lsum[il + jj ],
@@ -2148,16 +2143,16 @@ void zlsum_bmod_inv_master
 
 				ii = X_BLK( ik );
 				dest = &x[ii];
-						
+
 				RHS_ITERATE(j)
 					#ifdef _OPENMP
-					#pragma omp simd							
-					#endif						
+					#pragma omp simd
+					#endif
 					for (i = 0; i < iknsupc; ++i)
 						z_add(&dest[i + j*iknsupc],
 							  &dest[i + j*iknsupc],
 							  &lsum[i + il + j*iknsupc]);
-						
+
 				// if ( !brecv[ik] ) { /* Becomes a leaf node. */
 					// bmod[ik] = -1; /* Do not solve X[k] in the future. */
 					lk1 = LBj( gik, grid ); /* Local block number. */
@@ -2166,7 +2161,7 @@ void zlsum_bmod_inv_master
 					nsupr = lsub[1];
 
 					if(Llu->inv == 1){
-						Uinv = Llu->Uinv_bc_ptr[lk1];  
+						Uinv = Llu->Uinv_bc_ptr[lk1];
 #ifdef _CRAY
 						CGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc,
 								&alpha, Uinv, &iknsupc, &x[ii],
@@ -2179,30 +2174,30 @@ void zlsum_bmod_inv_master
 						zgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc,
 								&alpha, Uinv, &iknsupc, &x[ii],
 								&iknsupc, &beta, rtemp_loc, &iknsupc );
-#endif	
+#endif
 						#ifdef _OPENMP
-						#pragma omp simd							
-						#endif		   
+						#pragma omp simd
+						#endif
 						for (i=0 ; i<iknsupc*nrhs ; i++){
 							z_copy(&x[ii+i],&rtemp_loc[i]);
-						}		
+						}
 					}else{
 #ifdef _CRAY
 						CTRSM(ftcs1, ftcs3, ftcs2, ftcs2, &iknsupc, &nrhs, &alpha,
 								lusup, &nsupr, &x[ii], &iknsupc);
 #elif defined (USE_VENDOR_BLAS)
-						ztrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha, 
-								lusup, &nsupr, &x[ii], &iknsupc, 1, 1, 1, 1);	
+						ztrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha,
+								lusup, &nsupr, &x[ii], &iknsupc, 1, 1, 1, 1);
 #else
-						ztrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha, 
+						ztrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha,
 								lusup, &nsupr, &x[ii], &iknsupc);
 #endif
 					}
-			
+
 #if ( PROFlevel>=1 )
 					TOC(t2, t1);
 					stat[thread_id]->utime[SOL_TRSM] += t2;
-#endif					
+#endif
 					stat[thread_id]->ops[SOLVE] += 4 * iknsupc * (iknsupc + 1) * nrhs
 					+ 10 * knsupc * nrhs; /* complex division */
 #if ( DEBUGlevel>=2 )
@@ -2218,25 +2213,25 @@ void zlsum_bmod_inv_master
 						// fflush(stdout);
 					// }
 					if(UBtree_ptr[lk1]!=NULL){
-					BcTree_forwardMessageSimple(UBtree_ptr[lk1],&x[ii - XK_H],BcTree_GetMsgSize(UBtree_ptr[lk1],'z')*nrhs+XK_H,'z'); 
-					} 
+					BcTree_forwardMessageSimple(UBtree_ptr[lk1],&x[ii - XK_H],BcTree_GetMsgSize(UBtree_ptr[lk1],'z')*nrhs+XK_H,'z');
+					}
 
 					/*
 					 * Perform local block modifications.
 					 */
 					if ( Urbs[lk1] ){
 						// #ifdef _OPENMP
-						// #pragma	omp	task firstprivate (Ucb_indptr,Ucb_valptr,Llu,sizelsum,ii,gik,x,rtemp,bmod,Urbs,Urbs2,lsum,stat,nrhs,grid,xsup) untied 
+						// #pragma	omp	task firstprivate (Ucb_indptr,Ucb_valptr,Llu,sizelsum,ii,gik,x,rtemp,bmod,Urbs,lsum,stat,nrhs,grid,xsup) untied
 						// #endif
 						{
-						zlsum_bmod_inv_master(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs,Urbs2,
+						zlsum_bmod_inv_master(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs,
 								Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
-								send_req, stat, sizelsum,sizertemp,thread_id,num_thread);
+								stat, sizelsum,sizertemp,thread_id,num_thread);
 						}
 					}
 				// } /* if brecv[ik] == 0 */
 			}
-		} /* if bmod[ik] == 0 */		
-	}	
-	
+		} /* if bmod[ik] == 0 */
+	}
+
 } /* zlsum_bmod_inv_master */
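Note the recurring bookkeeping idiom in the hunks above: rather than forwarding a broadcast immediately (the BcTree_forwardMessageSimple calls stay commented out), a thread records the finished block in a shared list, claiming a unique slot with an OpenMP atomic capture; the aln_i stride keeps neighbouring slots on different cache lines. A minimal stand-alone sketch of that append idiom, with hypothetical names (worklist_append, work_list, nposted, stride, item):

/* Append `item` to a shared work list; the capture hands each thread a
 * unique slot even when many threads append concurrently. */
static void worklist_append(long *work_list, long *nposted,
                            long stride, long item)
{
    long slot;
#ifdef _OPENMP
#pragma omp atomic capture
#endif
    slot = ++nposted[0];                    /* increment and capture new count */
    work_list[(slot - 1) * stride] = item;  /* stride mimics the aln_i padding */
}

The list can then be drained outside the inner solve loop, where the deferred tree-forwarding messages are issued.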
diff -pruN 6.1.0+dfsg1-1/SRC/pzlangs.c 6.1.1+dfsg1-1/SRC/pzlangs.c
--- 6.1.0+dfsg1-1/SRC/pzlangs.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/pzlangs.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,15 +1,15 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
-/*! @file 
+/*! @file
  * \brief Returns the value of the one norm, or the Frobenius norm, or the infinity norm, or the element of largest value
  *
  * <pre>
@@ -22,47 +22,47 @@ at the top-level directory.
 
 /*! \brief
 
-<pre> 
-    Purpose   
-    =======   
-
-    PZLANGS returns the value of the one norm, or the Frobenius norm, or 
-    the infinity norm, or the element of largest absolute value of a 
-    real matrix A.   
-
-    Description   
-    ===========   
-
-    PZLANGE returns the value   
-
-       PZLANGE = ( max(abs(A(i,j))), NORM = 'M' or 'm'   
-                 (   
-                 ( norm1(A),         NORM = '1', 'O' or 'o'   
-                 (   
-                 ( normI(A),         NORM = 'I' or 'i'   
-                 (   
-                 ( normF(A),         NORM = 'F', 'f', 'E' or 'e'   
-
-    where  norm1  denotes the  one norm of a matrix (maximum column sum), 
-    normI  denotes the  infinity norm  of a matrix  (maximum row sum) and 
-    normF  denotes the  Frobenius norm of a matrix (square root of sum of 
-    squares).  Note that  max(abs(A(i,j)))  is not a  matrix norm.   
+<pre>
+    Purpose
+    =======
+
+    PZLANGS returns the value of the one norm, or the Frobenius norm, or
+    the infinity norm, or the element of largest absolute value of a
+    real matrix A.
+
+    Description
+    ===========
+
+    PZLANGE returns the value
+
+       PZLANGE = ( max(abs(A(i,j))), NORM = 'M' or 'm'
+                 (
+                 ( norm1(A),         NORM = '1', 'O' or 'o'
+                 (
+                 ( normI(A),         NORM = 'I' or 'i'
+                 (
+                 ( normF(A),         NORM = 'F', 'f', 'E' or 'e'
+
+    where  norm1  denotes the  one norm of a matrix (maximum column sum),
+    normI  denotes the  infinity norm  of a matrix  (maximum row sum) and
+    normF  denotes the  Frobenius norm of a matrix (square root of sum of
+    squares).  Note that  max(abs(A(i,j)))  is not a  matrix norm.
 
-    Arguments   
-    =========   
+    Arguments
+    =========
 
-    NORM    (input) CHARACTER*1   
-            Specifies the value to be returned in DLANGE as described above.   
+    NORM    (input) CHARACTER*1
+            Specifies the value to be returned in DLANGE as described above.
     A       (input) SuperMatrix*
-            The M by N sparse matrix A. 
+            The M by N sparse matrix A.
     GRID    (input) gridinfo_t*
             The 2D process mesh.
-   ===================================================================== 
+   =====================================================================
 </pre>
 */
 
 double pzlangs(char *norm, SuperMatrix *A, gridinfo_t *grid)
-{   
+{
     /* Local variables */
     NRformat_loc *Astore;
     int_t    m_loc;
@@ -76,7 +76,7 @@ double pzlangs(char *norm, SuperMatrix *
     Astore = (NRformat_loc *) A->Store;
     m_loc = Astore->m_loc;
     Aval   = (doublecomplex *) Astore->nzval;
-    
+
     if ( SUPERLU_MIN(A->nrow, A->ncol) == 0) {
 	value = 0.;
     } else if ( strncmp(norm, "M", 1)==0 ) {
@@ -96,7 +96,7 @@ double pzlangs(char *norm, SuperMatrix *
 #if 0
 	for (j = 0; j < A->ncol; ++j) {
 	    sum = 0.;
-	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) 
+	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++)
 		sum += fabs(Aval[i]);
 	    value = SUPERLU_MAX(value,sum);
 	}
@@ -119,7 +119,7 @@ double pzlangs(char *norm, SuperMatrix *
 	}
 	SUPERLU_FREE (temprwork);
 	SUPERLU_FREE (rwork);
-#endif	
+#endif
     } else if ( strncmp(norm, "I", 1)==0 ) {
 	/* Find normI(A). */
 	value = 0.;
@@ -138,7 +138,7 @@ double pzlangs(char *norm, SuperMatrix *
     } else {
 	ABORT("Illegal norm specified.");
     }
-    
+
     return (value);
 
 } /* pzlangs */
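The doc block above describes the four norms; for a distributed matrix each of them amounts to a local pass over the processor's rows followed by a reduction over the grid communicator. A minimal sketch of the max-entry case ('M'), with illustrative names (the local nonzeros are split into re/im arrays here purely for brevity):

#include <math.h>
#include <mpi.h>

/* Max |a_ij| of a distributed complex matrix: local scan + MPI_MAX reduction. */
double dist_max_abs(const double *re, const double *im, long nnz_loc, MPI_Comm comm)
{
    double local = 0.0, global = 0.0;
    for (long i = 0; i < nnz_loc; ++i) {
        double a = hypot(re[i], im[i]);      /* |a_ij| */
        if (a > local) local = a;
    }
    MPI_Allreduce(&local, &global, 1, MPI_DOUBLE, MPI_MAX, comm);
    return global;
}

The one norm and infinity norm follow the same shape, except that column or row sums are accumulated first and the maximum is taken over those sums.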
diff -pruN 6.1.0+dfsg1-1/SRC/pzlaqgs.c 6.1.1+dfsg1-1/SRC/pzlaqgs.c
--- 6.1.0+dfsg1-1/SRC/pzlaqgs.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/pzlaqgs.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,15 +1,15 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
-/*! @file 
+/*! @file
  * \brief Equilibrates a general sparse M by N matrix
  *
  * <pre>
@@ -23,70 +23,70 @@ at the top-level directory.
 /*! \brief
 
 <pre>
-    Purpose   
-    =======   
+    Purpose
+    =======
 
     PZLAQGS equilibrates a general sparse M by N matrix A using the row
-    and column scaling factors in the vectors R and C.   
+    and column scaling factors in the vectors R and C.
 
     See supermatrix.h for the definition of 'SuperMatrix' structure.
 
-    Arguments   
-    =========   
+    Arguments
+    =========
 
     A       (input/output) SuperMatrix*
-            On exit, the equilibrated matrix.  See EQUED for the form of 
+            On exit, the equilibrated matrix.  See EQUED for the form of
             the equilibrated matrix. The type of A can be:
 	    Stype = SLU_NR_loc; Dtype = SLU_Z; Mtype = SLU_GE.
-	    
+
     R       (input) double*, dimension (A->nrow)
             The row scale factors for A.
-	    
+
     C       (input) double*, dimension (A->ncol)
             The column scale factors for A.
-	    
+
     ROWCND  (input) double
             Ratio of the smallest R(i) to the largest R(i).
-	    
+
     COLCND  (input) double
             Ratio of the smallest C(i) to the largest C(i).
-	    
+
     AMAX    (input) double
             Absolute value of largest matrix entry.
-	    
+
     EQUED   (output) char*
-            Specifies the form of equilibration that was done.   
-            = 'N':  No equilibration   
-            = 'R':  Row equilibration, i.e., A has been premultiplied by  
-                    diag(R).   
-            = 'C':  Column equilibration, i.e., A has been postmultiplied  
-                    by diag(C).   
+            Specifies the form of equilibration that was done.
+            = 'N':  No equilibration
+            = 'R':  Row equilibration, i.e., A has been premultiplied by
+                    diag(R).
+            = 'C':  Column equilibration, i.e., A has been postmultiplied
+                    by diag(C).
             = 'B':  Both row and column equilibration, i.e., A has been
-                    replaced by diag(R) * A * diag(C).   
+                    replaced by diag(R) * A * diag(C).
 
-    Internal Parameters   
-    ===================   
+    Internal Parameters
+    ===================
 
-    THRESH is a threshold value used to decide if row or column scaling   
-    should be done based on the ratio of the row or column scaling   
-    factors.  If ROWCND < THRESH, row scaling is done, and if   
-    COLCND < THRESH, column scaling is done.   
+    THRESH is a threshold value used to decide if row or column scaling
+    should be done based on the ratio of the row or column scaling
+    factors.  If ROWCND < THRESH, row scaling is done, and if
+    COLCND < THRESH, column scaling is done.
 
-    LARGE and SMALL are threshold values used to decide if row scaling   
-    should be done based on the absolute size of the largest matrix   
-    element.  If AMAX > LARGE or AMAX < SMALL, row scaling is done.   
+    LARGE and SMALL are threshold values used to decide if row scaling
+    should be done based on the absolute size of the largest matrix
+    element.  If AMAX > LARGE or AMAX < SMALL, row scaling is done.
 
-    ===================================================================== 
+    =====================================================================
 </pre>
 */
 
 void
-pzlaqgs(SuperMatrix *A, double *r, double *c, 
+pzlaqgs(SuperMatrix *A, double *r, double *c,
        double rowcnd, double colcnd, double amax, char *equed)
 {
 
 #define THRESH    (0.1)
-    
+
     /* Local variables */
     NRformat_loc *Astore;
     doublecomplex *Aval;
@@ -103,7 +103,7 @@ pzlaqgs(SuperMatrix *A, double *r, doubl
     Astore = A->Store;
     Aval = Astore->nzval;
     m_loc = Astore->m_loc;
-    
+
     /* Initialize LARGE and SMALL. */
     small = dmach_dist("Safe minimum") / dmach_dist("Precision");
     large = 1. / small;
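The THRESH/LARGE/SMALL policy spelled out above reduces to two flags once small and large have been initialized as in the retained lines. A rough sketch of how the scales would then be applied to the local row slice of the matrix (a paraphrase of the policy with illustrative names, not the routine verbatim; val[k] is a doublecomplex with fields .r/.i):

/* Apply row scales r[] and/or column scales c[] to the local rows
 * [fst_row, fst_row + m_loc) of A and report what was done. */
static char equilibrate_local(long m_loc, long fst_row,
                              const long *rowptr, const long *colind,
                              doublecomplex *val,
                              const double *r, const double *c,
                              double rowcnd, double colcnd, double amax,
                              double small, double large)
{
    const double thresh = 0.1;
    int do_row = (rowcnd < thresh || amax < small || amax > large);
    int do_col = (colcnd < thresh);

    for (long i = 0; i < m_loc; ++i)
        for (long k = rowptr[i]; k < rowptr[i + 1]; ++k) {
            double s = (do_row ? r[fst_row + i] : 1.0)
                     * (do_col ? c[colind[k]] : 1.0);
            val[k].r *= s;
            val[k].i *= s;
        }
    return do_row ? (do_col ? 'B' : 'R') : (do_col ? 'C' : 'N');
}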
diff -pruN 6.1.0+dfsg1-1/SRC/pzsymbfact_distdata.c 6.1.1+dfsg1-1/SRC/pzsymbfact_distdata.c
--- 6.1.0+dfsg1-1/SRC/pzsymbfact_distdata.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/pzsymbfact_distdata.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,20 +1,20 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
-/*! @file 
+/*! @file
  * \brief Redistribute the symbolic structure of L and U from the distribution
  *
  * <pre>
  * -- Parallel symbolic factorization auxiliary routine (version 2.3) --

- * -- Distributes the data from parallel symbolic factorization 
+ * -- Distributes the data from parallel symbolic factorization
  * -- to numeric factorization
  * INRIA France -  July 1, 2004
  * Laura Grigori
@@ -37,12 +37,12 @@ at the top-level directory.
  * <pre>
  * Purpose
  * =======
- * 
+ *
  * Redistribute the symbolic structure of L and U from the distribution
  * used in the parallel symbolic factorization step to the distribution
  * used in the parallel numeric factorization step.  On exit, the L and U
  * structure for the 2D distribution used in the numeric factorization step is
- * stored in p_xlsub, p_lsub, p_xusub, p_usub.  The global supernodal 
+ * stored in p_xlsub, p_lsub, p_xusub, p_usub.  The global supernodal
  * information is also computed and it is stored in Glu_persist->supno
  * and Glu_persist->xsup.
  *
@@ -51,11 +51,11 @@ at the top-level directory.
  * p_xlsub, p_lsub, p_xusub, p_usub,
  * Glu_persist->supno,  Glu_persist->xsup.
  *
- * This routine also deallocates memory allocated during symbolic 
+ * This routine also deallocates memory allocated during symbolic
  * factorization routine.  That is, the following arrays are freed:
- * Pslu_freeable->xlsub,  Pslu_freeable->lsub, 
- * Pslu_freeable->xusub, Pslu_freeable->usub, 
- * Pslu_freeable->globToLoc, Pslu_freeable->supno_loc, 
+ * Pslu_freeable->xlsub,  Pslu_freeable->lsub,
+ * Pslu_freeable->xusub, Pslu_freeable->usub,
+ * Pslu_freeable->globToLoc, Pslu_freeable->supno_loc,
  * Pslu_freeable->xsup_beg_loc, Pslu_freeable->xsup_end_loc.
  *
  * Arguments
@@ -64,28 +64,28 @@ at the top-level directory.
  * n      (Input) int_t
  *        Order of the input matrix
  * Pslu_freeable  (Input) Pslu_freeable_t *
- *        Local L and U structure, 
+ *        Local L and U structure,
  *        global to local indexing information.
- * 
+ *
  * Glu_persist (Output) Glu_persist_t *
  *        Stores on output the information on supernodes mapping.
- * 
+ *
  * p_xlsub (Output) int_t **
- *         Pointer to structure of L distributed on a 2D grid 
+ *         Pointer to structure of L distributed on a 2D grid
  *         of processors, stored by columns.
- * 
+ *
  * p_lsub  (Output) int_t **
- *         Structure of L distributed on a 2D grid of processors, 
+ *         Structure of L distributed on a 2D grid of processors,
  *         stored by columns.
  *
  * p_xusub (Output) int_t **
- *         Pointer to structure of U distributed on a 2D grid 
+ *         Pointer to structure of U distributed on a 2D grid
  *         of processors, stored by rows.
- * 
+ *
  * p_usub  (Output) int_t **
- *         Structure of U distributed on a 2D grid of processors, 
+ *         Structure of U distributed on a 2D grid of processors,
  *         stored by rows.
- * 
+ *
  * grid   (Input) gridinfo_t*
  *        The 2D process mesh.
  *
@@ -98,14 +98,14 @@ at the top-level directory.
  */
 
 static float
-dist_symbLU (int_t n, Pslu_freeable_t *Pslu_freeable, 
-	     Glu_persist_t *Glu_persist, 
+dist_symbLU (int_t n, Pslu_freeable_t *Pslu_freeable,
+	     Glu_persist_t *Glu_persist,
 	     int_t **p_xlsub, int_t **p_lsub, int_t **p_xusub, int_t **p_usub,
 	     gridinfo_t *grid
 	     )
 {
   int   iam, nprocs, pc, pr, p, np, p_diag;
-  int_t *nnzToSend, *nnzToRecv, *nnzToSend_l, *nnzToSend_u, 
+  int_t *nnzToSend, *nnzToRecv, *nnzToSend_l, *nnzToSend_u,
     *tmp_ptrToSend, *mem;
   int_t *nnzToRecv_l, *nnzToRecv_u;
   int_t *send_1, *send_2, nsend_1, nsend_2;
@@ -125,7 +125,7 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
   float memAux;  /* Memory used during this routine and freed on return */
   float memRet; /* Memory allocated and not freed on return */
   int_t iword, dword;
-  
+
   /* ------------------------------------------------------------
      INITIALIZATION.
      ------------------------------------------------------------*/
@@ -146,7 +146,7 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
   iword = sizeof(int_t);
   dword = sizeof(doublecomplex);
   memAux = 0.; memRet = 0.;
-  
+
   mem           = intCalloc_dist(12 * nprocs);
   if (!mem)
     return (ERROR_RET);
@@ -160,7 +160,7 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
   tmp_ptrToSend = send_2 + nprocs;
   nnzToRecv_l   = tmp_ptrToSend + nprocs;
   nnzToRecv_u   = nnzToRecv_l + nprocs;
-  
+
   ptrToSend = nnzToSend;
   ptrToRecv = nnzToSend + nprocs;
 
@@ -172,7 +172,7 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
   memAux += 5 * nprocs * sizeof(int);
 
   maxszsn   = sp_ienv_dist(3);
-  
+
   /* Allocate space for storing Glu_persist_n. */
   if ( !(supno_n = intMalloc_dist(n+1)) ) {
     fprintf (stderr, "Malloc fails for supno_n[].");
@@ -183,7 +183,7 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
   /* ------------------------------------------------------------
      DETERMINE SUPERNODES FOR NUMERICAL FACTORIZATION
      ------------------------------------------------------------*/
-  
+
   if (nvtcs_loc > INT_MAX)
     ABORT("ERROR in dist_symbLU nvtcs_loc > INT_MAX\n");
   intNvtcs_loc = (int) nvtcs_loc;
@@ -198,7 +198,7 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
       k += nvtcs[p];
     }
   }
-  
+
   if (nprocs > 1) {
     temp = NULL;
     if (!iam ) {
@@ -217,7 +217,7 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
 #else  /* Default */
     intBuf1 = ptrToRecv;
 #endif
-    MPI_Gatherv (supno_s, (int) nvtcs_loc, mpi_int_t, 
+    MPI_Gatherv (supno_s, (int) nvtcs_loc, mpi_int_t,
 		 temp, nvtcs, intBuf1, mpi_int_t, 0, grid->comm);
   }
   else
@@ -254,7 +254,7 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
   /* reset to 0 nnzToSend */
   for (p = 0; p < 2 *nprocs; p++)
     nnzToSend[p] = 0;
-  
+
   MPI_Bcast (supno_n, n+1, mpi_int_t, 0, grid->comm);
   nsupers = supno_n[n];
   /* Allocate space for storing Glu_persist_n. */
@@ -262,7 +262,7 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
     fprintf (stderr, "Malloc fails for xsup_n[].");
     return (memAux + memRet);
   }
-  memRet += (float) (nsupers+1) * iword;  
+  memRet += (float) (nsupers+1) * iword;
 
   /* ------------------------------------------------------------
      COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS,
@@ -278,7 +278,7 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
     }
   }
   xsup_n[nsupers] = n;
-  
+
   for (p = 0; p < nprocs; p++) {
     send_1[p] = FALSE;
     send_2[p] = FALSE;
@@ -289,7 +289,7 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
       pc = PCOL( gb_n, grid );
       pr = PROW( gb_n, grid );
       p_diag = PNUM( pr, pc, grid);
-      
+
       i_loc = LOCAL_IND( globToLoc[i] );
       gb_s  = supno_s[i_loc];
       fst_s = xsup_beg_s[gb_s];
@@ -309,17 +309,17 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
 	if (k >= i + xsup_n[gb_n+1] - xsup_n[gb_n]) {
 	  gb = supno_n[k];
 	  p = PNUM( pr, PCOL(gb, grid), grid);
-	  nnzToSend[2*p+1] ++;	
+	  nnzToSend[2*p+1] ++;
 	  send_2[p] = TRUE;
 	}
       }
-      
+
       nsend_2 = 0;
       for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) {
 	nnzToSend[2*p+1] += 2;
-	if (send_2[p])  nsend_2 ++;	  
+	if (send_2[p])  nsend_2 ++;
       }
-      for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) 
+      for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++)
 	if (send_2[p] || p == p_diag) {
 	  if (p == p_diag && !send_2[p])
 	    nnzToSend[2*p+1] += nsend_2;
@@ -332,7 +332,7 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
 	nnzToSend[2*p] += 2;
 	if (send_1[p]) nsend_1 ++;
       }
-      for (p = pc; p < nprocs; p += grid->npcol) 
+      for (p = pc; p < nprocs; p += grid->npcol)
 	if (send_1[p]) {
 	  nnzToSend[2*p] += nsend_1-1;
 	  send_1[p] = FALSE;
@@ -341,28 +341,28 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
 	  nnzToSend[2*p] += nsend_1;
     }
   }
-  
+
   /* All-to-all communication */
   MPI_Alltoall( nnzToSend, 2, mpi_int_t, nnzToRecv, 2, mpi_int_t,
 		grid->comm);
-  
+
   nnz_loc_l = nnz_loc_u = 0;
-  SendCnt_l = SendCnt_u = RecvCnt_l = RecvCnt_u = 0;  
+  SendCnt_l = SendCnt_u = RecvCnt_l = RecvCnt_u = 0;
   for (p = 0; p < nprocs; p++) {
     if ( p != iam ) {
       SendCnt_l += nnzToSend[2*p];   nnzToSend_l[p] = nnzToSend[2*p];
-      SendCnt_u += nnzToSend[2*p+1]; nnzToSend_u[p] = nnzToSend[2*p+1]; 
+      SendCnt_u += nnzToSend[2*p+1]; nnzToSend_u[p] = nnzToSend[2*p+1];
       RecvCnt_l += nnzToRecv[2*p];   nnzToRecv_l[p] = nnzToRecv[2*p];
       RecvCnt_u += nnzToRecv[2*p+1]; nnzToRecv_u[p] = nnzToRecv[2*p+1];
     } else {
       nnz_loc_l += nnzToRecv[2*p];
       nnz_loc_u += nnzToRecv[2*p+1];
       nnzToSend_l[p] = 0; nnzToSend_u[p] = 0;
-      nnzToRecv_l[p] = nnzToRecv[2*p]; 
+      nnzToRecv_l[p] = nnzToRecv[2*p];
       nnzToRecv_u[p] = nnzToRecv[2*p+1];
     }
   }
-  
+
   /* Allocate space for storing the symbolic structure after redistribution. */
   nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
   nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */
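The redistribution above is organized as two passes: a counting pass whose per-destination totals are exchanged with MPI_Alltoall (so every process can size its receive buffers), and a packing pass followed by the actual data exchange. A minimal sketch of the counting exchange, with illustrative names:

#include <mpi.h>

/* Swap per-destination counts and derive receive displacements; the second
 * pass (the MPI_Alltoallv of the actual indices) sizes its buffers from
 * recv_cnt[]. */
static void exchange_counts(int *send_cnt, int *recv_cnt,
                            int *recv_off, int nprocs, MPI_Comm comm)
{
    MPI_Alltoall(send_cnt, 1, MPI_INT, recv_cnt, 1, MPI_INT, comm);

    recv_off[0] = 0;                       /* exclusive prefix sum */
    for (int p = 1; p < nprocs; ++p)
        recv_off[p] = recv_off[p - 1] + recv_cnt[p - 1];
}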
@@ -376,16 +376,16 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
     fprintf (stderr, "Malloc fails for xusub_n[].");
     return (memAux + memRet);
   }
-  memRet += (float) (nsupers_i+1) * iword;  
+  memRet += (float) (nsupers_i+1) * iword;
 
   /* Allocate temp storage for sending/receiving the L/U symbolic structure. */
   if ( (RecvCnt_l + nnz_loc_l) || (RecvCnt_u + nnz_loc_u) ) {
-    if (!(rcv_luind = 
+    if (!(rcv_luind =
 	  intMalloc_dist(SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u))) ) {
       fprintf (stderr, "Malloc fails for rcv_luind[].");
       return (memAux + memRet);
     }
-    memAux += (float) SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u) 
+    memAux += (float) SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u)
       * iword;
   }
   if ( nprocs > 1 && (SendCnt_l || SendCnt_u) ) {
@@ -394,8 +394,8 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
       return (memAux + memRet);
     }
     memAux += (float) SUPERLU_MAX(SendCnt_l, SendCnt_u) * iword;
-  } 
-  
+  }
+
   /* ------------------------------------------------------------------
      LOAD THE SYMBOLIC STRUCTURE OF L AND U INTO THE STRUCTURES TO SEND.
      THIS ACCOUNTS FOR THE SECOND PASS OF L and U.
@@ -418,16 +418,16 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
       ptrToRecv[p] = j;  j += nnzToRecv[p];
     }
     nnzToRecv[iam] = 0;
-    
+
     ind_loc = ptrToRecv[iam];
     for (gb_n = 0; gb_n < nsupers; gb_n++) {
-      nsend_2 = 0;    
+      nsend_2 = 0;
       i = xsup_n[gb_n];
       if (iam == OWNER( globToLoc[i] )) {
 	pc = PCOL( gb_n, grid );
 	pr = PROW( gb_n, grid );
 	p_diag = PNUM( pr, pc, grid );
-	
+
 	i_loc = LOCAL_IND( globToLoc[i] );
 	gb_s  = supno_s[i_loc];
 	fst_s = xsup_beg_s[gb_s];
@@ -435,7 +435,7 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
 	fst_s_l = LOCAL_IND( globToLoc[fst_s] );
 
 	if (sendL) {
-	  p = pc;                np = grid->nprow;	  
+	  p = pc;                np = grid->nprow;
 	} else {
 	  p = pr * grid->npcol;  np = grid->npcol;
 	}
@@ -444,13 +444,13 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
 	    rcv_luind[ind_loc] = gb_n;
 	    rcv_luind[ind_loc+1] = 0;
 	    tmp_ptrToSend[p] = ind_loc + 1;
-	    ind_loc += 2;	 
+	    ind_loc += 2;
 	  }
 	  else {
 	    snd_luind[ptrToSend[p]] = gb_n;
 	    snd_luind[ptrToSend[p]+1] = 0;
 	    tmp_ptrToSend[p] = ptrToSend[p] + 1;
-	    ptrToSend[p] += 2;	 
+	    ptrToSend[p] += 2;
 	  }
 	  if (sendL) p += grid->npcol;
 	  if (sendU) p++;
@@ -461,7 +461,7 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
 	    gb = supno_n[k];
 	    if (sendL)
 	      p = PNUM( PROW(gb, grid), pc, grid );
-	    else 
+	    else
 	      p = PNUM( pr, PCOL(gb, grid), grid);
 	    if (send_1[p] == FALSE) {
 	      send_1[p] = TRUE;
@@ -496,10 +496,10 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
 		}
 	      }
 	      send_1[p] = FALSE;
-	  }  
+	  }
 	if (sendU)
 	  for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) {
-	    if (send_1[p] || p == p_diag) {	      
+	    if (send_1[p] || p == p_diag) {
 	      for (k = 0; k < nsend_2; k++) {
 		gb = supno_n[send_2[k]];
 		if(PNUM( pr, PCOL(gb, grid), grid) != p) {
@@ -510,15 +510,15 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
 		  else {
 		    snd_luind[ptrToSend[p]] = send_2[k];
 		    ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++;
-		  }	     
+		  }
 		}
-	      } 
+	      }
 	      send_1[p] = FALSE;
 	    }
 	  }
       }
     }
-    
+
     /* reset ptrToSnd to point to the beginning of the data for
        each processor (structure needed in MPI_Alltoallv) */
     for (i = 0, p = 0; p < nprocs; p++) {
@@ -546,24 +546,24 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
       intBuf3 = nnzToRecv;  intBuf4 = ptrToRecv;
 #endif
 
-      MPI_Alltoallv (snd_luind, intBuf1, intBuf2, mpi_int_t, 
+      MPI_Alltoallv (snd_luind, intBuf1, intBuf2, mpi_int_t,
 		     rcv_luind, intBuf3, intBuf4, mpi_int_t,
 		     grid->comm);
     }
     if (sendL)
       nnzToRecv[iam] = nnz_loc_l;
-    else 
+    else
       nnzToRecv[iam] = nnz_loc_u;
-    
+
     /* ------------------------------------------------------------
        DEALLOCATE TEMPORARY STORAGE.
        -------------------------------------------------------------*/
-    if (sendU) 
+    if (sendU)
       if ( nprocs > 1 && (SendCnt_l || SendCnt_u) ) {
 	SUPERLU_FREE (snd_luind);
 	memAux -= (float) SUPERLU_MAX(SendCnt_l, SendCnt_u) * iword;
       }
-    
+
     /* ------------------------------------------------------------
        CONVERT THE FORMAT.
        ------------------------------------------------------------*/
@@ -587,9 +587,9 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
 
     if (sendL) j = nsupers_j;
     else j = nsupers_i;
-    k = 0; 
+    k = 0;
     isize = xsub_n[0];
-    xsub_n[0] = 0; 
+    xsub_n[0] = 0;
     for (gb_l = 1; gb_l < j; gb_l++) {
       k += isize;
       isize = xsub_n[gb_l];
@@ -619,7 +619,7 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
       }
       sub_n = usub_n;
     }
-    
+
     /* Copy the data into the L column / U row oriented storage */
     k = 0;
     for (p = 0; p < nprocs; p++) {
@@ -635,7 +635,7 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
 	for (j = xsub_n[gb_l]; j < xsub_n[gb_l+1]; i++, j++) {
 	  sub_n[j] = rcv_luind[i];
 	}
-      }      
+      }
       k += nnzToRecv[p];
     }
     if (sendL) {
@@ -650,23 +650,23 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
     SUPERLU_FREE (rcv_luind);
     memAux -= (float) SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u) * iword;
   }
-  SUPERLU_FREE (mem);  
+  SUPERLU_FREE (mem);
   memAux -= (float) (12 * nprocs * iword);
   SUPERLU_FREE(nvtcs);
   memAux -= (float) (5 * nprocs * sizeof(int));
-  
+
   if (xlsub_s != NULL) {
     SUPERLU_FREE (xlsub_s); SUPERLU_FREE (lsub_s);
   }
   if (xusub_s != NULL) {
     SUPERLU_FREE (xusub_s); SUPERLU_FREE (usub_s);
   }
-  SUPERLU_FREE (globToLoc); 
+  SUPERLU_FREE (globToLoc);
   if (supno_s != NULL) {
     SUPERLU_FREE (xsup_beg_s); SUPERLU_FREE (xsup_end_s);
     SUPERLU_FREE (supno_s);
   }
-  
+
   Glu_persist->supno = supno_n;  Glu_persist->xsup  = xsup_n;
   *p_xlsub = xlsub_n; *p_lsub = lsub_n;
   *p_xusub = xusub_n; *p_usub = usub_n;
@@ -674,10 +674,10 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
 #if ( DEBUGlevel>=1 )
   CHECK_MALLOC(iam, "Exit dist_symbLU()");
 #endif
-  
+
   return (-memRet);
 }
- 
+
 /*! \brief
  *
  * <pre>
@@ -686,10 +686,10 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
  *   Re-distribute A on the 2D process mesh.  The lower part is
  *   stored using a column format and the upper part
  *   is stored using a row format.
- * 
+ *
  * Arguments
  * =========
- * 
+ *
  * A      (Input) SuperMatrix*
  *	  The distributed input matrix A of dimension (A->nrow, A->ncol).
  *        The type of A can be: Stype = SLU_NR_loc; Dtype = SLU_Z; Mtype = SLU_GE.
@@ -700,40 +700,40 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
  *
  * Glu_persist  (Input) Glu_persist_t *
  *        Information on supernodes mapping.
- * 
+ *
  * grid   (Input) gridinfo_t*
  *        The 2D process mesh.
  *
  * p_ainf_colptr (Output) int_t**
- *         Pointer to the lower part of A distributed on a 2D grid 
+ *         Pointer to the lower part of A distributed on a 2D grid
  *         of processors, stored by columns.
  *
  * p_ainf_rowind (Output) int_t**
- *         Structure of the lower part of A distributed on a 
+ *         Structure of the lower part of A distributed on a
  *         2D grid of processors, stored by columns.
  *
  * p_ainf_val    (Output) doublecomplex**
- *         Numerical values of the lower part of A, distributed on a 
+ *         Numerical values of the lower part of A, distributed on a
  *         2D grid of processors, stored by columns.
  *
  * p_asup_rowptr (Output) int_t**
- *         Pointer to the upper part of A distributed on a 2D grid 
+ *         Pointer to the upper part of A distributed on a 2D grid
  *         of processors, stored by rows.
  *
  * p_asup_colind (Output) int_t**
- *         Structure of the upper part of A distributed on a 
+ *         Structure of the upper part of A distributed on a
  *         2D grid of processors, stored by rows.
  *
  * p_asup_val    (Output) doublecomplex**
- *         Numerical values of the upper part of A, distributed on a 
+ *         Numerical values of the upper part of A, distributed on a
  *         2D grid of processors, stored by rows.
  *
  * ilsum_i  (Input) int_t *
- *       Starting position of each supernode in 
+ *       Starting position of each supernode in
  *       the full array (local, block row wise).
  *
  * ilsum_j  (Input) int_t *
- *       Starting position of each supernode in 
+ *       Starting position of each supernode in
  *       the full array (local, block column wise).
  *
  * Return value
@@ -743,10 +743,10 @@ dist_symbLU (int_t n, Pslu_freeable_t *P
  *        (an approximation).
  * </pre>
  */
- 
+
 static float
 zdist_A(SuperMatrix *A, ScalePermstruct_t *ScalePermstruct,
-	Glu_persist_t *Glu_persist, gridinfo_t *grid, 
+	Glu_persist_t *Glu_persist, gridinfo_t *grid,
 	int_t **p_ainf_colptr, int_t **p_ainf_rowind, doublecomplex **p_ainf_val,
 	int_t **p_asup_rowptr, int_t **p_asup_colind, doublecomplex **p_asup_val,
 	int_t *ilsum_i, int_t *ilsum_j
@@ -771,7 +771,7 @@ zdist_A(SuperMatrix *A, ScalePermstruct_
   MPI_Request *send_req;
   MPI_Status  status;
   int_t *xsup = Glu_persist->xsup;    /* supernode and column mapping */
-  int_t *supno = Glu_persist->supno;   
+  int_t *supno = Glu_persist->supno;
   float memAux;  /* Memory used during this routine and freed on return */
   float memRet; /* Memory allocated and not freed on return */
   int_t iword, dword, szbuf;
@@ -785,7 +785,7 @@ zdist_A(SuperMatrix *A, ScalePermstruct_
 #endif
   iword = sizeof(int_t);
   dword = sizeof(double);
-  
+
   perm_r = ScalePermstruct->perm_r;
   perm_c = ScalePermstruct->perm_c;
   procs = grid->nprow * grid->npcol;
@@ -800,7 +800,7 @@ zdist_A(SuperMatrix *A, ScalePermstruct_
   memAux = (float) (2 * procs * iword);
   memRet = 0.;
   nnzToSend = nnzToRecv + procs;
-  nsupers  = supno[n-1] + 1;  
+  nsupers  = supno[n-1] + 1;
 
   /* ------------------------------------------------------------
      COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS,
@@ -814,17 +814,17 @@ zdist_A(SuperMatrix *A, ScalePermstruct_
       gbi = BlockNum( irow );
       gbj = BlockNum( jcol );
       p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid );
-      ++nnzToSend[p]; 
+      ++nnzToSend[p];
     }
   }
-  
+
   /* All-to-all communication */
   MPI_Alltoall( nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t,
 		grid->comm);
-  
+
   maxnnzToRecv = 0;
   nnz_loc = SendCnt = RecvCnt = 0;
-  
+
   for (p = 0; p < procs; ++p) {
     if ( p != iam ) {
       SendCnt += nnzToSend[p];
@@ -850,7 +850,7 @@ zdist_A(SuperMatrix *A, ScalePermstruct_
     return (memAux);
   }
   memAux += (float) (k*dword);
-  
+
   /* Allocate temporary storage for sending/receiving the A triplets. */
   if ( procs > 1 ) {
     if ( !(send_req = (MPI_Request *)
@@ -868,7 +868,7 @@ zdist_A(SuperMatrix *A, ScalePermstruct_
       fprintf(stderr, "Malloc fails for aij_send[].");
       return (memAux);
     }
-    memAux += (float) (procs*sizeof(doublecomplex*));    
+    memAux += (float) (procs*sizeof(doublecomplex*));
     if ( !(index = intMalloc_dist(2*SendCnt)) ) {
       fprintf(stderr, "Malloc fails for index[].");
       return (memAux);
@@ -894,7 +894,7 @@ zdist_A(SuperMatrix *A, ScalePermstruct_
       return (memAux);
     }
     memAux += (float) (maxnnzToRecv * dword);
-    
+
     for (i = 0, j = 0, p = 0; p < procs; ++p) {
       if ( p != iam ) {
 	ia_send[p] = &index[i];
@@ -904,7 +904,7 @@ zdist_A(SuperMatrix *A, ScalePermstruct_
       }
     }
   } /* if procs > 1 */
-  
+
   nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
   nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */
   if ( !(ainf_colptr = intCalloc_dist(ilsum_j[nsupers_j] + 1)) ) {
@@ -917,7 +917,7 @@ zdist_A(SuperMatrix *A, ScalePermstruct_
     return (memAux+memRet);
   }
   memRet += (float) (ilsum_i[nsupers_i] + 1) * iword;
-  
+
   /* ------------------------------------------------------------
      LOAD THE ENTRIES OF A INTO THE (IA,JA,AIJ) STRUCTURES TO SEND.
      THIS ACCOUNTS FOR THE SECOND PASS OF A.
@@ -932,13 +932,13 @@ zdist_A(SuperMatrix *A, ScalePermstruct_
       gbi = BlockNum( irow );
       gbj = BlockNum( jcol );
       p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid );
-      
+
       if ( p != iam ) { /* remote */
 	k = ptr_to_send[p];
 	ia_send[p][k] = irow;
 	ia_send[p][k + nnzToSend[p]] = jcol;
 	aij_send[p][k] = nzval_a[j];
-	++ptr_to_send[p]; 
+	++ptr_to_send[p];
       } else {          /* local */
 	ia[nnz_loc] = irow;
 	ja[nnz_loc] = jcol;
@@ -968,14 +968,14 @@ zdist_A(SuperMatrix *A, ScalePermstruct_
 		 p, iam, grid->comm, &send_req[p] );
       it = nnzToSend[p];
       MPI_Isend( aij_send[p], it, SuperLU_MPI_DOUBLE_COMPLEX,
-		 p, iam+procs, grid->comm, &send_req[procs+p] ); 
+		 p, iam+procs, grid->comm, &send_req[procs+p] );
     }
   }
-  
+
   for (p = 0; p < procs; ++p) {
     if ( p != iam ) {
       it = 2*nnzToRecv[p];
-      MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status ); 
+      MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status );
       it = nnzToRecv[p];
       MPI_Recv( dtemp, it, SuperLU_MPI_DOUBLE_COMPLEX, p, p+procs,
 		grid->comm, &status );
@@ -987,7 +987,7 @@ zdist_A(SuperMatrix *A, ScalePermstruct_
 	ja[nnz_loc] = jcol;
 	aij[nnz_loc] = dtemp[i];
 	++nnz_loc;
-	
+
 	gbi = BlockNum( irow );
 	gbj = BlockNum( jcol );
 	/* Count nonzeros in each column of L / row of U */
@@ -1002,18 +1002,18 @@ zdist_A(SuperMatrix *A, ScalePermstruct_
       }
     }
   }
-  
+
   for (p = 0; p < procs; ++p) {
     if ( p != iam ) {
       MPI_Wait( &send_req[p], &status);
       MPI_Wait( &send_req[procs+p], &status);
     }
   }
-  
+
   /* ------------------------------------------------------------
      DEALLOCATE TEMPORARY STORAGE
      ------------------------------------------------------------*/
-  
+
   SUPERLU_FREE(nnzToRecv);
   memAux -= 2 * procs * iword;
   if ( procs > 1 ) {
@@ -1030,7 +1030,7 @@ zdist_A(SuperMatrix *A, ScalePermstruct_
       SendCnt* dword + procs*iword +
       2*maxnnzToRecv*iword + maxnnzToRecv*dword;
   }
-  
+
   /* ------------------------------------------------------------
      CONVERT THE TRIPLET FORMAT.
      ------------------------------------------------------------*/
@@ -1068,11 +1068,11 @@ zdist_A(SuperMatrix *A, ScalePermstruct_
   }
 
   /* Initialize the array of column pointers */
-  k = 0; 
-  jsize = ainf_colptr[0];  ainf_colptr[0] = 0; 
+  k = 0;
+  jsize = ainf_colptr[0];  ainf_colptr[0] = 0;
   for (j = 1; j < ilsum_j[nsupers_j]; j++) {
-    k += jsize;              
-    jsize = ainf_colptr[j];  
+    k += jsize;
+    jsize = ainf_colptr[j];
     ainf_colptr[j] = k;
   }
   ainf_colptr[ilsum_j[nsupers_j]] = k + jsize;
@@ -1080,7 +1080,7 @@ zdist_A(SuperMatrix *A, ScalePermstruct_
   isize = asup_rowptr[0];  asup_rowptr[0] = 0;
   for (j = 1; j < ilsum_i[nsupers_i]; j++) {
     i += isize;
-    isize = asup_rowptr[j];  
+    isize = asup_rowptr[j];
     asup_rowptr[j] = i;
   }
   asup_rowptr[ilsum_i[nsupers_i]] = i + isize;
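The pointer bookkeeping in the surrounding hunks is the classic count / prefix-sum / scatter / reset sequence for turning (row, column, value) triplets into compressed storage; the hunk just below ("Reset the column pointers to the beginning of each column") is the final shift. A self-contained sketch of the same pattern for a single compressed-column structure, with illustrative names and plain double values:

/* Build column-compressed storage from nnz triplets (ia, ja, aij). */
static void coo_to_csc(long n, long nnz,
                       const long *ia, const long *ja, const double *aij,
                       long *colptr, long *rowind, double *val)
{
    for (long j = 0; j <= n; ++j) colptr[j] = 0;
    for (long k = 0; k < nnz; ++k) colptr[ja[k] + 1]++;      /* counts  */
    for (long j = 0; j < n; ++j) colptr[j + 1] += colptr[j]; /* prefix  */
    for (long k = 0; k < nnz; ++k) {                         /* scatter */
        long dst = colptr[ja[k]]++;
        rowind[dst] = ia[k];
        val[dst]    = aij[k];
    }
    for (long j = n; j > 0; --j) colptr[j] = colptr[j - 1];  /* reset   */
    colptr[0] = 0;
}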
@@ -1109,19 +1109,19 @@ zdist_A(SuperMatrix *A, ScalePermstruct_
   }
 
   /* Reset the column pointers to the beginning of each column */
-  for (j = ilsum_j[nsupers_j]; j > 0; j--) 
+  for (j = ilsum_j[nsupers_j]; j > 0; j--)
     ainf_colptr[j] = ainf_colptr[j-1];
-  for (j = ilsum_i[nsupers_i]; j > 0; j--) 
+  for (j = ilsum_i[nsupers_i]; j > 0; j--)
     asup_rowptr[j] = asup_rowptr[j-1];
   ainf_colptr[0] = 0;
   asup_rowptr[0] = 0;
-  
+
   SUPERLU_FREE(ia);
   SUPERLU_FREE(aij);
   memAux -= 2*szbuf*iword + szbuf*dword;
-  
+
   *p_ainf_colptr = ainf_colptr;
-  *p_ainf_rowind = ainf_rowind; 
+  *p_ainf_rowind = ainf_rowind;
   *p_ainf_val    = ainf_val;
   *p_asup_rowptr = asup_rowptr;
   *p_asup_colind = asup_colind;
@@ -1141,10 +1141,10 @@ zdist_A(SuperMatrix *A, ScalePermstruct_
  * Purpose
  * =======
  *   Distribute the input matrix onto the 2D process mesh.
- * 
+ *
  * Arguments
  * =========
- * 
+ *
  * fact (input) fact_t
  *        Specifies whether or not the L and U structures will be re-used.
  *        = SamePattern_SameRowPerm: L and U structures are input, and
@@ -1167,7 +1167,7 @@ zdist_A(SuperMatrix *A, ScalePermstruct_
  *
  * Glu_freeable (Input) *Glu_freeable_t
  *        The global structure describing the graph of L and U.
- * 
+ *
  * LUstruct (Input) LUstruct_t*
  *        Data structures for L and U factors.
  *
@@ -1186,22 +1186,22 @@ zdist_A(SuperMatrix *A, ScalePermstruct_
 float
 zdist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A,
 		ScalePermstruct_t *ScalePermstruct,
-		Pslu_freeable_t *Pslu_freeable, 
+		Pslu_freeable_t *Pslu_freeable,
 		LUstruct_t *LUstruct, gridinfo_t *grid)
 {
   Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
   Glu_freeable_t Glu_freeable_n;
   LocalLU_t *Llu = LUstruct->Llu;
-  int_t bnnz, fsupc, i, irow, istart, j, jb, ib, jj, k, k1, 
+  int_t bnnz, fsupc, i, irow, istart, j, jb, ib, jj, k, k1,
     len, len1, nsupc, nsupc_gb, ii, nprocs;
   int_t lib;  /* local block row number */
-  int_t nlb;  /* local block rows*/    
+  int_t nlb;  /* local block rows*/
   int_t ljb;  /* local block column number */
   int_t nrbl; /* number of L blocks in current block column */
   int_t nrbu; /* number of U blocks in current block column */
   int_t gb;   /* global block number; 0 < gb <= nsuper */
   int_t lb;   /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */
-  int_t ub,gik,iklrow,fnz;   
+  int_t ub,gik,iklrow,fnz;
   int iam, jbrow, jbcol, jcol, kcol, krow, mycol, myrow, pc, pr, ljb_i, ljb_j, p;
   int_t mybufmax[NBUFFERS];
   NRformat_loc *Astore;
@@ -1221,45 +1221,45 @@ zdist_psymbtonum(fact_t fact, int_t n, S
   int *ptrToRecv, *nnzToRecv, *ptrToSend, *nnzToSend;
   doublecomplex **Lnzval_bc_ptr;  /* size ceil(NSUPERS/Pc) */
   doublecomplex **Linv_bc_ptr;  /* size ceil(NSUPERS/Pc) */
-  doublecomplex **Uinv_bc_ptr;  /* size ceil(NSUPERS/Pc) */  
+  doublecomplex **Uinv_bc_ptr;  /* size ceil(NSUPERS/Pc) */
   int_t  **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */
-  int_t   **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc)                 */	 
-  int_t *index_srt;         /* indices consist of headers and row subscripts */	
-  doublecomplex *lusup_srt; /* nonzero values in L and U */    
+  int_t   **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc)                 */
+  int_t *index_srt;         /* indices consist of headers and row subscripts */
+  doublecomplex *lusup_srt; /* nonzero values in L and U */
   doublecomplex **Unzval_br_ptr;  /* size ceil(NSUPERS/Pr) */
   int_t  **Ufstnz_br_ptr;  /* size ceil(NSUPERS/Pr) */
   int_t  *Unnz;  /* size ceil(NSUPERS/Pc) */
-  
+
   BcTree  *LBtree_ptr;       /* size ceil(NSUPERS/Pc)                */
   RdTree  *LRtree_ptr;		  /* size ceil(NSUPERS/Pr)                */
   BcTree  *UBtree_ptr;       /* size ceil(NSUPERS/Pc)                */
-  RdTree  *URtree_ptr;		  /* size ceil(NSUPERS/Pr)                */	
+  RdTree  *URtree_ptr;		  /* size ceil(NSUPERS/Pr)                */
   int msgsize;
 
   int_t  *Urbs,*Urbs1; /* Number of row blocks in each block column of U. */
   Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
-  int_t  **Ucb_valptr;      /* Vertical linked list pointing to Unzval[] */  
- 
- 
+  int_t  **Ucb_valptr;      /* Vertical linked list pointing to Unzval[] */
+
+
   /*-- Counts to be used in factorization. --*/
   int  *ToRecv, *ToSendD, **ToSendR;
-  
+
   /*-- Counts to be used in lower triangular solve. --*/
   int_t  *fmod;          /* Modification count for L-solve.        */
   int_t  **fsendx_plist; /* Column process list to send down Xk.   */
   int_t  nfrecvx = 0;    /* Number of Xk I will receive.           */
   int_t  nfsendx = 0;    /* Number of Xk I will send               */
   int_t  kseen;
-  
+
   /*-- Counts to be used in upper triangular solve. --*/
   int_t  *bmod;          /* Modification count for U-solve.        */
   int_t  **bsendx_plist; /* Column process list to send down Xk.   */
   int_t  nbrecvx = 0;    /* Number of Xk I will receive.           */
-  int_t  nbsendx = 0;    /* Number of Xk I will send               */  
-  int_t  *ilsum;         /* starting position of each supernode in 
-			    the full array (local)                 */  
-  int_t  *ilsum_j, ldaspa_j; /* starting position of each supernode in 
-				the full array (local, block column wise) */  
+  int_t  nbsendx = 0;    /* Number of Xk I will send               */
+  int_t  *ilsum;         /* starting position of each supernode in
+			    the full array (local)                 */
+  int_t  *ilsum_j, ldaspa_j; /* starting position of each supernode in
+				the full array (local, block column wise) */
   /*-- Auxiliary arrays; freed on return --*/
   int_t *Urb_marker;  /* block hit marker; size ceil(NSUPERS/Pr)           */
   int_t *LUb_length; /* L,U block length; size nsupers_ij */
@@ -1281,31 +1281,31 @@ doublecomplex *dense, *dense_col; /* SPA
   int_t iword, dword;
   float mem_use = 0.0;
   int_t *mod_bit;
-  int_t *frecv, *brecv, *lloc; 
-  double *SeedSTD_BC,*SeedSTD_RD;				 
+  int_t *frecv, *brecv, *lloc;
+  double *SeedSTD_BC,*SeedSTD_RD;
   int_t idx_indx,idx_lusup;
   int_t nbrow;
   int_t  ik, il, lk, rel, knsupc, idx_r;
-  int_t  lptr1_tmp, idx_i, idx_v,m, uu;	
+  int_t  lptr1_tmp, idx_i, idx_v,m, uu;
   int_t	nub;
 
   float memStrLU, memA,
         memDist = 0.; /* memory used for redistributing the data, which does
 		         not include the memory for the numerical values
                          of L and U (positive number)*/
-  float  memNLU = 0.; /* memory allocated for storing the numerical values of 
+  float  memNLU = 0.; /* memory allocated for storing the numerical values of
 		         L and U, that will be used in the numeric
                          factorization (positive number) */
-  float  memTRS = 0.; /* memory allocated for storing the meta-data for triangular solve (positive number)*/		
-  
+  float  memTRS = 0.; /* memory allocated for storing the meta-data for triangular solve (positive number)*/
+
 #if ( PRNTlevel>=1 )
   int_t nLblocks = 0, nUblocks = 0;
 #endif
-#if ( PROFlevel>=1 ) 
+#if ( PROFlevel>=1 )
 	double t, t_u, t_l;
 	int_t u_blks;
 #endif
-  
+
   /* Initialization. */
   iam = grid->iam;
 #if ( DEBUGlevel>=1 )
@@ -1316,27 +1316,27 @@ doublecomplex *dense, *dense_col; /* SPA
   nprocs = grid->npcol * grid->nprow;
   for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0;
   Astore   = (NRformat_loc *) A->Store;
-  
+
   iword = sizeof(int_t);
   dword = sizeof(doublecomplex);
 
   if (fact == SamePattern_SameRowPerm) {
-    ABORT ("ERROR: call of dist_psymbtonum with fact equals SamePattern_SameRowPerm.");  
+    ABORT ("ERROR: call of dist_psymbtonum with fact equals SamePattern_SameRowPerm.");
   }
 
-  if ((memStrLU = 
-       dist_symbLU (n, Pslu_freeable, 
+  if ((memStrLU =
+       dist_symbLU (n, Pslu_freeable,
 		    Glu_persist, &xlsub, &lsub, &xusub, &usub,	grid)) > 0)
     return (memStrLU);
   memDist += (-memStrLU);
   xsup  = Glu_persist->xsup;    /* supernode and column mapping */
-  supno = Glu_persist->supno;   
+  supno = Glu_persist->supno;
   nsupers  = supno[n-1] + 1;
   nsupers_i = CEILING( nsupers, grid->nprow );/* No of local row blocks */
   nsupers_j = CEILING( nsupers, grid->npcol );/* No of local column blocks */
   nsupers_ij = SUPERLU_MAX(nsupers_i, nsupers_j);
   if ( !(ilsum = intMalloc_dist(nsupers_i+1)) ) {
-    fprintf (stderr, "Malloc fails for ilsum[].");  
+    fprintf (stderr, "Malloc fails for ilsum[].");
     return (memDist + memNLU + memTRS);
   }
   memNLU += (nsupers_i+1) * iword;
@@ -1349,7 +1349,7 @@ doublecomplex *dense, *dense_col; /* SPA
   /* Compute ldaspa and ilsum[], ldaspa_j and ilsum_j[]. */
   ilsum[0] = 0;
   ldaspa = 0;
-  for (gb = 0; gb < nsupers; gb++) 
+  for (gb = 0; gb < nsupers; gb++)
     if ( myrow == PROW( gb, grid ) ) {
       i = SuperSize( gb );
       ldaspa += i;
@@ -1358,8 +1358,8 @@ doublecomplex *dense, *dense_col; /* SPA
     }
   ilsum[nsupers_i] = ldaspa;
 
-  ldaspa_j = 0; ilsum_j[0] = 0;  
-  for (gb = 0; gb < nsupers; gb++) 
+  ldaspa_j = 0; ilsum_j[0] = 0;
+  for (gb = 0; gb < nsupers; gb++)
     if (mycol == PCOL( gb, grid )) {
       i = SuperSize( gb );
       ldaspa_j += i;
@@ -1367,7 +1367,7 @@ doublecomplex *dense, *dense_col; /* SPA
       ilsum_j[lb + 1] = ilsum_j[lb] + i;
     }
   ilsum_j[nsupers_j] = ldaspa_j;
-  
+
   if ((memA = zdist_A(A, ScalePermstruct, Glu_persist,
 		      grid, &ainf_colptr, &ainf_rowind, &ainf_val,
 		      &asup_rowptr, &asup_colind, &asup_val,
@@ -1378,7 +1378,7 @@ doublecomplex *dense, *dense_col; /* SPA
   /* ------------------------------------------------------------
      FIRST TIME CREATING THE L AND U DATA STRUCTURES.
      ------------------------------------------------------------*/
-  
+
   /* We first need to set up the L and U data structures and then
    * propagate the values of A into them.
    */
@@ -1388,7 +1388,7 @@ doublecomplex *dense, *dense_col; /* SPA
   }
   for (i = 0; i < nsupers; ++i) ToRecv[i] = 0;
   memNLU += nsupers * iword;
-  
+
   k = CEILING( nsupers, grid->npcol ); /* Number of local column blocks */
   if ( !(ToSendR = (int **) SUPERLU_MALLOC(k*sizeof(int*))) ) {
     fprintf(stderr, "Malloc fails for ToSendR[].");
@@ -1401,10 +1401,10 @@ doublecomplex *dense, *dense_col; /* SPA
     return (memDist + memNLU + memTRS);
   }
   memNLU += j*iword;
-  
+
   for (i = 0; i < j; ++i) index1[i] = EMPTY;
   for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index1[j];
-  
+
   /* Auxiliary arrays used to set up L and U block data structures.
      They are freed on return. */
   if ( !(LUb_length = intCalloc_dist(nsupers_ij)) ) {
@@ -1418,16 +1418,16 @@ doublecomplex *dense, *dense_col; /* SPA
   if ( !(LUb_number = intCalloc_dist(nsupers_ij)) ) {
     fprintf(stderr, "Calloc fails for LUb_number[].");
     return (memDist + memNLU + memTRS);
-  }    
+  }
   if ( !(LUb_valptr = intCalloc_dist(nsupers_ij)) ) {
     fprintf(stderr, "Calloc fails for LUb_valptr[].");
     return (memDist + memNLU + memTRS);
   }
   memDist += 4 * nsupers_ij * iword;
-  
-  k = CEILING( nsupers, grid->nprow ); 
+
+  k = CEILING( nsupers, grid->nprow );
   /* Pointers to the beginning of each block row of U. */
-  if ( !(Unzval_br_ptr = 
+  if ( !(Unzval_br_ptr =
 	 (doublecomplex**)SUPERLU_MALLOC(nsupers_i * sizeof(doublecomplex*))) ) {
     fprintf(stderr, "Malloc fails for Unzval_br_ptr[].");
     return (memDist + memNLU + memTRS);
@@ -1446,7 +1446,7 @@ doublecomplex *dense, *dense_col; /* SPA
   }
   for (i = 0; i < nsupers_i; ++i) ToSendD[i] = NO;
 
-  memNLU += nsupers_i*iword;  
+  memNLU += nsupers_i*iword;
   if ( !(Urb_marker = intCalloc_dist(nsupers_j))) {
     fprintf(stderr, "Calloc fails for rb_marker[].");
     return (memDist + memNLU + memTRS);
@@ -1456,11 +1456,11 @@ doublecomplex *dense, *dense_col; /* SPA
     return (memDist + memNLU + memTRS);
   }
   memDist += (nsupers_i + nsupers_j)*iword;
-  
+
   /* Auxiliary arrays used to set up L, U block data structures.
      They are freed on return.
      k is the number of local row blocks.   */
-  if ( !(dense = doublecomplexCalloc_dist(SUPERLU_MAX(ldaspa, ldaspa_j) 
+  if ( !(dense = doublecomplexCalloc_dist(SUPERLU_MAX(ldaspa, ldaspa_j)
 				   * sp_ienv_dist(3))) ) {
     fprintf(stderr, "Calloc fails for SPA dense[].");
     return (memDist + memNLU + memTRS);
@@ -1475,11 +1475,11 @@ doublecomplex *dense, *dense_col; /* SPA
     return (memDist + memNLU + memTRS);
   }
   /* ------------------------------------------------ */
-  memNLU += 2*nsupers_i*iword + 
-    SUPERLU_MAX(ldaspa, ldaspa_j)*sp_ienv_dist(3)*dword; 
-  
+  memNLU += 2*nsupers_i*iword +
+    SUPERLU_MAX(ldaspa, ldaspa_j)*sp_ienv_dist(3)*dword;
+
   /* Pointers to the beginning of each block column of L. */
-  if ( !(Lnzval_bc_ptr = 
+  if ( !(Lnzval_bc_ptr =
 	 (doublecomplex**)SUPERLU_MALLOC(nsupers_j * sizeof(doublecomplex*))) ) {
     fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[].");
     return (memDist + memNLU + memTRS);
@@ -1488,35 +1488,35 @@ doublecomplex *dense, *dense_col; /* SPA
     fprintf(stderr, "Malloc fails for Lrowind_bc_ptr[].");
     return (memDist + memNLU + memTRS);
   }
- 
-  if ( !(Linv_bc_ptr = 
+
+  if ( !(Linv_bc_ptr =
 			(doublecomplex**)SUPERLU_MALLOC(nsupers_j * sizeof(doublecomplex*))) ) {
 	fprintf(stderr, "Malloc fails for Linv_bc_ptr[].");
 	return (memDist + memNLU + memTRS);
-  }  
-  if ( !(Uinv_bc_ptr = 
+  }
+  if ( !(Uinv_bc_ptr =
 			(doublecomplex**)SUPERLU_MALLOC(nsupers_j * sizeof(doublecomplex*))) ) {
 	fprintf(stderr, "Malloc fails for Uinv_bc_ptr[].");
 	return (memDist + memNLU + memTRS);
-  }   
+  }
   if ( !(Lindval_loc_bc_ptr = (int_t**)SUPERLU_MALLOC(nsupers_j * sizeof(int_t*))) ){
     fprintf(stderr, "Malloc fails for Lindval_loc_bc_ptr[].");
     return (memDist + memNLU + memTRS);
-  }  
-  
+  }
+
   if ( !(Unnz = (int_t*)SUPERLU_MALLOC(nsupers_j * sizeof(int_t))) ){
     fprintf(stderr, "Malloc fails for Unnz[].");
     return (memDist + memNLU + memTRS);
-  }    
-  memTRS += nsupers_j*sizeof(int_t*) + 2.0*nsupers_j*sizeof(double*) + nsupers_j*iword;  //acount for Lindval_loc_bc_ptr, Unnz, Linv_bc_ptr,Uinv_bc_ptr    
-  
+  }
+  memTRS += nsupers_j*sizeof(int_t*) + 2.0*nsupers_j*sizeof(double*) + nsupers_j*iword;  //acount for Lindval_loc_bc_ptr, Unnz, Linv_bc_ptr,Uinv_bc_ptr
+
   memNLU += nsupers_j * sizeof(double*) + nsupers_j * sizeof(int_t*)+ nsupers_j * sizeof(int_t*);
   Lnzval_bc_ptr[nsupers_j-1] = NULL;
   Lrowind_bc_ptr[nsupers_j-1] = NULL;
   Linv_bc_ptr[nsupers_j-1] = NULL;
   Uinv_bc_ptr[nsupers_j-1] = NULL;
-  Lindval_loc_bc_ptr[nsupers_j-1] = NULL;  
-  
+  Lindval_loc_bc_ptr[nsupers_j-1] = NULL;
+
   /* These lists of processes will be used for triangular solves. */
   if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(nsupers_j*sizeof(int_t*))) ) {
     fprintf(stderr, "Malloc fails for fsendx_plist[].");
@@ -1543,7 +1543,7 @@ doublecomplex *dense, *dense_col; /* SPA
     bsendx_plist[i] = &index[j];
   /* -------------------------------------------------------------- */
   memNLU += 2*nsupers_j*sizeof(int_t*) + 2*len*iword;
-  
+
   /*------------------------------------------------------------
     PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS.
     THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U.
@@ -1555,12 +1555,12 @@ doublecomplex *dense, *dense_col; /* SPA
     ljb_i = LBi( jb, grid);  /* Local block number row wise */
     fsupc = FstBlockC( jb );
     nsupc = SuperSize( jb );
-    
+
     if ( myrow == jbrow ) { /* Block row jb in my process row */
       /* Scatter A into SPA. */
       for (j = ilsum[ljb_i], dense_col = dense; j < ilsum[ljb_i]+nsupc; j++) {
 	for (i = asup_rowptr[j]; i < asup_rowptr[j+1]; i++) {
-	  if (i >= asup_rowptr[ilsum[nsupers_i]]) 
+	  if (i >= asup_rowptr[ilsum[nsupers_i]])
 	    printf ("ERR7\n");
 	  jcol = asup_colind[i];
 	  if (jcol >= n)
@@ -1577,7 +1577,7 @@ doublecomplex *dense, *dense_col; /* SPA
 	}
 	dense_col += ldaspa_j;
       }
-      
+
       /*------------------------------------------------
        * SET UP U BLOCKS.
        *------------------------------------------------*/
@@ -1589,18 +1589,18 @@ doublecomplex *dense, *dense_col; /* SPA
 	if (i >= xusub[nsupers_i]) printf ("ERR10\n");
 	jcol = usub[i];
 	gb = BlockNum( jcol ); /* Global block number */
-	
+
 	/*if (fsupc <= 146445 && 146445 < fsupc + nsupc && jcol == 397986)
 	  printf ("Pe[%d] [%d %d] elt [%d] jbcol %d pc %d\n",
 	  iam, jb, gb, jcol, jbcol, pc); */
-	
+
 	lb = LBj( gb, grid );  /* Local block number */
 	pc = PCOL( gb, grid ); /* Process col owning this block */
 	if (mycol == jbcol) ToSendR[ljb_j][pc] = YES;
 	/* if (mycol == jbcol && mycol != pc) ToSendR[ljb_j][pc] = YES; */
 	pr = PROW( gb, grid );
 	if ( pr != jbrow  && mycol == pc)
-	  bsendx_plist[lb][jbrow] = YES; 
+	  bsendx_plist[lb][jbrow] = YES;
 	if (mycol == pc) {
 	  len += nsupc;
 	  LUb_length[lb] += nsupc;
@@ -1622,8 +1622,8 @@ doublecomplex *dense, *dense_col; /* SPA
 	  }
 	}
       } /* for i ... */
-      
-      if ( nrbu ) { 
+
+      if ( nrbu ) {
 	/* Sort the blocks of U in increasing block column index.
 	   SuperLU_DIST assumes this is true */
 	/* simple insert sort algorithm */
@@ -1634,8 +1634,8 @@ doublecomplex *dense, *dense_col; /* SPA
 	    LUb_number[i+1] = LUb_number[i];
 	  }
 	  LUb_number[i+1] = k;
-	} 
-	
+	}
+
 	/* Set up the initial pointers for each block in
 	   index[] and nzval[]. */
 	/* Add room for descriptors */
@@ -1678,17 +1678,17 @@ doublecomplex *dense, *dense_col; /* SPA
 	for (i = xusub[ljb_i]; i < xusub[ljb_i+1]; i++) {
 	  jcol = usub[i];
 	  gb = BlockNum( jcol );
-	  
+
 	  if ( mycol == PCOL( gb, grid ) ) {
 	    lb = LBj( gb, grid );
 	    k = LUb_indptr[lb]; /* Start fstnz in index */
 	    index[k + jcol - FstBlockC( gb )] = FstBlockC( jb );
 	  }
 	}  /* for i ... */
-	
+
 	for (i = 0; i < nrbu; i++) {
 	  gb = LUb_number[i];
-	  lb = LBj( gb, grid );   
+	  lb = LBj( gb, grid );
 	  next_ind = LUb_indptr[lb];
 	  k = FstBlockC( jb + 1);
 	  jcol = ilsum_j[lb];
@@ -1698,16 +1698,16 @@ doublecomplex *dense, *dense_col; /* SPA
 	    for (ii = j; ii < k; ii++) {
 	      uval[LUb_valptr[lb]++] = dense_col[jcol];
 	      dense_col[jcol] = zero;
-	      dense_col += ldaspa_j;	      
+	      dense_col += ldaspa_j;
 	    }
 	  }
 	}
       } else {
 	Ufstnz_br_ptr[ljb_i] = NULL;
 	Unzval_br_ptr[ljb_i] = NULL;
-      } /* if nrbu ... */	
+      } /* if nrbu ... */
     } /* if myrow == jbrow */
-    
+
       /*------------------------------------------------
        * SET UP L BLOCKS.
        *------------------------------------------------*/
@@ -1727,8 +1727,8 @@ doublecomplex *dense, *dense_col; /* SPA
 	  }
 	}
 	dense_col += ldaspa;
-      }      
-      
+      }
+
       /* sort the indices of the diagonal block at the beginning of xlsub */
       if (myrow == jbrow) {
 	k = xlsub[ljb_j];
@@ -1741,14 +1741,14 @@ doublecomplex *dense, *dense_col; /* SPA
 	  }
 	}
       }
-      
+
       /* Count number of blocks and length of each block. */
       nrbl = 0;
       len = 0; /* Number of row subscripts I own. */
       kseen = 0;
       for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) {
 	irow = lsub[i];
-	gb = BlockNum( irow ); /* Global block number */	  
+	gb = BlockNum( irow ); /* Global block number */
 	pr = PROW( gb, grid ); /* Process row owning this block */
 	if ( pr != jbrow && fsendx_plist[ljb_j][pr] == EMPTY &&
 	     myrow == jbrow) {
@@ -1770,14 +1770,14 @@ doublecomplex *dense, *dense_col; /* SPA
 #if ( PRNTlevel>=1 )
 	    ++nLblocks;
 #endif
-	  } else 
-	    ++LUb_length[lb];	    
+	  } else
+	    ++LUb_length[lb];
 	  ++len;
 	}
       } /* for i ... */
-      
+
       if ( nrbl ) { /* Do not ensure the blocks are sorted! */
-	/* Set up the initial pointers for each block in 
+	/* Set up the initial pointers for each block in
 	   index[] and nzval[]. */
 	/* If I am the owner of the diagonal block, order it first in LUb_number.
 	   Necessary for SuperLU_DIST routines */
@@ -1790,7 +1790,7 @@ doublecomplex *dense, *dense_col; /* SPA
 	  LUb_number[kseen] = LUb_number[0];
 	  LUb_number[0] = jb;
 	}
-	
+
 	/* Add room for descriptors */
 	len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
 	if ( !(index = intMalloc_dist(len1)) ) {
@@ -1798,23 +1798,23 @@ doublecomplex *dense, *dense_col; /* SPA
 	  return (memDist + memNLU + memTRS);
 	}
 	Lrowind_bc_ptr[ljb_j] = index;
-	if (!(Lnzval_bc_ptr[ljb_j] = 
+	if (!(Lnzval_bc_ptr[ljb_j] =
 	      doublecomplexMalloc_dist(len*nsupc))) {
 	  fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[*][] col block " IFMT, jb);
 	  return (memDist + memNLU + memTRS);
 	}
-	
+
 	if (!(Linv_bc_ptr[ljb_j] = (doublecomplex*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(doublecomplex))))
 		ABORT("Malloc fails for Linv_bc_ptr[ljb_j][]");
 	if (!(Uinv_bc_ptr[ljb_j] = (doublecomplex*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(doublecomplex))))
-		ABORT("Malloc fails for Uinv_bc_ptr[ljb_j][]");	
-	
+		ABORT("Malloc fails for Uinv_bc_ptr[ljb_j][]");
+
 	memNLU += len1*iword + len*nsupc*dword;
 
-	if ( !(Lindval_loc_bc_ptr[ljb_j] = intCalloc_dist(nrbl*3))) 
+	if ( !(Lindval_loc_bc_ptr[ljb_j] = intCalloc_dist(nrbl*3)))
 		ABORT("Malloc fails for Lindval_loc_bc_ptr[ljb_j][]");
-	memTRS += nrbl*3.0*iword + 2.0*nsupc*nsupc*dword;  //acount for Lindval_loc_bc_ptr[ljb],Linv_bc_ptr[ljb],Uinv_bc_ptr[ljb]	
-	
+	memTRS += nrbl*3.0*iword + 2.0*nsupc*nsupc*dword;  //acount for Lindval_loc_bc_ptr[ljb],Linv_bc_ptr[ljb],Uinv_bc_ptr[ljb]
+
 	lusup = Lnzval_bc_ptr[ljb_j];
 	mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 );
 	mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc );
@@ -1827,14 +1827,14 @@ doublecomplex *dense, *dense_col; /* SPA
 	  gb = LUb_number[k];
 	  lb = LBi( gb, grid );
 	  len = LUb_length[lb];
-	  
+
 	  Lindval_loc_bc_ptr[ljb_j][k] = lb;
 	  Lindval_loc_bc_ptr[ljb_j][k+nrbl] = next_ind;
-	  Lindval_loc_bc_ptr[ljb_j][k+nrbl*2] = next_val;			  
-	 	  
+	  Lindval_loc_bc_ptr[ljb_j][k+nrbl*2] = next_val;
+
 	  LUb_length[lb] = 0;
 	  index[next_ind++] = gb; /* Descriptor */
-	  index[next_ind++] = len; 
+	  index[next_ind++] = len;
 	  LUb_indptr[lb] = next_ind;
 	    LUb_valptr[lb] = next_val;
 	    next_ind += len;
@@ -1860,8 +1860,8 @@ doublecomplex *dense, *dense_col; /* SPA
 	      }
 	    }
 	  } /* for i ... */
-	  
-	  
+
+
 
 		/* sort Lindval_loc_bc_ptr[ljb_j], Lrowind_bc_ptr[ljb_j] and Lnzval_bc_ptr[ljb_j] here*/
 		if(nrbl>1){
@@ -1870,18 +1870,18 @@ doublecomplex *dense, *dense_col; /* SPA
 				uu=nrbl-2;
 				lloc = &Lindval_loc_bc_ptr[ljb_j][1];
 			}else{
-				uu=nrbl-1;	
+				uu=nrbl-1;
 				lloc = Lindval_loc_bc_ptr[ljb_j];
-			}	
-			quickSortM(lloc,0,uu,nrbl,0,3);	
+			}
+			quickSortM(lloc,0,uu,nrbl,0,3);
 		}
 
 
-		if ( !(index_srt = intMalloc_dist(len1)) ) 
-			ABORT("Malloc fails for index_srt[]");				
+		if ( !(index_srt = intMalloc_dist(len1)) )
+			ABORT("Malloc fails for index_srt[]");
 		if (!(lusup_srt = (doublecomplex*)SUPERLU_MALLOC(len*nsupc * sizeof(doublecomplex))))
 			ABORT("Malloc fails for lusup_srt[]");
-				
+
 		idx_indx = BC_HEADER;
 		idx_lusup = 0;
 		for (jj=0;jj<BC_HEADER;jj++)
@@ -1893,33 +1893,33 @@ doublecomplex *dense, *dense_col; /* SPA
 				index_srt[idx_indx++] = index[Lindval_loc_bc_ptr[ljb_j][i+nrbl]+jj];
 			}
 
-			Lindval_loc_bc_ptr[ljb_j][i+nrbl] = idx_indx - LB_DESCRIPTOR - nbrow; 
+			Lindval_loc_bc_ptr[ljb_j][i+nrbl] = idx_indx - LB_DESCRIPTOR - nbrow;
 
 			for (jj=0;jj<nbrow;jj++){
 				k=idx_lusup;
 				k1=Lindval_loc_bc_ptr[ljb_j][i+nrbl*2]+jj;
-				for (j = 0; j < nsupc; ++j) {				
+				for (j = 0; j < nsupc; ++j) {
 					lusup_srt[k] = lusup[k1];
 					k += len;
 					k1 += len;
-				}	
+				}
 				idx_lusup++;
-			}				
-			Lindval_loc_bc_ptr[ljb_j][i+nrbl*2] = idx_lusup - nbrow;	
+			}
+			Lindval_loc_bc_ptr[ljb_j][i+nrbl*2] = idx_lusup - nbrow;
 		}
 
 		SUPERLU_FREE(lusup);
 		SUPERLU_FREE(index);
 
 		Lrowind_bc_ptr[ljb_j] = index_srt;
-		Lnzval_bc_ptr[ljb_j] = lusup_srt; 			
+		Lnzval_bc_ptr[ljb_j] = lusup_srt;
 	} else {
 	  Lrowind_bc_ptr[ljb_j] = NULL;
 	  Lnzval_bc_ptr[ljb_j] = NULL;
 	  Linv_bc_ptr[ljb_j] = NULL;
 	  Uinv_bc_ptr[ljb_j] = NULL;
-	  Lindval_loc_bc_ptr[ljb_j] = NULL;	  
-	} /* if nrbl ... */		  
+	  Lindval_loc_bc_ptr[ljb_j] = NULL;
+	} /* if nrbl ... */
       } /* if mycol == pc */
   } /* for jb ... */
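
The loop closed above ("PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS") is built around a sparse-accumulator (SPA) idiom: each supernode column of A is scattered into the dense work column dense_col, and the packed U (uval) and L (lusup) value arrays are then gathered from it, with each entry zeroed as it is consumed so the accumulator is clean for the next column. A minimal single-process sketch of that idiom, using hypothetical names (rowind, val, spa, packed) rather than the library's data structures:

    #include <stdio.h>

    int main(void)
    {
        /* Hypothetical sparse column: row indices and values. */
        int    rowind[] = { 1, 4, 7 };
        double val[]    = { 1.5, -2.0, 3.25 };
        int    nnz = 3, nrows = 8;

        double spa[8] = { 0.0 };   /* dense sparse accumulator        */
        double packed[8];          /* packed destination (L/U values) */
        int    npacked = 0;

        for (int i = 0; i < nnz; ++i)          /* scatter */
            spa[rowind[i]] = val[i];

        /* Gather and reset.  Here we simply scan the column; the routine
         * above instead follows the symbolic structure (lsub/usub).      */
        for (int r = 0; r < nrows; ++r) {
            if (spa[r] != 0.0) {
                packed[npacked++] = spa[r];
                spa[r] = 0.0;                  /* accumulator is clean again */
            }
        }

        for (int i = 0; i < npacked; ++i)
            printf("packed[%d] = %g\n", i, packed[i]);
        return 0;
    }
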
 
@@ -1931,7 +1931,7 @@ doublecomplex *dense, *dense_col; /* SPA
   SUPERLU_FREE(LUb_valptr);
   SUPERLU_FREE(Lrb_marker);
   SUPERLU_FREE(dense);
-  
+
   /* Free the memory used for storing A */
   SUPERLU_FREE(ainf_colptr);
   if (ainf_rowind != NULL) {
@@ -1940,10 +1940,10 @@ doublecomplex *dense, *dense_col; /* SPA
   }
   SUPERLU_FREE(asup_rowptr);
   if (asup_colind != NULL) {
-    SUPERLU_FREE(asup_colind);	
-    SUPERLU_FREE(asup_val);	
+    SUPERLU_FREE(asup_colind);
+    SUPERLU_FREE(asup_val);
   }
-  
+
   /* exchange information about bsendx_plist in between column of processors */
   k = SUPERLU_MAX( grid->nprow, grid->npcol);
   if ( !(recvBuf = (int_t *) SUPERLU_MALLOC(nsupers*k*iword)) ) {
@@ -1966,19 +1966,19 @@ doublecomplex *dense, *dense_col; /* SPA
     fprintf (stderr, "Malloc fails for ptrToRecv[].");
     return (memDist + memNLU + memTRS);
   }
-  
+
   if (memDist < (nsupers*k*iword +4*nprocs * sizeof(int)))
     memDist = nsupers*k*iword +4*nprocs * sizeof(int);
-  
+
   for (p = 0; p < nprocs; p++)
     nnzToRecv[p] = 0;
-  
+
   for (jb = 0; jb < nsupers; jb++) {
     jbcol = PCOL( jb, grid );
     jbrow = PROW( jb, grid );
     p = PNUM(jbrow, jbcol, grid);
     nnzToRecv[p] += grid->npcol;
-  }    
+  }
   i = 0;
   for (p = 0; p < nprocs; p++) {
     ptrToRecv[p] = i;
@@ -1996,21 +1996,21 @@ doublecomplex *dense, *dense_col; /* SPA
     jbrow = PROW( jb, grid );
     p = PNUM(jbrow, jbcol, grid);
     if (p == iam) {
-      ljb_j = LBj( jb, grid ); /* Local block number column wise */	
+      ljb_j = LBj( jb, grid ); /* Local block number column wise */
       for (j = 0; j < grid->npcol; j++, i++)
 	recvBuf[i] = ToSendR[ljb_j][j];
     }
-  }   
-  
+  }
+
   MPI_Alltoallv (&(recvBuf[ptrToRecv[iam]]), nnzToSend, ptrToSend, mpi_int_t,
 		 recvBuf, nnzToRecv, ptrToRecv, mpi_int_t, grid->comm);
-  
+
   for (jb = 0; jb < nsupers; jb++) {
     jbcol = PCOL( jb, grid );
     jbrow = PROW( jb, grid );
     p = PNUM(jbrow, jbcol, grid);
-    ljb_j = LBj( jb, grid ); /* Local block number column wise */	
-    ljb_i = LBi( jb, grid ); /* Local block number row wise */	
+    ljb_j = LBj( jb, grid ); /* Local block number column wise */
+    ljb_i = LBi( jb, grid ); /* Local block number row wise */
     /* (myrow == jbrow) {
        if (ToSendD[ljb_i] == YES)
        ToRecv[jb] = 1;
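
In the hunk above, the per-block ToSendR flags are redistributed with MPI_Alltoallv, and the receive layout (nnzToRecv, ptrToRecv) is derived locally from the block-to-process mapping (PNUM) rather than negotiated with a preliminary size exchange. A self-contained sketch of that pattern, with a hypothetical payload and plain int/MPI_INT in place of int_t/mpi_int_t:

    #include <mpi.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(int argc, char **argv)
    {
        int rank, nprocs;
        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

        /* Counts are a pure function of the rank numbers, so sender and
         * receiver build identical layouts without exchanging sizes:
         * rank src sends (src + 1) integers to every destination.       */
        int *scounts = malloc(nprocs * sizeof(int));
        int *sdispls = malloc(nprocs * sizeof(int));
        int *rcounts = malloc(nprocs * sizeof(int));
        int *rdispls = malloc(nprocs * sizeof(int));

        int stot = 0, rtot = 0;
        for (int p = 0; p < nprocs; ++p) {
            scounts[p] = rank + 1;              /* what I send to p   */
            rcounts[p] = p + 1;                 /* what p sends to me */
            sdispls[p] = stot;  stot += scounts[p];
            rdispls[p] = rtot;  rtot += rcounts[p];
        }

        int *sbuf = malloc(stot * sizeof(int));
        int *rbuf = malloc(rtot * sizeof(int));
        for (int i = 0; i < stot; ++i) sbuf[i] = rank;   /* payload: my rank */

        MPI_Alltoallv(sbuf, scounts, sdispls, MPI_INT,
                      rbuf, rcounts, rdispls, MPI_INT, MPI_COMM_WORLD);

        printf("rank %d received %d integers\n", rank, rtot);

        free(scounts); free(sdispls); free(rcounts); free(rdispls);
        free(sbuf); free(rbuf);
        MPI_Finalize();
        return 0;
    }
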
@@ -2026,22 +2026,22 @@ doublecomplex *dense, *dense_col; /* SPA
 	ToRecv[jb] = 2;
     }
     if (mycol == jbcol) {
-      for (i = 0, j = ptrToRecv[p]; i < grid->npcol; i++, j++) 
-	ToSendR[ljb_j][i] = recvBuf[j];  
+      for (i = 0, j = ptrToRecv[p]; i < grid->npcol; i++, j++)
+	ToSendR[ljb_j][i] = recvBuf[j];
       ToSendR[ljb_j][mycol] = EMPTY;
     }
     ptrToRecv[p] += grid->npcol;
-  }   
-  
+  }
+
   /* exchange information about bsendx_plist in between column of processors */
   MPI_Allreduce ((*bsendx_plist), recvBuf, nsupers_j * grid->nprow, mpi_int_t,
 		 MPI_MAX, grid->cscp.comm);
-  
+
   for (jb = 0; jb < nsupers; jb ++) {
     jbcol = PCOL( jb, grid);
     jbrow = PROW( jb, grid);
     if (mycol == jbcol) {
-      ljb_j = LBj( jb, grid ); /* Local block number column wise */	
+      ljb_j = LBj( jb, grid ); /* Local block number column wise */
       if (myrow == jbrow ) {
 	for (k = ljb_j * grid->nprow; k < (ljb_j+1) * grid->nprow; k++) {
 	  (*bsendx_plist)[k] = recvBuf[k];
@@ -2050,14 +2050,14 @@ doublecomplex *dense, *dense_col; /* SPA
 	}
       }
       else {
-	for (k = ljb_j * grid->nprow; k < (ljb_j+1) * grid->nprow; k++) 
+	for (k = ljb_j * grid->nprow; k < (ljb_j+1) * grid->nprow; k++)
 	  (*bsendx_plist)[k] = EMPTY;
       }
     }
   }
 
 		/////////////////////////////////////////////////////////////////
-		
+
 		/* Set up additional pointers for the index and value arrays of U.
 		   nub is the number of local block columns. */
 		nub = CEILING( nsupers, grid->npcol); /* Number of local block columns. */
@@ -2071,7 +2071,7 @@ doublecomplex *dense, *dense_col; /* SPA
 			ABORT("Malloc fails for Ucb_valptr[]");
 		nlb = CEILING( nsupers, grid->nprow ); /* Number of local block rows. */
 
-		/* Count number of row blocks in a block column. 
+		/* Count number of row blocks in a block column.
 		   One pass of the skeleton graph of U. */
 		for (lk = 0; lk < nlb; ++lk) {
 			usub1 = Ufstnz_br_ptr[lk];
@@ -2110,21 +2110,21 @@ doublecomplex *dense, *dense_col; /* SPA
 
 					Ucb_indptr[ljb][Urbs1[ljb]].indpos = i;
 					Ucb_valptr[ljb][Urbs1[ljb]] = j;
-					
+
 					++Urbs1[ljb];
 					j += usub1[i+1];
 					i += UB_DESCRIPTOR + SuperSize( k );
 				}
 			}
-		}			
-		
-		
+		}
+
 
-/* Count the nnzs per block column */	
+
+/* Count the nnzs per block column */
 	for (lb = 0; lb < nub; ++lb) {
 		Unnz[lb] = 0;
 		k = lb * grid->npcol + mycol;/* Global block number, column-wise. */
-		knsupc = SuperSize( k );	
+		knsupc = SuperSize( k );
 		for (ub = 0; ub < Urbs[lb]; ++ub) {
 			ik = Ucb_indptr[lb][ub].lbnum; /* Local block number, row-wise. */
 			i = Ucb_indptr[lb][ub].indpos; /* Start of the block in usub[]. */
@@ -2138,47 +2138,47 @@ doublecomplex *dense, *dense_col; /* SPA
 				}
 			} /* for jj ... */
 		}
-	}						
-		
+	}
+
 		/////////////////////////////////////////////////////////////////
 
 		// if(LSUM<nsupers)ABORT("Need increase LSUM."); /* temporary*/
 
 #if ( PROFlevel>=1 )
 			t = SuperLU_timer_();
-#endif				
+#endif
 		/* construct the Bcast tree for L ... */
 
 		k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
 		if ( !(LBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) )
 			ABORT("Malloc fails for LBtree_ptr[].");
 		if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) )
-			ABORT("Calloc fails for ActiveFlag[].");	
+			ABORT("Calloc fails for ActiveFlag[].");
 		if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) )
-			ABORT("Malloc fails for ranks[].");	
+			ABORT("Malloc fails for ranks[].");
 		if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) )
-			ABORT("Malloc fails for SeedSTD_BC[].");	
+			ABORT("Malloc fails for SeedSTD_BC[].");
 
 		for (i=0;i<k;i++){
-			SeedSTD_BC[i]=rand();		
+			SeedSTD_BC[i]=rand();
 		}
 
-		MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_BC[0],k,MPI_DOUBLE,MPI_MAX,grid->cscp.comm);					  
+		MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_BC[0],k,MPI_DOUBLE,MPI_MAX,grid->cscp.comm);
 
 		for (ljb = 0; ljb <k ; ++ljb) {
 			LBtree_ptr[ljb]=NULL;
-		}			
-		
+		}
+
 
 		if ( !(ActiveFlagAll = intMalloc_dist(grid->nprow*k)) )
-			ABORT("Calloc fails for ActiveFlag[].");				
-		for (j=0;j<grid->nprow*k;++j)ActiveFlagAll[j]=3*nsupers;	
-		memTRS += k*sizeof(BcTree) + k*dword + grid->nprow*k*iword;  //acount for LBtree_ptr, SeedSTD_BC, ActiveFlagAll		
+			ABORT("Calloc fails for ActiveFlag[].");
+		for (j=0;j<grid->nprow*k;++j)ActiveFlagAll[j]=3*nsupers;
+		memTRS += k*sizeof(BcTree) + k*dword + grid->nprow*k*iword;  //acount for LBtree_ptr, SeedSTD_BC, ActiveFlagAll
 		for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */
 			jb = mycol+ljb*grid->npcol;  /* not sure */
 			if(jb<nsupers){
 			pc = PCOL( jb, grid );
-			
+
 			istart = xlsub[ljb];
 			for (i = istart; i < xlsub[ljb+1]; ++i) {
 				irow = lsub[i];
@@ -2187,15 +2187,15 @@ doublecomplex *dense, *dense_col; /* SPA
 				ActiveFlagAll[pr+ljb*grid->nprow]=SUPERLU_MIN(ActiveFlagAll[pr+ljb*grid->nprow],gb);
 			} /* for j ... */
 			}
-		}			
+		}
+
+
+		MPI_Allreduce(MPI_IN_PLACE,ActiveFlagAll,grid->nprow*k,mpi_int_t,MPI_MIN,grid->cscp.comm);
+
+
 
-		
-		MPI_Allreduce(MPI_IN_PLACE,ActiveFlagAll,grid->nprow*k,mpi_int_t,MPI_MIN,grid->cscp.comm);					  
-		
-		
-		
 		for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */
-			
+
 			jb = mycol+ljb*grid->npcol;  /* not sure */
 			if(jb<nsupers){
 			pc = PCOL( jb, grid );
@@ -2204,19 +2204,19 @@ doublecomplex *dense, *dense_col; /* SPA
 			for (j=0;j<grid->nprow;++j)ActiveFlag[j+grid->nprow]=j;
 			for (j=0;j<grid->nprow;++j)ranks[j]=-1;
 
-			Root=-1; 
-			Iactive = 0;				
+			Root=-1;
+			Iactive = 0;
 			for (j=0;j<grid->nprow;++j){
 				if(ActiveFlag[j]!=3*nsupers){
 				gb = ActiveFlag[j];
 				pr = PROW( gb, grid );
 				if(gb==jb)Root=pr;
-				if(myrow==pr)Iactive=1;		
-				}					
+				if(myrow==pr)Iactive=1;
+				}
 			}
-			
 
-			quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,0,2);	
+
+			quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,0,2);
 
 			if(Iactive==1){
 				// printf("jb %5d damn\n",jb);
@@ -2229,7 +2229,7 @@ doublecomplex *dense, *dense_col; /* SPA
 						ranks[rank_cnt]=ActiveFlag[j+grid->nprow];
 						++rank_cnt;
 					}
-				}		
+				}
 
 				if(rank_cnt>1){
 
@@ -2239,7 +2239,7 @@ doublecomplex *dense, *dense_col; /* SPA
 					// rseed=rand();
 					// rseed=1.0;
 					msgsize = SuperSize( jb );
-					LBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'z');  	
+					LBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'z');
 					BcTree_SetTag(LBtree_ptr[ljb],BC_L,'z');
 
 					// printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt);
@@ -2250,15 +2250,15 @@ doublecomplex *dense, *dense_col; /* SPA
 					// fflush(stdout);
 					// }
 
-					// #if ( PRNTlevel>=1 )		
+					// #if ( PRNTlevel>=1 )
 					if(Root==myrow){
 						rank_cnt_ref=1;
 						for (j = 0; j < grid->nprow; ++j) {
-							if ( fsendx_plist[ljb][j] != EMPTY ) {	
-								++rank_cnt_ref;		
+							if ( fsendx_plist[ljb][j] != EMPTY ) {
+								++rank_cnt_ref;
 							}
 						}
-						assert(rank_cnt==rank_cnt_ref);		
+						assert(rank_cnt==rank_cnt_ref);
 
 						// printf("Partial Bcast Procs: col%7d np%4d\n",jb,rank_cnt);
 
@@ -2267,27 +2267,27 @@ doublecomplex *dense, *dense_col; /* SPA
 						// // printf("\n");
 					}
 					// #endif
-				}	
+				}
 			}
 			}
 		}
 
-		
+
 		SUPERLU_FREE(ActiveFlag);
 		SUPERLU_FREE(ActiveFlagAll);
 		SUPERLU_FREE(ranks);
 		SUPERLU_FREE(SeedSTD_BC);
-		memTRS -= k*dword + grid->nprow*k*iword;  //acount for SeedSTD_BC, ActiveFlagAll		
-		
+		memTRS -= k*dword + grid->nprow*k*iword;  //acount for SeedSTD_BC, ActiveFlagAll
+
 #if ( PROFlevel>=1 )
 	t = SuperLU_timer_() - t;
 	if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t);
-#endif			
+#endif
 
 
 #if ( PROFlevel>=1 )
 			t = SuperLU_timer_();
-#endif			
+#endif
 		/* construct the Reduce tree for L ... */
 		/* the following is used as reference */
 		nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */
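
The broadcast tree for L built above, and the reduce and broadcast trees for L and U built in the hunks below, all elect their participants the same way: each process fills a sentinel-initialized flag array (ActiveFlagAll, one slot per grid row or column and local block) with the extreme global block index it touches, an MPI_Allreduce over the process column or row merges the flags (MPI_MIN against a 3*nsupers sentinel in some of the trees, MPI_MAX against -3*nsupers in the others), and every slot that moved off the sentinel contributes a rank to the tree. A reduced sketch of that election, with a hypothetical SENTINEL and plain int/MPI_INT:

    #include <mpi.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define SENTINEL 1000000     /* plays the role of 3*nsupers */

    int main(int argc, char **argv)
    {
        int rank, nprocs;
        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

        /* One slot per rank; each rank fills only its own.  Even ranks
         * pretend to touch block index == rank, odd ranks touch nothing. */
        int *flag = malloc(nprocs * sizeof(int));
        for (int p = 0; p < nprocs; ++p) flag[p] = SENTINEL;
        if (rank % 2 == 0) flag[rank] = rank;

        /* Merge: afterwards every rank holds the same flag vector. */
        MPI_Allreduce(MPI_IN_PLACE, flag, nprocs, MPI_INT, MPI_MIN,
                      MPI_COMM_WORLD);

        /* Slots that moved off the sentinel identify the participants. */
        int nranks = 0;
        for (int p = 0; p < nprocs; ++p)
            if (flag[p] != SENTINEL) ++nranks;

        if (rank == 0)
            printf("tree participants: %d of %d ranks\n", nranks, nprocs);

        free(flag);
        MPI_Finalize();
        return 0;
    }
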
@@ -2316,37 +2316,37 @@ doublecomplex *dense, *dense_col; /* SPA
 		if ( !(LRtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) )
 			ABORT("Malloc fails for LRtree_ptr[].");
 		if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) )
-			ABORT("Calloc fails for ActiveFlag[].");	
+			ABORT("Calloc fails for ActiveFlag[].");
 		if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) )
-			ABORT("Malloc fails for ranks[].");	
+			ABORT("Malloc fails for ranks[].");
 
 		// if ( !(idxs = intCalloc_dist(nsupers)) )
-			// ABORT("Calloc fails for idxs[].");	
+			// ABORT("Calloc fails for idxs[].");
 
 		// if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) )
 			// ABORT("Malloc fails for nzrows[].");
 
 		if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) )
-			ABORT("Malloc fails for SeedSTD_RD[].");	
+			ABORT("Malloc fails for SeedSTD_RD[].");
 
 		for (i=0;i<k;i++){
-			SeedSTD_RD[i]=rand();		
+			SeedSTD_RD[i]=rand();
 		}
 
-		MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_RD[0],k,MPI_DOUBLE,MPI_MAX,grid->rscp.comm);					  
+		MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_RD[0],k,MPI_DOUBLE,MPI_MAX,grid->rscp.comm);
 
 
 		for (lib = 0; lib <k ; ++lib) {
 			LRtree_ptr[lib]=NULL;
 		}
 
-		
+
 		if ( !(ActiveFlagAll = intMalloc_dist(grid->npcol*k)) )
-			ABORT("Calloc fails for ActiveFlagAll[].");				
-		for (j=0;j<grid->npcol*k;++j)ActiveFlagAll[j]=-3*nsupers;	
-		memTRS += k*sizeof(RdTree) + k*dword + grid->npcol*k*iword;  //acount for LRtree_ptr, SeedSTD_RD, ActiveFlagAll					
-			
-			
+			ABORT("Calloc fails for ActiveFlagAll[].");
+		for (j=0;j<grid->npcol*k;++j)ActiveFlagAll[j]=-3*nsupers;
+		memTRS += k*sizeof(RdTree) + k*dword + grid->npcol*k*iword;  //acount for LRtree_ptr, SeedSTD_RD, ActiveFlagAll
+
+
 		for (ljb = 0; ljb < CEILING( nsupers, grid->npcol); ++ljb) { /* for each local block column ... */
 			jb = mycol+ljb*grid->npcol;  /* not sure */
 			if(jb<nsupers){
@@ -2364,7 +2364,7 @@ doublecomplex *dense, *dense_col; /* SPA
 		}
 
 		MPI_Allreduce(MPI_IN_PLACE,ActiveFlagAll,grid->npcol*k,mpi_int_t,MPI_MAX,grid->rscp.comm);
-		
+
 		for (lib=0;lib<k;++lib){
 			ib = myrow+lib*grid->nprow;  /* not sure */
 			if(ib<nsupers){
@@ -2372,19 +2372,19 @@ doublecomplex *dense, *dense_col; /* SPA
 				for (j=0;j<grid->npcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];;
 				for (j=0;j<grid->npcol;++j)ActiveFlag[j+grid->npcol]=j;
 				for (j=0;j<grid->npcol;++j)ranks[j]=-1;
-				Root=-1; 
-				Iactive = 0;				
+				Root=-1;
+				Iactive = 0;
 
 				for (j=0;j<grid->npcol;++j){
 					if(ActiveFlag[j]!=-3*nsupers){
 					jb = ActiveFlag[j];
 					pc = PCOL( jb, grid );
 					if(jb==ib)Root=pc;
-					if(mycol==pc)Iactive=1;		
-					}					
+					if(mycol==pc)Iactive=1;
+					}
 				}
-			
-			
+
+
 				quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,1,2);
 
 				if(Iactive==1){
@@ -2400,7 +2400,7 @@ doublecomplex *dense, *dense_col; /* SPA
 					if(rank_cnt>1){
 
 						for (ii=0;ii<rank_cnt;ii++)   // use global ranks rather than local ranks
-							ranks[ii] = PNUM( pr, ranks[ii], grid );		
+							ranks[ii] = PNUM( pr, ranks[ii], grid );
 
 						// rseed=rand();
 						// rseed=1.0;
@@ -2408,7 +2408,7 @@ doublecomplex *dense, *dense_col; /* SPA
 
 						// if(ib==0){
 
-						LRtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'z');  	
+						LRtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'z');
 						RdTree_SetTag(LRtree_ptr[lib], RD_L,'z');
 						// }
 
@@ -2424,10 +2424,10 @@ doublecomplex *dense, *dense_col; /* SPA
 						// // for(j=0;j<rank_cnt;++j)printf("%4d",ranks[j]);
 						// printf("\n");
 						}
-						#endif		
+						#endif
 					}
-				}				
-			}	
+				}
+			}
 		}
 
 		SUPERLU_FREE(mod_bit);
@@ -2436,9 +2436,9 @@ doublecomplex *dense, *dense_col; /* SPA
 
 		SUPERLU_FREE(ActiveFlag);
 		SUPERLU_FREE(ActiveFlagAll);
-		SUPERLU_FREE(ranks);	
-		// SUPERLU_FREE(idxs);	 
-		SUPERLU_FREE(SeedSTD_RD);	
+		SUPERLU_FREE(ranks);
+		// SUPERLU_FREE(idxs);
+		SUPERLU_FREE(SeedSTD_RD);
 		// for(i=0;i<nsupers;++i){
 			// if(nzrows[i])SUPERLU_FREE(nzrows[i]);
 		// }
@@ -2449,11 +2449,11 @@ doublecomplex *dense, *dense_col; /* SPA
 #if ( PROFlevel>=1 )
 	t = SuperLU_timer_() - t;
 	if ( !iam) printf(".. Construct Reduce tree for L: %.2f\t\n", t);
-#endif					
+#endif
 
 #if ( PROFlevel>=1 )
 		t = SuperLU_timer_();
-#endif	
+#endif
 
 		/* construct the Bcast tree for U ... */
 
@@ -2461,35 +2461,35 @@ doublecomplex *dense, *dense_col; /* SPA
 		if ( !(UBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) )
 			ABORT("Malloc fails for UBtree_ptr[].");
 		if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) )
-			ABORT("Calloc fails for ActiveFlag[].");	
+			ABORT("Calloc fails for ActiveFlag[].");
 		if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) )
-			ABORT("Malloc fails for ranks[].");	
+			ABORT("Malloc fails for ranks[].");
 		if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) )
-			ABORT("Malloc fails for SeedSTD_BC[].");	
+			ABORT("Malloc fails for SeedSTD_BC[].");
 
 		for (i=0;i<k;i++){
-			SeedSTD_BC[i]=rand();		
+			SeedSTD_BC[i]=rand();
 		}
 
-		MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_BC[0],k,MPI_DOUBLE,MPI_MAX,grid->cscp.comm);					  
+		MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_BC[0],k,MPI_DOUBLE,MPI_MAX,grid->cscp.comm);
 
 
 		for (ljb = 0; ljb <k ; ++ljb) {
 			UBtree_ptr[ljb]=NULL;
-		}	
+		}
 
 		if ( !(ActiveFlagAll = intMalloc_dist(grid->nprow*k)) )
-			ABORT("Calloc fails for ActiveFlagAll[].");				
-		for (j=0;j<grid->nprow*k;++j)ActiveFlagAll[j]=-3*nsupers;	
-		memTRS += k*sizeof(BcTree) + k*dword + grid->nprow*k*iword;  //acount for UBtree_ptr, SeedSTD_BC, ActiveFlagAll				
-		
-		
+			ABORT("Calloc fails for ActiveFlagAll[].");
+		for (j=0;j<grid->nprow*k;++j)ActiveFlagAll[j]=-3*nsupers;
+		memTRS += k*sizeof(BcTree) + k*dword + grid->nprow*k*iword;  //acount for UBtree_ptr, SeedSTD_BC, ActiveFlagAll
+
+
 		for (lib = 0; lib < CEILING( nsupers, grid->nprow); ++lib) { /* for each local block row ... */
 			ib = myrow+lib*grid->nprow;  /* not sure */
-			
+
 		// if(ib==0)printf("iam %5d ib %5d\n",iam,ib);
-		// fflush(stdout);				
-			
+		// fflush(stdout);
+
 			if(ib<nsupers){
 				for (i = xusub[lib]; i < xusub[lib+1]; i++) {
 				  jcol = usub[i];
@@ -2497,26 +2497,26 @@ doublecomplex *dense, *dense_col; /* SPA
 				  ljb = LBj( jb, grid );    /* local block number */
 				  pc = PCOL( jb, grid );
 				  pr = PROW( ib, grid );
-				  if ( mycol == pc ) { /* Block column ib in my process column */		
-					ActiveFlagAll[pr+ljb*grid->nprow]=SUPERLU_MAX(ActiveFlagAll[pr+ljb*grid->nprow],ib);			  
+				  if ( mycol == pc ) { /* Block column ib in my process column */
+					ActiveFlagAll[pr+ljb*grid->nprow]=SUPERLU_MAX(ActiveFlagAll[pr+ljb*grid->nprow],ib);
 				  }
 				}  /* for i ... */
 				pr = PROW( ib, grid ); // take care of diagonal node stored as L
 				pc = PCOL( ib, grid );
-				if ( mycol == pc ) { /* Block column ib in my process column */					
+				if ( mycol == pc ) { /* Block column ib in my process column */
 					ljb = LBj( ib, grid );    /* local block number */
-					ActiveFlagAll[pr+ljb*grid->nprow]=SUPERLU_MAX(ActiveFlagAll[pr+ljb*grid->nprow],ib);					
+					ActiveFlagAll[pr+ljb*grid->nprow]=SUPERLU_MAX(ActiveFlagAll[pr+ljb*grid->nprow],ib);
 					// if(pr+ljb*grid->nprow==0)printf("iam %5d ib %5d ActiveFlagAll %5d pr %5d ljb %5d\n",iam,ib,ActiveFlagAll[pr+ljb*grid->nprow],pr,ljb);
-					// fflush(stdout);	
-				}					
-			}	
+					// fflush(stdout);
+				}
+			}
 		}
-		
+
 		// printf("iam %5d ActiveFlagAll %5d\n",iam,ActiveFlagAll[0]);
 		// fflush(stdout);
-		
-		MPI_Allreduce(MPI_IN_PLACE,ActiveFlagAll,grid->nprow*k,mpi_int_t,MPI_MAX,grid->cscp.comm);					  
-					
+
+		MPI_Allreduce(MPI_IN_PLACE,ActiveFlagAll,grid->nprow*k,mpi_int_t,MPI_MAX,grid->cscp.comm);
+
 		for (ljb = 0; ljb < k; ++ljb) { /* for each block column ... */
 			jb = mycol+ljb*grid->npcol;  /* not sure */
 			if(jb<nsupers){
@@ -2527,18 +2527,18 @@ doublecomplex *dense, *dense_col; /* SPA
 			for (j=0;j<grid->nprow;++j)ActiveFlag[j+grid->nprow]=j;
 			for (j=0;j<grid->nprow;++j)ranks[j]=-1;
 
-			Root=-1; 
-			Iactive = 0;				
+			Root=-1;
+			Iactive = 0;
 			for (j=0;j<grid->nprow;++j){
 				if(ActiveFlag[j]!=-3*nsupers){
 				gb = ActiveFlag[j];
 				pr = PROW( gb, grid );
 				if(gb==jb)Root=pr;
-				if(myrow==pr)Iactive=1;		
+				if(myrow==pr)Iactive=1;
 				}
-			}						
-			
-			quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,1,2);	
+			}
+
+			quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,1,2);
 		// printf("jb: %5d Iactive %5d\n",jb,Iactive);
 		// fflush(stdout);
 			if(Iactive==1){
@@ -2552,7 +2552,7 @@ doublecomplex *dense, *dense_col; /* SPA
 						ranks[rank_cnt]=ActiveFlag[j+grid->nprow];
 						++rank_cnt;
 					}
-				}		
+				}
 		// printf("jb: %5d rank_cnt %5d\n",jb,rank_cnt);
 		// fflush(stdout);
 				if(rank_cnt>1){
@@ -2562,43 +2562,43 @@ doublecomplex *dense, *dense_col; /* SPA
 					// rseed=rand();
 					// rseed=1.0;
 					msgsize = SuperSize( jb );
-					UBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'z');  	
+					UBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'z');
 					BcTree_SetTag(UBtree_ptr[ljb],BC_U,'z');
 
 					// printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt);
 					// fflush(stdout);
-					
+
 					if(Root==myrow){
 					rank_cnt_ref=1;
 					for (j = 0; j < grid->nprow; ++j) {
 						// printf("ljb %5d j %5d nprow %5d\n",ljb,j,grid->nprow);
 						// fflush(stdout);
-						if ( bsendx_plist[ljb][j] != EMPTY ) {	
-							++rank_cnt_ref;		
+						if ( bsendx_plist[ljb][j] != EMPTY ) {
+							++rank_cnt_ref;
 						}
 					}
 					// printf("ljb %5d rank_cnt %5d rank_cnt_ref %5d\n",ljb,rank_cnt,rank_cnt_ref);
-					// fflush(stdout);								
-					assert(rank_cnt==rank_cnt_ref);		
-					}						
+					// fflush(stdout);
+					assert(rank_cnt==rank_cnt_ref);
+					}
 				}
 			}
 			}
-		}	
+		}
 		SUPERLU_FREE(ActiveFlag);
 		SUPERLU_FREE(ActiveFlagAll);
-		SUPERLU_FREE(ranks);				
-		SUPERLU_FREE(SeedSTD_BC);				
-		memTRS -= k*dword + grid->nprow*k*iword;  //acount for SeedSTD_BC, ActiveFlagAll		
-			
+		SUPERLU_FREE(ranks);
+		SUPERLU_FREE(SeedSTD_BC);
+		memTRS -= k*dword + grid->nprow*k*iword;  //acount for SeedSTD_BC, ActiveFlagAll
+
 #if ( PROFlevel>=1 )
 	t = SuperLU_timer_() - t;
 	if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t);
-#endif					
+#endif
 
 #if ( PROFlevel>=1 )
 			t = SuperLU_timer_();
-#endif					
+#endif
 		/* construct the Reduce tree for U ... */
 		/* the following is used as reference */
 		nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */
@@ -2627,35 +2627,35 @@ doublecomplex *dense, *dense_col; /* SPA
 		if ( !(URtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) )
 			ABORT("Malloc fails for URtree_ptr[].");
 		if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) )
-			ABORT("Calloc fails for ActiveFlag[].");	
+			ABORT("Calloc fails for ActiveFlag[].");
 		if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) )
-			ABORT("Malloc fails for ranks[].");	
+			ABORT("Malloc fails for ranks[].");
 
 		// if ( !(idxs = intCalloc_dist(nsupers)) )
-			// ABORT("Calloc fails for idxs[].");	
+			// ABORT("Calloc fails for idxs[].");
 
 		// if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) )
 			// ABORT("Malloc fails for nzrows[].");
 
 		if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) )
-			ABORT("Malloc fails for SeedSTD_RD[].");	
+			ABORT("Malloc fails for SeedSTD_RD[].");
 
 		for (i=0;i<k;i++){
-			SeedSTD_RD[i]=rand();		
+			SeedSTD_RD[i]=rand();
 		}
 
-		MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_RD[0],k,MPI_DOUBLE,MPI_MAX,grid->rscp.comm);					  
+		MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_RD[0],k,MPI_DOUBLE,MPI_MAX,grid->rscp.comm);
 
 		for (lib = 0; lib <k ; ++lib) {
 			URtree_ptr[lib]=NULL;
 		}
 
-		
+
 		if ( !(ActiveFlagAll = intMalloc_dist(grid->npcol*k)) )
-			ABORT("Calloc fails for ActiveFlagAll[].");				
-		for (j=0;j<grid->npcol*k;++j)ActiveFlagAll[j]=3*nsupers;	
-		memTRS += k*sizeof(RdTree) + k*dword + grid->npcol*k*iword;  //acount for URtree_ptr, SeedSTD_RD, ActiveFlagAll				
-				
+			ABORT("Calloc fails for ActiveFlagAll[].");
+		for (j=0;j<grid->npcol*k;++j)ActiveFlagAll[j]=3*nsupers;
+		memTRS += k*sizeof(RdTree) + k*dword + grid->npcol*k*iword;  //acount for URtree_ptr, SeedSTD_RD, ActiveFlagAll
+
 		for (lib = 0; lib < CEILING( nsupers, grid->nprow); ++lib) { /* for each local block row ... */
 			ib = myrow+lib*grid->nprow;  /* not sure */
 			if(ib<nsupers){
@@ -2663,19 +2663,19 @@ doublecomplex *dense, *dense_col; /* SPA
 				  jcol = usub[i];
 				  jb = BlockNum( jcol );
 				  pc = PCOL( jb, grid );
-				  if ( mycol == pc ) { /* Block column ib in my process column */	
-					ActiveFlagAll[pc+lib*grid->npcol]=SUPERLU_MIN(ActiveFlagAll[pc+lib*grid->npcol],jb);			  
-				  }	
+				  if ( mycol == pc ) { /* Block column ib in my process column */
+					ActiveFlagAll[pc+lib*grid->npcol]=SUPERLU_MIN(ActiveFlagAll[pc+lib*grid->npcol],jb);
+				  }
 				}  /* for i ... */
 				pc = PCOL( ib, grid );
-				if ( mycol == pc ) { /* Block column ib in my process column */						
+				if ( mycol == pc ) { /* Block column ib in my process column */
 					ActiveFlagAll[pc+lib*grid->npcol]=SUPERLU_MIN(ActiveFlagAll[pc+lib*grid->npcol],ib);
-				}						
-			}	
+				}
+			}
 		}
-		
-		MPI_Allreduce(MPI_IN_PLACE,ActiveFlagAll,grid->npcol*k,mpi_int_t,MPI_MIN,grid->rscp.comm);	
-		
+
+		MPI_Allreduce(MPI_IN_PLACE,ActiveFlagAll,grid->npcol*k,mpi_int_t,MPI_MIN,grid->rscp.comm);
+
 		for (lib=0;lib<k;++lib){
 			ib = myrow+lib*grid->nprow;  /* not sure */
 			if(ib<nsupers){
@@ -2683,18 +2683,18 @@ doublecomplex *dense, *dense_col; /* SPA
 				for (j=0;j<grid->npcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];;
 				for (j=0;j<grid->npcol;++j)ActiveFlag[j+grid->npcol]=j;
 				for (j=0;j<grid->npcol;++j)ranks[j]=-1;
-				Root=-1; 
-				Iactive = 0;				
+				Root=-1;
+				Iactive = 0;
 
 				for (j=0;j<grid->npcol;++j){
 					if(ActiveFlag[j]!=3*nsupers){
 					jb = ActiveFlag[j];
 					pc = PCOL( jb, grid );
 					if(jb==ib)Root=pc;
-					if(mycol==pc)Iactive=1;		
-					}					
+					if(mycol==pc)Iactive=1;
+					}
 				}
-				
+
 				quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,0,2);
 
 				if(Iactive==1){
@@ -2710,7 +2710,7 @@ doublecomplex *dense, *dense_col; /* SPA
 					if(rank_cnt>1){
 
 						for (ii=0;ii<rank_cnt;ii++)   // use global ranks rather than local ranks
-							ranks[ii] = PNUM( pr, ranks[ii], grid );		
+							ranks[ii] = PNUM( pr, ranks[ii], grid );
 
 						// rseed=rand();
 						// rseed=1.0;
@@ -2718,7 +2718,7 @@ doublecomplex *dense, *dense_col; /* SPA
 
 						// if(ib==0){
 
-						URtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'z');  	
+						URtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'z');
 						RdTree_SetTag(URtree_ptr[lib], RD_U,'z');
 						// }
 
@@ -2732,10 +2732,10 @@ doublecomplex *dense, *dense_col; /* SPA
 						// // for(j=0;j<rank_cnt;++j)printf("%4d",ranks[j]);
 						// printf("\n");
 						}
-						// #endif		
+						// #endif
 					}
 				}
-			}						
+			}
 		}
 
 		SUPERLU_FREE(mod_bit);
@@ -2744,44 +2744,44 @@ doublecomplex *dense, *dense_col; /* SPA
 
 		SUPERLU_FREE(ActiveFlag);
 		SUPERLU_FREE(ActiveFlagAll);
-		SUPERLU_FREE(ranks);	
-		// SUPERLU_FREE(idxs);	
-		SUPERLU_FREE(SeedSTD_RD);	
+		SUPERLU_FREE(ranks);
+		// SUPERLU_FREE(idxs);
+		SUPERLU_FREE(SeedSTD_RD);
 		// for(i=0;i<nsupers;++i){
 			// if(nzrows[i])SUPERLU_FREE(nzrows[i]);
 		// }
-		// SUPERLU_FREE(nzrows);				
-		memTRS -= k*dword + grid->nprow*k*iword;  //acount for SeedSTD_RD, ActiveFlagAll		
-			
+		// SUPERLU_FREE(nzrows);
+		memTRS -= k*dword + grid->nprow*k*iword;  //acount for SeedSTD_RD, ActiveFlagAll
+
 #if ( PROFlevel>=1 )
 	t = SuperLU_timer_() - t;
 	if ( !iam) printf(".. Construct Reduce tree for U: %.2f\t\n", t);
-#endif						
-			
+#endif
+
 	////////////////////////////////////////////////////////
- 
+
   /* Free the memory used for storing L and U */
   SUPERLU_FREE(xlsub); SUPERLU_FREE(xusub);
   if (lsub != NULL)
-    SUPERLU_FREE(lsub);  
+    SUPERLU_FREE(lsub);
   if (usub != NULL)
-    SUPERLU_FREE(usub);  
-  
-  
+    SUPERLU_FREE(usub);
+
+
   SUPERLU_FREE(nnzToRecv);
   SUPERLU_FREE(ptrToRecv);
   SUPERLU_FREE(nnzToSend);
   SUPERLU_FREE(ptrToSend);
   SUPERLU_FREE(recvBuf);
-  
+
   Llu->Lrowind_bc_ptr = Lrowind_bc_ptr;
-  Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr;  
+  Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr;
   Llu->Lnzval_bc_ptr = Lnzval_bc_ptr;
   Llu->Linv_bc_ptr = Linv_bc_ptr;
-  Llu->Uinv_bc_ptr = Uinv_bc_ptr;  
+  Llu->Uinv_bc_ptr = Uinv_bc_ptr;
   Llu->Ufstnz_br_ptr = Ufstnz_br_ptr;
   Llu->Unzval_br_ptr = Unzval_br_ptr;
-  Llu->Unnz = Unnz;  
+  Llu->Unnz = Unnz;
   Llu->ToRecv = ToRecv;
   Llu->ToSendD = ToSendD;
   Llu->ToSendR = ToSendR;
@@ -2800,23 +2800,23 @@ doublecomplex *dense, *dense_col; /* SPA
   Llu->LBtree_ptr = LBtree_ptr;
   Llu->URtree_ptr = URtree_ptr;
   Llu->UBtree_ptr = UBtree_ptr;
-  Llu->Urbs = Urbs; 
-  Llu->Ucb_indptr = Ucb_indptr; 
-  Llu->Ucb_valptr = Ucb_valptr; 
-  
+  Llu->Urbs = Urbs;
+  Llu->Ucb_indptr = Ucb_indptr;
+  Llu->Ucb_valptr = Ucb_valptr;
+
 #if ( PRNTlevel>=1 )
   if ( !iam ) printf(".. # L blocks " IFMT "\t# U blocks " IFMT "\n",
 		     nLblocks, nUblocks);
 #endif
-  
+
   k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
   if ( !(Llu->mod_bit = intMalloc_dist(k)) )
       ABORT("Malloc fails for mod_bit[].");
 
   /* Find the maximum buffer size. */
-  MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, 
+  MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t,
 		MPI_MAX, grid->comm);
-  
+
 #if ( DEBUGlevel>=1 )
   /* Memory allocated but not freed:
      ilsum, fmod, fsendx_plist, bmod, bsendx_plist,
@@ -2824,7 +2824,7 @@ doublecomplex *dense, *dense_col; /* SPA
   */
   CHECK_MALLOC(iam, "Exit dist_psymbtonum()");
 #endif
-    
+
   return (- (memDist+memNLU));
 } /* zdist_psymbtonum */
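
Throughout zdist_psymbtonum, ownership and local indexing of a global supernode block are derived from the process grid via the CEILING, PROW, PCOL, LBi and LBj macros. The stand-in functions below show the conventional 2D cyclic mapping those names suggest; they are illustrative only, and the authoritative definitions live in the SuperLU_DIST headers:

    #include <stdio.h>

    /* Conventional cyclic mapping for an nprow x npcol process grid
     * (hypothetical stand-ins for the library macros). */
    static int ceiling_div(int a, int b) { return (a + b - 1) / b; }
    static int prow(int gb, int nprow)   { return gb % nprow; }  /* owning grid row    */
    static int pcol(int gb, int npcol)   { return gb % npcol; }  /* owning grid column */
    static int lbi(int gb, int nprow)    { return gb / nprow; }  /* local block row    */
    static int lbj(int gb, int npcol)    { return gb / npcol; }  /* local block column */

    int main(void)
    {
        int nsupers = 10, nprow = 2, npcol = 3;

        printf("local row blocks <= %d, local column blocks <= %d\n",
               ceiling_div(nsupers, nprow), ceiling_div(nsupers, npcol));

        for (int gb = 0; gb < nsupers; ++gb)
            printf("block %2d -> grid (%d,%d), local (%d,%d)\n",
                   gb, prow(gb, nprow), pcol(gb, npcol),
                   lbi(gb, nprow), lbj(gb, npcol));
        return 0;
    }
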
 
diff -pruN 6.1.0+dfsg1-1/SRC/pzutil.c 6.1.1+dfsg1-1/SRC/pzutil.c
--- 6.1.0+dfsg1-1/SRC/pzutil.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/pzutil.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,15 +1,15 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
-/*! @file 
+/*! @file
  * \brief Several matrix utilities
  *
  * <pre>
@@ -42,11 +42,11 @@ int pzCompRow_loc_to_CompCol_global
     doublecomplex *a_recv;  /* Buffer to receive the blocks of values. */
     doublecomplex *a_buf;   /* Buffer to merge blocks into block columns. */
     int_t *itemp;
-    int_t *colptr_send; /* Buffer to redistribute the column pointers of the 
+    int_t *colptr_send; /* Buffer to redistribute the column pointers of the
 			   local block rows.
 			   Use n_loc+1 pointers for each block. */
     int_t *colptr_blk;  /* The column pointers for each block, after
-			   redistribution to the local block columns. 
+			   redistribution to the local block columns.
 			   Use n_loc+1 pointers for each block. */
     int_t *rowind_recv; /* Buffer to receive the blocks of row indices. */
     int_t *rowind_buf;  /* Buffer to merge blocks into block columns. */
@@ -164,7 +164,7 @@ int pzCompRow_loc_to_CompCol_global
                       a_recv, recvcnts, rdispls, SuperLU_MPI_DOUBLE_COMPLEX,
                       grid->comm);
     }
-      
+
     /* Reset colptr_loc[] to point to the n_loc global columns. */
     colptr_loc[0] = 0;
     itemp = colptr_send;
@@ -178,7 +178,7 @@ int pzCompRow_loc_to_CompCol_global
 	itemp[j] = colptr_loc[j]; /* Save a copy of the column starts */
     }
     itemp[n_loc] = colptr_loc[n_loc];
-      
+
     /* Merge blocks of row indices into columns of row indices. */
     for (i = 0; i < procs; ++i) {
         k = i * (n_loc + 1);
@@ -219,12 +219,12 @@ int pzCompRow_loc_to_CompCol_global
     MPI_Allgather(&nnz_loc, 1, mpi_int_t, itemp, 1, mpi_int_t, grid->comm);
     for (i = 0, nnz = 0; i < procs; ++i) nnz += itemp[i];
     GAstore->nnz = nnz;
-    
+
     if ( !(GAstore->rowind = (int_t *) intMalloc_dist (nnz)) )
         ABORT ("SUPERLU_MALLOC fails for GAstore->rowind[]");
     if ( !(GAstore->colptr = (int_t *) intMalloc_dist (n+1)) )
         ABORT ("SUPERLU_MALLOC fails for GAstore->colptr[]");
-      
+
     /* Allgatherv for row indices. */
     rdispls[0] = 0;
     for (i = 0; i < procs-1; ++i) {
@@ -233,12 +233,12 @@ int pzCompRow_loc_to_CompCol_global
     }
     itemp_32[procs-1] = itemp[procs-1];
     it = nnz_loc;
-    MPI_Allgatherv(rowind_buf, it, mpi_int_t, GAstore->rowind, 
+    MPI_Allgatherv(rowind_buf, it, mpi_int_t, GAstore->rowind,
 		   itemp_32, rdispls, mpi_int_t, grid->comm);
     if ( need_value ) {
       if ( !(GAstore->nzval = (doublecomplex *) doublecomplexMalloc_dist (nnz)) )
           ABORT ("SUPERLU_MALLOC fails for GAstore->rnzval[]");
-      MPI_Allgatherv(a_buf, it, SuperLU_MPI_DOUBLE_COMPLEX, GAstore->nzval, 
+      MPI_Allgatherv(a_buf, it, SuperLU_MPI_DOUBLE_COMPLEX, GAstore->nzval,
 		     itemp_32, rdispls, SuperLU_MPI_DOUBLE_COMPLEX, grid->comm);
     } else GAstore->nzval = NULL;
 
@@ -249,7 +249,7 @@ int pzCompRow_loc_to_CompCol_global
         itemp_32[i] = n_locs[i];
     }
     itemp_32[procs-1] = n_locs[procs-1];
-    MPI_Allgatherv(colptr_loc, n_loc, mpi_int_t, GAstore->colptr, 
+    MPI_Allgatherv(colptr_loc, n_loc, mpi_int_t, GAstore->colptr,
 		   itemp_32, rdispls, mpi_int_t, grid->comm);
 
     /* Recompute column pointers. */
@@ -371,7 +371,7 @@ int pzPermute_Dense_Matrix
 	++ptr_to_ibuf[p];
 	ptr_to_dbuf[p] += nrhs;
     }
-	  
+
     /* Transfer the (permuted) row indices and numerical values. */
     MPI_Alltoallv(send_ibuf, sendcnts, sdispls, mpi_int_t,
 		  recv_ibuf, recvcnts, rdispls, mpi_int_t, grid->comm);
@@ -399,7 +399,7 @@ int pzPermute_Dense_Matrix
 
 /*! \brief Initialize the data structure for the solution phase.
  */
-int zSolveInit(superlu_dist_options_t *options, SuperMatrix *A, 
+int zSolveInit(superlu_dist_options_t *options, SuperMatrix *A,
 	       int_t perm_r[], int_t perm_c[], int_t nrhs,
 	       LUstruct_t *LUstruct, gridinfo_t *grid,
 	       SOLVEstruct_t *SOLVEstruct)
@@ -413,7 +413,7 @@ int zSolveInit(superlu_dist_options_t *o
     fst_row = Astore->fst_row;
     m_loc = Astore->m_loc;
     procs = grid->nprow * grid->npcol;
-    
+
     if ( !(row_to_proc = intMalloc_dist(A->nrow)) )
 	ABORT("Malloc fails for row_to_proc[]");
     SOLVEstruct->row_to_proc = row_to_proc;
@@ -425,9 +425,9 @@ int zSolveInit(superlu_dist_options_t *o
     /* ------------------------------------------------------------
        EVERY PROCESS NEEDS TO KNOW GLOBAL PARTITION.
        SET UP THE MAPPING BETWEEN ROWS AND PROCESSES.
-       
+
        NOTE: For those processes that do not own any row, it must
-             must be set so that fst_row == A->nrow. 
+             must be set so that fst_row == A->nrow.
        ------------------------------------------------------------*/
     if ( !(itemp = intMalloc_dist(procs+1)) )
         ABORT("Malloc fails for itemp[]");
@@ -462,7 +462,7 @@ int zSolveInit(superlu_dist_options_t *o
 	    for (i = j ; i < k; ++i) row_to_proc[i] = p;
 	}
     }
-#endif    
+#endif
 
     get_diag_procs(A->ncol, LUstruct->Glu_persist, grid,
 		   &SOLVEstruct->num_diag_procs,
@@ -473,14 +473,14 @@ int zSolveInit(superlu_dist_options_t *o
     if ( !(SOLVEstruct->gstrs_comm = (pxgstrs_comm_t *)
 	   SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))) )
         ABORT("Malloc fails for gstrs_comm[]");
-    pxgstrs_init(A->ncol, m_loc, nrhs, fst_row, perm_r, perm_c, grid, 
+    pxgstrs_init(A->ncol, m_loc, nrhs, fst_row, perm_r, perm_c, grid,
 		 LUstruct->Glu_persist, SOLVEstruct);
 
     if ( !(SOLVEstruct->gsmv_comm = (pzgsmv_comm_t *)
            SUPERLU_MALLOC(sizeof(pzgsmv_comm_t))) )
         ABORT("Malloc fails for gsmv_comm[]");
     SOLVEstruct->A_colind_gsmv = NULL;
-    
+
     options->SolveInitialized = YES;
     return 0;
 } /* zSolveInit */
@@ -506,10 +506,10 @@ void zSolveFinalize(superlu_dist_options
     options->SolveInitialized = NO;
 } /* zSolveFinalize */
 
-/*! \brief Check the inf-norm of the error vector 
+/*! \brief Check the inf-norm of the error vector
  */
 void pzinf_norm_error(int iam, int_t n, int_t nrhs, doublecomplex x[], int_t ldx,
-		      doublecomplex xtrue[], int_t ldxtrue, gridinfo_t *grid) 
+		      doublecomplex xtrue[], int_t ldxtrue, gridinfo_t *grid)
 {
     double err, xnorm, temperr, tempxnorm;
     doublecomplex *x_work, *xtrue_work;
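
The pzutil.c hunks above include pzCompRow_loc_to_CompCol_global, which assembles the distributed compressed-row pieces into one global compressed-column matrix with a standard two-step collective pattern: an MPI_Allgather of the per-process element counts, prefix-summed into displacements, followed by MPI_Allgatherv of the variable-length index and value payloads. A compact sketch of that pattern with hypothetical data (each rank contributes rank + 1 integers) and plain MPI_INT in place of mpi_int_t:

    #include <mpi.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(int argc, char **argv)
    {
        int rank, nprocs;
        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

        /* Hypothetical variable-length local contribution. */
        int nloc = rank + 1;
        int *local = malloc(nloc * sizeof(int));
        for (int i = 0; i < nloc; ++i) local[i] = rank;

        /* Step 1: everyone learns everyone's count. */
        int *counts = malloc(nprocs * sizeof(int));
        MPI_Allgather(&nloc, 1, MPI_INT, counts, 1, MPI_INT, MPI_COMM_WORLD);

        /* Step 2: displacements are prefix sums of the counts. */
        int *displs = malloc(nprocs * sizeof(int));
        int total = 0;
        for (int p = 0; p < nprocs; ++p) { displs[p] = total; total += counts[p]; }

        /* Step 3: gather the variable-length payloads on every rank. */
        int *global = malloc(total * sizeof(int));
        MPI_Allgatherv(local, nloc, MPI_INT,
                       global, counts, displs, MPI_INT, MPI_COMM_WORLD);

        if (rank == 0) printf("gathered %d entries in total\n", total);

        free(local); free(counts); free(displs); free(global);
        MPI_Finalize();
        return 0;
    }
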
diff -pruN 6.1.0+dfsg1-1/SRC/superlu_ddefs.h 6.1.1+dfsg1-1/SRC/superlu_ddefs.h
--- 6.1.0+dfsg1-1/SRC/superlu_ddefs.h	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/superlu_ddefs.h	2019-02-08 16:30:10.000000000 +0000
@@ -1,24 +1,25 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
 
-/*! @file 
+/*! @file
  * \brief  Distributed SuperLU data types and function prototypes
  *
  * <pre>
- * -- Distributed SuperLU routine (version 6.0) --
+ * -- Distributed SuperLU routine (version 6.1) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * November 1, 2007
  * April 5, 2015
  * September 18, 2018  version 6.0
+ * February 8, 2019  version 6.1.1
  * </pre>
  */
 
@@ -39,7 +40,7 @@ typedef struct {
     int_t indpos; /* Starting position in Uindex[]. */
 } Ucb_indptr_t;
 
-/* 
+/*
  * On each processor, the blocks in L are stored in compressed block
  * column format, the blocks in U are stored in compressed block row format.
  */
@@ -50,7 +51,7 @@ typedef struct {
     double **Linv_bc_ptr;  /* size ceil(NSUPERS/Pc)                 */
     int_t   **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc)  pointers to locations in Lrowind_bc_ptr and Lnzval_bc_ptr */
     int_t   *Unnz; /* number of nonzeros per block column in U*/
-	int_t   **Lrowind_bc_2_lsum; /* size ceil(NSUPERS/Pc)  map indices of Lrowind_bc_ptr to indices of lsum  */  
+	int_t   **Lrowind_bc_2_lsum; /* size ceil(NSUPERS/Pc)  map indices of Lrowind_bc_ptr to indices of lsum  */
     double  **Uinv_bc_ptr;  /* size ceil(NSUPERS/Pc)     	*/
     int_t   **Ufstnz_br_ptr;  /* size ceil(NSUPERS/Pr)                 */
     double  **Unzval_br_ptr;  /* size ceil(NSUPERS/Pr)                 */
@@ -73,7 +74,7 @@ typedef struct {
     int_t   bufmax[NBUFFERS]; /* Maximum buffer size across all MPI ranks:
 			       *  0 : maximum size of Lsub_buf[]
 			       *  1 : maximum size of Lval_buf[]
-			       *  2 : maximum size of Usub_buf[] 
+			       *  2 : maximum size of Usub_buf[]
 			       *  3 : maximum size of Uval_buf[]
 			       *  4 : maximum size of tempv[LDA]
 			       */
@@ -104,7 +105,7 @@ typedef struct {
     int_t   SolveMsgVol;      /* Volume of messages sent in the solve phase */
 
 
-    /*********************/	
+    /*********************/
     /* The following variables are used in the hybrid solver */
 
     /*-- Counts to be used in U^{-T} triangular solve. -- */
@@ -129,7 +130,7 @@ typedef struct {
     int_t n;
     int_t nleaf;
     int_t nfrecvmod;
-    int_t inv; /* whether the diagonal block is inverted*/	
+    int_t inv; /* whether the diagonal block is inverted*/
 } LocalLU_t;
 
 
@@ -167,7 +168,7 @@ typedef struct {
     int_t *row_to_proc;
     int_t *inv_perm_c;
     int_t num_diag_procs, *diag_procs, *diag_len;
-    pdgsmv_comm_t *gsmv_comm; /* communication metadata for SpMV, 
+    pdgsmv_comm_t *gsmv_comm; /* communication metadata for SpMV,
          	       		      required by IterRefine.          */
     pxgstrs_comm_t *gstrs_comm;  /* communication metadata for SpTRSV. */
     int_t *A_colind_gsmv; /* After pdgsmv_init(), the global column
@@ -207,7 +208,7 @@ extern void
 dCreate_Dense_Matrix_dist(SuperMatrix *, int_t, int_t, double *, int_t,
 			  Stype_t, Dtype_t, Mtype_t);
 extern void
-dCreate_SuperNode_Matrix_dist(SuperMatrix *, int_t, int_t, int_t, double *, 
+dCreate_SuperNode_Matrix_dist(SuperMatrix *, int_t, int_t, int_t, double *,
 			      int_t *, int_t *, int_t *, int_t *, int_t *,
 			      Stype_t, Dtype_t, Mtype_t);
 extern void
@@ -218,16 +219,16 @@ extern void    dallocateA_dist (int_t, i
 extern void    dGenXtrue_dist (int_t, int_t, double *, int_t);
 extern void    dFillRHS_dist (char *, int_t, double *, int_t,
                               SuperMatrix *, double *, int_t);
-extern int     dcreate_matrix(SuperMatrix *, int, double **, int *, 
+extern int     dcreate_matrix(SuperMatrix *, int, double **, int *,
 			      double **, int *, FILE *, gridinfo_t *);
-extern int     dcreate_matrix_rb(SuperMatrix *, int, double **, int *, 
+extern int     dcreate_matrix_rb(SuperMatrix *, int, double **, int *,
 			      double **, int *, FILE *, gridinfo_t *);
-extern int     dcreate_matrix_dat(SuperMatrix *, int, double **, int *, 
+extern int     dcreate_matrix_dat(SuperMatrix *, int, double **, int *,
 			      double **, int *, FILE *, gridinfo_t *);
-extern int 	   dcreate_matrix_postfix(SuperMatrix *, int, double **, int *, 
-				  double **, int *, FILE *, char *, gridinfo_t *);				  
-				  
-	
+extern int 	   dcreate_matrix_postfix(SuperMatrix *, int, double **, int *,
+				  double **, int *, FILE *, char *, gridinfo_t *);
+
+
 /* Driver related */
 extern void    dgsequ_dist (SuperMatrix *, double *, double *, double *,
 			    double *, double *, int_t *);
@@ -250,16 +251,16 @@ extern int     sp_dgemv_dist (char *, do
 extern int     sp_dgemm_dist (char *, int, double, SuperMatrix *,
                         double *, int, double, double *, int);
 
-extern float ddistribute(fact_t, int_t, SuperMatrix *, Glu_freeable_t *, 
+extern float ddistribute(fact_t, int_t, SuperMatrix *, Glu_freeable_t *,
 			 LUstruct_t *, gridinfo_t *);
-extern void  pdgssvx_ABglobal(superlu_dist_options_t *, SuperMatrix *, 
+extern void  pdgssvx_ABglobal(superlu_dist_options_t *, SuperMatrix *,
 			      ScalePermstruct_t *, double *,
 			      int, int, gridinfo_t *, LUstruct_t *, double *,
 			      SuperLUStat_t *, int *);
-extern float pddistribute(fact_t, int_t, SuperMatrix *, 
-			 ScalePermstruct_t *, Glu_freeable_t *, 
+extern float pddistribute(fact_t, int_t, SuperMatrix *,
+			 ScalePermstruct_t *, Glu_freeable_t *,
 			 LUstruct_t *, gridinfo_t *);
-extern void  pdgssvx(superlu_dist_options_t *, SuperMatrix *, 
+extern void  pdgssvx(superlu_dist_options_t *, SuperMatrix *,
 		     ScalePermstruct_t *, double *,
 		     int, int, gridinfo_t *, LUstruct_t *,
 		     SOLVEstruct_t *, double *, SuperLUStat_t *, int *);
@@ -273,7 +274,7 @@ extern int_t pxgstrs_init(int_t, int_t,
 extern void pxgstrs_finalize(pxgstrs_comm_t *);
 extern int  dldperm_dist(int_t, int_t, int_t, int_t [], int_t [],
 		    double [], int_t *, double [], double []);
-extern int  static_schedule(superlu_dist_options_t *, int, int, 
+extern int  static_schedule(superlu_dist_options_t *, int, int,
 		            LUstruct_t *, gridinfo_t *, SuperLUStat_t *,
 			    int_t *, int_t *, int *);
 extern void LUstructInit(const int_t, LUstruct_t *);
@@ -293,7 +294,7 @@ extern void pdgstrs(int_t, LUstruct_t *,
 		    SuperLUStat_t *, int *);
 extern void dlsum_fmod(double *, double *, double *, double *,
 		       int, int, int_t , int_t *, int_t, int_t, int_t,
-		       int_t *, gridinfo_t *, LocalLU_t *, 
+		       int_t *, gridinfo_t *, LocalLU_t *,
 		       MPI_Request [], SuperLUStat_t *);
 extern void dlsum_bmod(double *, double *, double *,
                        int, int_t, int_t *, int_t *, Ucb_indptr_t **,
@@ -302,21 +303,21 @@ extern void dlsum_bmod(double *, double
 
 extern void dlsum_fmod_inv(double *, double *, double *, double *,
 		       int, int_t , int_t *,
-		       int_t *, gridinfo_t *, LocalLU_t *, 
+		       int_t *, gridinfo_t *, LocalLU_t *,
 		       SuperLUStat_t **, int_t *, int_t *, int_t, int_t, int_t, int_t, int, int);
 extern void dlsum_fmod_inv_master(double *, double *, double *, double *,
-		       int, int, int_t , int_t *, int_t, 
-		       int_t *, gridinfo_t *, LocalLU_t *, 
+		       int, int, int_t , int_t *, int_t,
+		       int_t *, gridinfo_t *, LocalLU_t *,
 		       SuperLUStat_t **, int_t, int_t, int_t, int_t, int, int);
 extern void dlsum_bmod_inv(double *, double *, double *, double *,
-                       int, int_t, int_t *, int_t *, int_t *, Ucb_indptr_t **,
+                       int, int_t, int_t *, int_t *, Ucb_indptr_t **,
                        int_t **, int_t *, gridinfo_t *, LocalLU_t *,
-		       MPI_Request [], SuperLUStat_t **, int_t *, int_t *, int_t, int_t, int, int);
+		       SuperLUStat_t **, int_t *, int_t *, int_t, int_t, int, int);
 extern void dlsum_bmod_inv_master(double *, double *, double *, double *,
-                       int, int_t, int_t *, int_t *, int_t *, Ucb_indptr_t **,
+                       int, int_t, int_t *, int_t *, Ucb_indptr_t **,
                        int_t **, int_t *, gridinfo_t *, LocalLU_t *,
-		       MPI_Request [], SuperLUStat_t **, int_t, int_t, int, int);			   
-			   
+		       SuperLUStat_t **, int_t, int_t, int, int);
+
 extern void pdgsrfs(int_t, SuperMatrix *, double, LUstruct_t *,
 		    ScalePermstruct_t *, gridinfo_t *,
 		    double [], int_t, double [], int_t, int,
@@ -358,22 +359,22 @@ extern void    dinf_norm_error_dist (int
                                      double*, int_t, gridinfo_t*);
 extern void    pdinf_norm_error(int, int_t, int_t, double [], int_t,
 				double [], int_t , gridinfo_t *);
-extern void  dreadhb_dist (int, FILE *, int_t *, int_t *, int_t *, 
+extern void  dreadhb_dist (int, FILE *, int_t *, int_t *, int_t *,
 			   double **, int_t **, int_t **);
 extern void  dreadtriple_dist(FILE *, int_t *, int_t *, int_t *,
 			 double **, int_t **, int_t **);
 extern void  dreadtriple_noheader(FILE *, int_t *, int_t *, int_t *,
-			 double **, int_t **, int_t **);			 
+			 double **, int_t **, int_t **);
 extern void  dreadrb_dist(int, FILE *, int_t *, int_t *, int_t *,
 		     double **, int_t **, int_t **);
 extern void  dreadMM_dist(FILE *, int_t *, int_t *, int_t *,
 	                  double **, int_t **, int_t **);
 extern int  dread_binary(FILE *, int_t *, int_t *, int_t *,
-	                  double **, int_t **, int_t **);	
-					  
+	                  double **, int_t **, int_t **);
+
 /* Distribute the data for numerical factorization */
 extern float ddist_psymbtonum(fact_t, int_t, SuperMatrix *,
-                                ScalePermstruct_t *, Pslu_freeable_t *, 
+                                ScalePermstruct_t *, Pslu_freeable_t *,
                                 LUstruct_t *, gridinfo_t *);
 extern void pdGetDiagU(int_t, LUstruct_t *, gridinfo_t *, double *);
 
@@ -386,7 +387,7 @@ extern void  dPrintUblocks(int, int_t, g
 extern void  dPrint_CompCol_Matrix_dist(SuperMatrix *);
 extern void  dPrint_Dense_Matrix_dist(SuperMatrix *);
 extern int   dPrint_CompRowLoc_Matrix_dist(SuperMatrix *);
-extern int   file_dPrint_CompRowLoc_Matrix_dist(FILE *fp, SuperMatrix *A);																			   
+extern int   file_dPrint_CompRowLoc_Matrix_dist(FILE *fp, SuperMatrix *A);
 extern int   file_PrintDouble5(FILE *, char *, int_t, double *);
 
 
@@ -398,12 +399,12 @@ extern void dgemm_(const char*, const ch
                   const int*, const double*, double*, const int*, int, int);
 extern void dtrsv_(char*, char*, char*, int*, double*, int*,
                   double*, int*, int, int, int);
-extern void dtrsm_(char*, char*, char*, char*, int*, int*, 
-                  double*, double*, int*, double*, 
+extern void dtrsm_(char*, char*, char*, char*, int*, int*,
+                  double*, double*, int*, double*,
                   int*, int, int, int, int);
-extern void dgemv_(char *, int *, int *, double *, double *a, int *, 
+extern void dgemv_(char *, int *, int *, double *, double *a, int *,
                   double *, int *, double *, double *, int *, int);
-extern void dtrtri_(char*, char*, int*, double*, int*,int*);				 
+extern void dtrtri_(char*, char*, int*, double*, int*,int*);
 
 extern void dger_(int*, int*, double*, double*, int*,
                  double*, int*, double*, int*);
@@ -414,9 +415,9 @@ extern int dgemm_(const char*, const cha
                    const int*,  const double*, double*, const int*);
 extern int dtrsv_(char*, char*, char*, int*, double*, int*,
                   double*, int*);
-extern int dtrsm_(char*, char*, char*, char*, int*, int*, 
+extern int dtrsm_(char*, char*, char*, char*, int*, int*,
                   double*, double*, int*, double*, int*);
-extern int dgemv_(char *, int *, int *, double *, double *a, int *, 
+extern int dgemv_(char *, int *, int *, double *, double *a, int *,
                   double *, int *, double *, double *, int *);
 extern void dger_(int*, int*, double*, double*, int*,
                  double*, int*, double*, int*);
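
Aside from trailing-whitespace cleanup, the substantive change in superlu_ddefs.h is the prototype of dlsum_bmod_inv() and dlsum_bmod_inv_master(): 6.1.1 drops one int_t * argument and the MPI_Request [] argument from both. A hedged sketch of how an out-of-tree caller might select between the old and new prototypes at compile time, using the version macros bumped in superlu_defs.h below; the full argument lists are deliberately elided:

    /* Sketch only: compile-time guard for the 6.1.1 prototype change of
     * dlsum_bmod_inv().  The actual call arguments are elided. */
    #include "superlu_ddefs.h"

    #if SUPERLU_DIST_MAJOR_VERSION > 6 || \
        (SUPERLU_DIST_MAJOR_VERSION == 6 && \
         (SUPERLU_DIST_MINOR_VERSION > 1 || \
          (SUPERLU_DIST_MINOR_VERSION == 1 && SUPERLU_DIST_PATCH_VERSION >= 1)))
      /* 6.1.1 and later: no MPI_Request [] argument in dlsum_bmod_inv(). */
    #else
      /* 6.1.0 and earlier: pass the send_req array as before. */
    #endif
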
diff -pruN 6.1.0+dfsg1-1/SRC/superlu_defs.h 6.1.1+dfsg1-1/SRC/superlu_defs.h
--- 6.1.0+dfsg1-1/SRC/superlu_defs.h	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/superlu_defs.h	2019-02-08 16:30:10.000000000 +0000
@@ -17,9 +17,10 @@ at the top-level directory.
  * November 1, 2007
  *
  * Modified:
- *     Feburary 20, 2008
+ *     February 20, 2008
  *     October 11, 2014
  *     September 18, 2018  version 6.0
+ *     February 8, 2019
  * </pre>
  */
 
@@ -70,8 +71,8 @@ at the top-level directory.
  */
 #define SUPERLU_DIST_MAJOR_VERSION     6
 #define SUPERLU_DIST_MINOR_VERSION     1
-#define SUPERLU_DIST_PATCH_VERSION     0
-#define SUPERLU_DIST_RELEASE_DATE      "December 6, 2018"
+#define SUPERLU_DIST_PATCH_VERSION     1
+#define SUPERLU_DIST_RELEASE_DATE      "February 8, 2019"
 
 #include "superlu_dist_config.h"
 /* Define my integer size int_t */
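
The functional change in superlu_defs.h is the patch-version and release-date bump; the rest is a typo fix and an added changelog line. A minimal sketch of reporting the version an application was compiled against, using exactly the macros shown in the hunk above:

    /* Minimal sketch: print the SuperLU_DIST version declared by the headers. */
    #include <stdio.h>
    #include "superlu_defs.h"

    int main(void)
    {
        printf("SuperLU_DIST %d.%d.%d (released %s)\n",
               SUPERLU_DIST_MAJOR_VERSION, SUPERLU_DIST_MINOR_VERSION,
               SUPERLU_DIST_PATCH_VERSION, SUPERLU_DIST_RELEASE_DATE);
        return 0;
    }
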
diff -pruN 6.1.0+dfsg1-1/SRC/superlu_grid.c 6.1.1+dfsg1-1/SRC/superlu_grid.c
--- 6.1.0+dfsg1-1/SRC/superlu_grid.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/superlu_grid.c	2019-02-08 16:30:10.000000000 +0000
@@ -12,9 +12,10 @@ at the top-level directory.
  * \brief SuperLU grid utilities
  *
  * <pre>
- * -- Distributed SuperLU routine (version 1.0) --
+ * -- Distributed SuperLU routine (version 6.1) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * September 1, 1999
+ * February 8, 2019  version 6.1.1
  * </pre>
  */
 
@@ -150,7 +151,7 @@ void superlu_gridmap(
     {
 	int tag_ub;
 	if ( !grid->iam ) {
-	    MPI_Attr_get(Bcomm, MPI_TAG_UB, &tag_ub, &info);
+	    MPI_Comm_get_attr(Bcomm, MPI_TAG_UB, &tag_ub, &info);
 	    printf("MPI_TAG_UB %d\n", tag_ub);
 	    /* returns 4295677672
 	       In reality it is restricted to no greater than 16384. */
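
The change in superlu_grid.c replaces MPI_Attr_get(), deprecated since MPI-2.0, with MPI_Comm_get_attr(). For MPI_TAG_UB the attribute value is delivered as a pointer to an int, so a standalone query looks like the following sketch (the upstream call keeps its original variable layout):

    /* Self-contained sketch: query the tag upper bound with the
     * non-deprecated attribute interface. */
    #include <mpi.h>
    #include <stdio.h>

    int main(int argc, char **argv)
    {
        int *tag_ub;   /* MPI hands back a pointer to the attribute value */
        int flag;

        MPI_Init(&argc, &argv);
        MPI_Comm_get_attr(MPI_COMM_WORLD, MPI_TAG_UB, &tag_ub, &flag);
        if (flag)
            printf("MPI_TAG_UB = %d\n", *tag_ub);
        MPI_Finalize();
        return 0;
    }
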
diff -pruN 6.1.0+dfsg1-1/SRC/superlu_zdefs.h 6.1.1+dfsg1-1/SRC/superlu_zdefs.h
--- 6.1.0+dfsg1-1/SRC/superlu_zdefs.h	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/superlu_zdefs.h	2019-02-08 16:30:10.000000000 +0000
@@ -1,23 +1,24 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
-/*! @file 
+/*! @file
  * \brief  Distributed SuperLU data types and function prototypes
  *
  * <pre>
- * -- Distributed SuperLU routine (version 6.0) --
+ * -- Distributed SuperLU routine (version 6.1) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * November 1, 2007
  * April 5, 2015
  * September 18, 2018  version 6.0
+ * February 8, 2019  version 6.1.1
  * </pre>
  */
 
@@ -39,7 +40,7 @@ typedef struct {
     int_t indpos; /* Starting position in Uindex[]. */
 } Ucb_indptr_t;
 
-/* 
+/*
  * On each processor, the blocks in L are stored in compressed block
  * column format, the blocks in U are stored in compressed block row format.
  */
@@ -50,7 +51,7 @@ typedef struct {
     doublecomplex **Linv_bc_ptr;  /* size ceil(NSUPERS/Pc)                 */
     int_t   **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc)  pointers to locations in Lrowind_bc_ptr and Lnzval_bc_ptr */
     int_t   *Unnz; /* number of nonzeros per block column in U*/
-	int_t   **Lrowind_bc_2_lsum; /* size ceil(NSUPERS/Pc)  map indices of Lrowind_bc_ptr to indices of lsum  */  
+	int_t   **Lrowind_bc_2_lsum; /* size ceil(NSUPERS/Pc)  map indices of Lrowind_bc_ptr to indices of lsum  */
     doublecomplex  **Uinv_bc_ptr;  /* size ceil(NSUPERS/Pc)     	*/
     int_t   **Ufstnz_br_ptr;  /* size ceil(NSUPERS/Pr)                 */
     doublecomplex  **Unzval_br_ptr;  /* size ceil(NSUPERS/Pr)                 */
@@ -73,7 +74,7 @@ typedef struct {
     int_t   bufmax[NBUFFERS]; /* Maximum buffer size across all MPI ranks:
 			       *  0 : maximum size of Lsub_buf[]
 			       *  1 : maximum size of Lval_buf[]
-			       *  2 : maximum size of Usub_buf[] 
+			       *  2 : maximum size of Usub_buf[]
 			       *  3 : maximum size of Uval_buf[]
 			       *  4 : maximum size of tempv[LDA]
 			       */
@@ -104,7 +105,7 @@ typedef struct {
     int_t   SolveMsgVol;      /* Volume of messages sent in the solve phase */
 
 
-    /*********************/	
+    /*********************/
     /* The following variables are used in the hybrid solver */
 
     /*-- Counts to be used in U^{-T} triangular solve. -- */
@@ -129,7 +130,7 @@ typedef struct {
     int_t n;
     int_t nleaf;
     int_t nfrecvmod;
-    int_t inv; /* whether the diagonal block is inverted*/	
+    int_t inv; /* whether the diagonal block is inverted*/
 } LocalLU_t;
 
 
@@ -167,7 +168,7 @@ typedef struct {
     int_t *row_to_proc;
     int_t *inv_perm_c;
     int_t num_diag_procs, *diag_procs, *diag_len;
-    pzgsmv_comm_t *gsmv_comm; /* communication metadata for SpMV, 
+    pzgsmv_comm_t *gsmv_comm; /* communication metadata for SpMV,
          	       		      required by IterRefine.          */
     pxgstrs_comm_t *gstrs_comm;  /* communication metadata for SpTRSV. */
     int_t *A_colind_gsmv; /* After pzgsmv_init(), the global column
@@ -207,7 +208,7 @@ extern void
 zCreate_Dense_Matrix_dist(SuperMatrix *, int_t, int_t, doublecomplex *, int_t,
 			  Stype_t, Dtype_t, Mtype_t);
 extern void
-zCreate_SuperNode_Matrix_dist(SuperMatrix *, int_t, int_t, int_t, doublecomplex *, 
+zCreate_SuperNode_Matrix_dist(SuperMatrix *, int_t, int_t, int_t, doublecomplex *,
 			      int_t *, int_t *, int_t *, int_t *, int_t *,
 			      Stype_t, Dtype_t, Mtype_t);
 extern void
@@ -218,16 +219,16 @@ extern void    zallocateA_dist (int_t, i
 extern void    zGenXtrue_dist (int_t, int_t, doublecomplex *, int_t);
 extern void    zFillRHS_dist (char *, int_t, doublecomplex *, int_t,
                               SuperMatrix *, doublecomplex *, int_t);
-extern int     zcreate_matrix(SuperMatrix *, int, doublecomplex **, int *, 
+extern int     zcreate_matrix(SuperMatrix *, int, doublecomplex **, int *,
 			      doublecomplex **, int *, FILE *, gridinfo_t *);
-extern int     zcreate_matrix_rb(SuperMatrix *, int, doublecomplex **, int *, 
+extern int     zcreate_matrix_rb(SuperMatrix *, int, doublecomplex **, int *,
 			      doublecomplex **, int *, FILE *, gridinfo_t *);
-extern int     zcreate_matrix_dat(SuperMatrix *, int, doublecomplex **, int *, 
+extern int     zcreate_matrix_dat(SuperMatrix *, int, doublecomplex **, int *,
 			      doublecomplex **, int *, FILE *, gridinfo_t *);
-extern int 	   zcreate_matrix_postfix(SuperMatrix *, int, doublecomplex **, int *, 
-				  doublecomplex **, int *, FILE *, char *, gridinfo_t *);				  
-				  
-	
+extern int 	   zcreate_matrix_postfix(SuperMatrix *, int, doublecomplex **, int *,
+				  doublecomplex **, int *, FILE *, char *, gridinfo_t *);
+
+
 /* Driver related */
 extern void    zgsequ_dist (SuperMatrix *, double *, double *, double *,
 			    double *, double *, int_t *);
@@ -250,16 +251,16 @@ extern int     sp_zgemv_dist (char *, do
 extern int     sp_zgemm_dist (char *, int, doublecomplex, SuperMatrix *,
                         doublecomplex *, int, doublecomplex, doublecomplex *, int);
 
-extern float zdistribute(fact_t, int_t, SuperMatrix *, Glu_freeable_t *, 
+extern float zdistribute(fact_t, int_t, SuperMatrix *, Glu_freeable_t *,
 			 LUstruct_t *, gridinfo_t *);
-extern void  pzgssvx_ABglobal(superlu_dist_options_t *, SuperMatrix *, 
+extern void  pzgssvx_ABglobal(superlu_dist_options_t *, SuperMatrix *,
 			      ScalePermstruct_t *, doublecomplex *,
 			      int, int, gridinfo_t *, LUstruct_t *, double *,
 			      SuperLUStat_t *, int *);
-extern float pzdistribute(fact_t, int_t, SuperMatrix *, 
-			 ScalePermstruct_t *, Glu_freeable_t *, 
+extern float pzdistribute(fact_t, int_t, SuperMatrix *,
+			 ScalePermstruct_t *, Glu_freeable_t *,
 			 LUstruct_t *, gridinfo_t *);
-extern void  pzgssvx(superlu_dist_options_t *, SuperMatrix *, 
+extern void  pzgssvx(superlu_dist_options_t *, SuperMatrix *,
 		     ScalePermstruct_t *, doublecomplex *,
 		     int, int, gridinfo_t *, LUstruct_t *,
 		     SOLVEstruct_t *, double *, SuperLUStat_t *, int *);
@@ -273,7 +274,7 @@ extern int_t pxgstrs_init(int_t, int_t,
 extern void pxgstrs_finalize(pxgstrs_comm_t *);
 extern int  zldperm_dist(int_t, int_t, int_t, int_t [], int_t [],
 		    doublecomplex [], int_t *, double [], double []);
-extern int  static_schedule(superlu_dist_options_t *, int, int, 
+extern int  static_schedule(superlu_dist_options_t *, int, int,
 		            LUstruct_t *, gridinfo_t *, SuperLUStat_t *,
 			    int_t *, int_t *, int *);
 extern void LUstructInit(const int_t, LUstruct_t *);
@@ -293,7 +294,7 @@ extern void pzgstrs(int_t, LUstruct_t *,
 		    SuperLUStat_t *, int *);
 extern void zlsum_fmod(doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *,
 		       int, int, int_t , int_t *, int_t, int_t, int_t,
-		       int_t *, gridinfo_t *, LocalLU_t *, 
+		       int_t *, gridinfo_t *, LocalLU_t *,
 		       MPI_Request [], SuperLUStat_t *);
 extern void zlsum_bmod(doublecomplex *, doublecomplex *, doublecomplex *,
                        int, int_t, int_t *, int_t *, Ucb_indptr_t **,
@@ -302,21 +303,21 @@ extern void zlsum_bmod(doublecomplex *,
 
 extern void zlsum_fmod_inv(doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *,
 		       int, int_t , int_t *,
-		       int_t *, gridinfo_t *, LocalLU_t *, 
+		       int_t *, gridinfo_t *, LocalLU_t *,
 		       SuperLUStat_t **, int_t *, int_t *, int_t, int_t, int_t, int_t, int, int);
 extern void zlsum_fmod_inv_master(doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *,
-		       int, int, int_t , int_t *, int_t, 
-		       int_t *, gridinfo_t *, LocalLU_t *, 
+		       int, int, int_t , int_t *, int_t,
+		       int_t *, gridinfo_t *, LocalLU_t *,
 		       SuperLUStat_t **, int_t, int_t, int_t, int_t, int, int);
 extern void zlsum_bmod_inv(doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *,
-                       int, int_t, int_t *, int_t *, int_t *, Ucb_indptr_t **,
+                       int, int_t, int_t *, int_t *, Ucb_indptr_t **,
                        int_t **, int_t *, gridinfo_t *, LocalLU_t *,
-		       MPI_Request [], SuperLUStat_t **, int_t *, int_t *, int_t, int_t, int, int);
+		       SuperLUStat_t **, int_t *, int_t *, int_t, int_t, int, int);
 extern void zlsum_bmod_inv_master(doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *,
-                       int, int_t, int_t *, int_t *, int_t *, Ucb_indptr_t **,
+                       int, int_t, int_t *, int_t *, Ucb_indptr_t **,
                        int_t **, int_t *, gridinfo_t *, LocalLU_t *,
-		       MPI_Request [], SuperLUStat_t **, int_t, int_t, int, int);			   
-			   
+		       SuperLUStat_t **, int_t, int_t, int, int);
+
 extern void pzgsrfs(int_t, SuperMatrix *, double, LUstruct_t *,
 		    ScalePermstruct_t *, gridinfo_t *,
 		    doublecomplex [], int_t, doublecomplex [], int_t, int,
@@ -360,22 +361,22 @@ extern void    zinf_norm_error_dist (int
                                      doublecomplex*, int_t, gridinfo_t*);
 extern void    pzinf_norm_error(int, int_t, int_t, doublecomplex [], int_t,
 				doublecomplex [], int_t , gridinfo_t *);
-extern void  zreadhb_dist (int, FILE *, int_t *, int_t *, int_t *, 
+extern void  zreadhb_dist (int, FILE *, int_t *, int_t *, int_t *,
 			   doublecomplex **, int_t **, int_t **);
 extern void  zreadtriple_dist(FILE *, int_t *, int_t *, int_t *,
 			 doublecomplex **, int_t **, int_t **);
 extern void  zreadtriple_noheader(FILE *, int_t *, int_t *, int_t *,
-			 doublecomplex **, int_t **, int_t **);			 
+			 doublecomplex **, int_t **, int_t **);
 extern void  zreadrb_dist(int, FILE *, int_t *, int_t *, int_t *,
 		     doublecomplex **, int_t **, int_t **);
 extern void  zreadMM_dist(FILE *, int_t *, int_t *, int_t *,
 	                  doublecomplex **, int_t **, int_t **);
 extern int  zread_binary(FILE *, int_t *, int_t *, int_t *,
-	                  doublecomplex **, int_t **, int_t **);	
-					  
+	                  doublecomplex **, int_t **, int_t **);
+
 /* Distribute the data for numerical factorization */
 extern float zdist_psymbtonum(fact_t, int_t, SuperMatrix *,
-                                ScalePermstruct_t *, Pslu_freeable_t *, 
+                                ScalePermstruct_t *, Pslu_freeable_t *,
                                 LUstruct_t *, gridinfo_t *);
 extern void pzGetDiagU(int_t, LUstruct_t *, gridinfo_t *, doublecomplex *);
 
@@ -388,7 +389,7 @@ extern void  zPrintUblocks(int, int_t, g
 extern void  zPrint_CompCol_Matrix_dist(SuperMatrix *);
 extern void  zPrint_Dense_Matrix_dist(SuperMatrix *);
 extern int   zPrint_CompRowLoc_Matrix_dist(SuperMatrix *);
-extern int   file_zPrint_CompRowLoc_Matrix_dist(FILE *fp, SuperMatrix *A);																			   
+extern int   file_zPrint_CompRowLoc_Matrix_dist(FILE *fp, SuperMatrix *A);
 extern void  PrintDoublecomplex(char *, int_t, doublecomplex *);
 extern int   file_PrintDoublecomplex(FILE *fp, char *, int_t, doublecomplex *);
 
@@ -401,12 +402,12 @@ extern void zgemm_(const char*, const ch
                   const int*, const doublecomplex*, doublecomplex*, const int*, int, int);
 extern void ztrsv_(char*, char*, char*, int*, doublecomplex*, int*,
                   doublecomplex*, int*, int, int, int);
-extern void ztrsm_(char*, char*, char*, char*, int*, int*, 
-                  doublecomplex*, doublecomplex*, int*, doublecomplex*, 
+extern void ztrsm_(char*, char*, char*, char*, int*, int*,
+                  doublecomplex*, doublecomplex*, int*, doublecomplex*,
                   int*, int, int, int, int);
-extern void zgemv_(char *, int *, int *, doublecomplex *, doublecomplex *a, int *, 
+extern void zgemv_(char *, int *, int *, doublecomplex *, doublecomplex *a, int *,
                   doublecomplex *, int *, doublecomplex *, doublecomplex *, int *, int);
-extern void ztrtri_(char*, char*, int*, doublecomplex*, int*,int*);				 
+extern void ztrtri_(char*, char*, int*, doublecomplex*, int*,int*);
 
 extern void zgeru_(int*, int*, doublecomplex*, doublecomplex*, int*,
                  doublecomplex*, int*, doublecomplex*, int*);
@@ -417,9 +418,9 @@ extern int zgemm_(const char*, const cha
                    const int*,  const doublecomplex*, doublecomplex*, const int*);
 extern int ztrsv_(char*, char*, char*, int*, doublecomplex*, int*,
                   doublecomplex*, int*);
-extern int ztrsm_(char*, char*, char*, char*, int*, int*, 
+extern int ztrsm_(char*, char*, char*, char*, int*, int*,
                   doublecomplex*, doublecomplex*, int*, doublecomplex*, int*);
-extern int zgemv_(char *, int *, int *, doublecomplex *, doublecomplex *a, int *, 
+extern int zgemv_(char *, int *, int *, doublecomplex *, doublecomplex *a, int *,
                   doublecomplex *, int *, doublecomplex *, doublecomplex *, int *);
 extern int zgeru_(int*, int*, doublecomplex*, doublecomplex*, int*,
                  doublecomplex*, int*, doublecomplex*, int*);
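
superlu_zdefs.h mirrors the double-precision header: whitespace cleanup plus the same dropped int_t * and MPI_Request [] arguments in zlsum_bmod_inv() and zlsum_bmod_inv_master(). The scalar type in these prototypes is SuperLU's own doublecomplex, a struct of two doubles rather than C99 double complex, so constants are built member-wise; a small hedged sketch (the helper name is hypothetical, not part of the library):

    /* Hedged sketch: doublecomplex is a plain struct { double r, i; } in
     * SuperLU_DIST, so complex scalars are assembled member-wise.
     * zmake() is a hypothetical helper, not a library function. */
    #include "superlu_zdefs.h"

    static doublecomplex zmake(double re, double im)
    {
        doublecomplex z;
        z.r = re;
        z.i = im;
        return z;
    }

    /* e.g. the by-value alpha/beta scalars taken by sp_zgemm_dist() above:
     *   doublecomplex one  = zmake(1.0, 0.0);
     *   doublecomplex zero = zmake(0.0, 0.0);
     */
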
diff -pruN 6.1.0+dfsg1-1/SRC/zdistribute.c 6.1.1+dfsg1-1/SRC/zdistribute.c
--- 6.1.0+dfsg1-1/SRC/zdistribute.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/zdistribute.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,15 +1,15 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
-/*! @file 
+/*! @file
  * \brief Distribute the matrix onto the 2D process mesh.
  *
  * <pre>
@@ -27,10 +27,10 @@ at the top-level directory.
  * Purpose
  * =======
  *   Distribute the matrix onto the 2D process mesh.
- * 
+ *
  * Arguments
  * =========
- * 
+ *
  * fact (input) fact_t
  *        Specifies whether or not the L and U structures will be re-used.
  *        = SamePattern_SameRowPerm: L and U structures are input, and
@@ -59,22 +59,22 @@ at the top-level directory.
  */
 
 float
-zdistribute(fact_t fact, int_t n, SuperMatrix *A, 
+zdistribute(fact_t fact, int_t n, SuperMatrix *A,
             Glu_freeable_t *Glu_freeable,
 	    LUstruct_t *LUstruct, gridinfo_t *grid)
 {
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
     LocalLU_t *Llu = LUstruct->Llu;
-    int_t bnnz, fsupc, fsupc1, i, ii, irow, istart, j, ib, jb, jj, k, k1, 
+    int_t bnnz, fsupc, fsupc1, i, ii, irow, istart, j, ib, jb, jj, k, k1,
           len, len1, nsupc;
 	int_t lib;  /* local block row number */
-	int_t nlb;  /* local block rows*/		  
+	int_t nlb;  /* local block rows*/
     int_t ljb;  /* local block column number */
     int_t nrbl; /* number of L blocks in current block column */
     int_t nrbu; /* number of U blocks in current block column */
     int_t gb;   /* global block number; 0 < gb <= nsuper */
     int_t lb;   /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */
-	int_t ub,gik,iklrow,fnz;    
+	int_t ub,gik,iklrow,fnz;
 	int iam, jbrow, kcol, krow, mycol, myrow, pc, pr;
     int_t mybufmax[NBUFFERS];
     NCPformat *Astore;
@@ -82,31 +82,31 @@ zdistribute(fact_t fact, int_t n, SuperM
     int_t *asub;
     int_t *xa_begin, *xa_end;
     int_t *xsup = Glu_persist->xsup;    /* supernode and column mapping */
-    int_t *supno = Glu_persist->supno;   
+    int_t *supno = Glu_persist->supno;
     int_t *lsub, *xlsub, *usub, *usub1, *xusub;
     int_t nsupers;
     int_t next_lind;      /* next available position in index[*] */
     int_t next_lval;      /* next available position in nzval[*] */
     int_t *index;         /* indices consist of headers and row subscripts */
-	int_t *index_srt;         /* indices consist of headers and row subscripts */     
+	int_t *index_srt;         /* indices consist of headers and row subscripts */
 	int   *index1;        /* temporary pointer to array of int */
     doublecomplex *lusup, *lusup_srt, *uval; /* nonzero values in L and U */
     doublecomplex **Lnzval_bc_ptr;  /* size ceil(NSUPERS/Pc) */
     int_t  **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */
-	int_t   **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc)                 */		    
-	int_t   *Unnz; /* size ceil(NSUPERS/Pc)                 */			
+	int_t   **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc)                 */
+	int_t   *Unnz; /* size ceil(NSUPERS/Pc)                 */
     doublecomplex **Unzval_br_ptr;  /* size ceil(NSUPERS/Pr) */
     int_t  **Ufstnz_br_ptr;  /* size ceil(NSUPERS/Pr) */
 	BcTree  *LBtree_ptr;       /* size ceil(NSUPERS/Pc)                */
 	RdTree  *LRtree_ptr;		  /* size ceil(NSUPERS/Pr)                */
 	BcTree  *UBtree_ptr;       /* size ceil(NSUPERS/Pc)                */
-	RdTree  *URtree_ptr;		  /* size ceil(NSUPERS/Pr)                */	
+	RdTree  *URtree_ptr;		  /* size ceil(NSUPERS/Pr)                */
 	int msgsize;
 
     int_t  *Urbs,*Urbs1; /* Number of row blocks in each block column of U. */
     Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
-    int_t  **Ucb_valptr;      /* Vertical linked list pointing to Unzval[] */  	
-	
+    int_t  **Ucb_valptr;      /* Vertical linked list pointing to Unzval[] */
+
     /*-- Counts to be used in factorization. --*/
     int  *ToRecv, *ToSendD, **ToSendR;
 
@@ -122,7 +122,7 @@ zdistribute(fact_t fact, int_t n, SuperM
     int_t  **bsendx_plist; /* Column process list to send down Xk.   */
     int_t  nbrecvx = 0;    /* Number of Xk I will receive.           */
     int_t  nbsendx = 0;    /* Number of Xk I will send               */
-    int_t  *ilsum;         /* starting position of each supernode in 
+    int_t  *ilsum;         /* starting position of each supernode in
 			      the full array (local)                 */
 
     /*-- Auxiliary arrays; freed on return --*/
@@ -142,7 +142,7 @@ zdistribute(fact_t fact, int_t n, SuperM
 	int_t *idxs;
 	int_t **nzrows;
 	double rseed;
-	int rank_cnt,rank_cnt_ref,Root;    	
+	int rank_cnt,rank_cnt_ref,Root;
     doublecomplex *dense, *dense_col; /* SPA */
     doublecomplex zero = {0.0, 0.0};
     int_t  ldaspa;     /* LDA of SPA */
@@ -153,18 +153,18 @@ zdistribute(fact_t fact, int_t n, SuperM
     int_t *frecv, *brecv, *lloc;
     doublecomplex **Linv_bc_ptr;  /* size ceil(NSUPERS/Pc) */
     doublecomplex **Uinv_bc_ptr;  /* size ceil(NSUPERS/Pc) */
-    double *SeedSTD_BC,*SeedSTD_RD;				 
+    double *SeedSTD_BC,*SeedSTD_RD;
     int_t idx_indx,idx_lusup;
     int_t nbrow;
     int_t  ik, il, lk, rel, knsupc, idx_r;
     int_t  lptr1_tmp, idx_i, idx_v,m, uu;
     int_t nub;
-    int tag;		
-	
+    int tag;
+
 #if ( PRNTlevel>=1 )
     int_t nLblocks = 0, nUblocks = 0;
 #endif
-#if ( PROFlevel>=1 ) 
+#if ( PROFlevel>=1 )
     double t, t_u, t_l;
     int_t u_blks;
 #endif
@@ -213,7 +213,7 @@ zdistribute(fact_t fact, int_t n, SuperM
 	Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
 	Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
 	Unzval_br_ptr = Llu->Unzval_br_ptr;
-	Unnz = Llu->Unnz;	
+	Unnz = Llu->Unnz;
 
 	mem_use += 2.0*nrbu*iword + ldaspa*sp_ienv_dist(3)*zword;
 
@@ -320,9 +320,9 @@ zdistribute(fact_t fact, int_t n, SuperM
 			   t_l, t_u, u_blks, nrbu);
 #endif
 
-    } else { 
+    } else {
         /* --------------------------------------------------
-         * FIRST TIME CREATING THE L AND U DATA STRUCTURE. 
+         * FIRST TIME CREATING THE L AND U DATA STRUCTURE.
          * -------------------------------------------------- */
 
 #if ( PROFlevel>=1 )
@@ -335,7 +335,7 @@ zdistribute(fact_t fact, int_t n, SuperM
 	xlsub = Glu_freeable->xlsub;
 	usub = Glu_freeable->usub;    /* compressed U subscripts */
 	xusub = Glu_freeable->xusub;
-    
+
 	if ( !(ToRecv = SUPERLU_MALLOC(nsupers * sizeof(int))) )
 	    ABORT("Malloc fails for ToRecv[].");
 	for (i = 0; i < nsupers; ++i) ToRecv[i] = 0;
@@ -354,12 +354,12 @@ zdistribute(fact_t fact, int_t n, SuperM
 	k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
 
 	/* Pointers to the beginning of each block row of U. */
-	if ( !(Unzval_br_ptr = 
+	if ( !(Unzval_br_ptr =
                (doublecomplex**)SUPERLU_MALLOC(k * sizeof(doublecomplex*))) )
 	    ABORT("Malloc fails for Unzval_br_ptr[].");
 	if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
 	    ABORT("Malloc fails for Ufstnz_br_ptr[].");
-	
+
 	if ( !(ToSendD = SUPERLU_MALLOC(k * sizeof(int))) )
 	    ABORT("Malloc fails for ToSendD[].");
 	for (i = 0; i < k; ++i) ToSendD[i] = NO;
@@ -392,13 +392,13 @@ zdistribute(fact_t fact, int_t n, SuperM
 		ilsum[lb + 1] = ilsum[lb] + i;
 	    }
 	}
-	
-            
+
+
 	/* ------------------------------------------------------------
 	   COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U.
 	   THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U).
 	   ------------------------------------------------------------*/
-	
+
 	/* Loop through each supernode column. */
 	for (jb = 0; jb < nsupers; ++jb) {
 	    pc = PCOL( jb, grid );
@@ -435,7 +435,7 @@ zdistribute(fact_t fact, int_t n, SuperM
 		} /* for i ... */
 	    } /* for j ... */
 	} /* for jb ... */
-	
+
 	/* Set up the initial pointers for each block row in U. */
 	nrbu = CEILING( nsupers, grid->nprow );/* Number of local block rows */
 	for (lb = 0; lb < nrbu; ++lb) {
@@ -493,7 +493,7 @@ zdistribute(fact_t fact, int_t n, SuperM
 	    ABORT("Calloc fails for fmod[].");
 	if ( !(bmod = intCalloc_dist(k)) )
 	    ABORT("Calloc fails for bmod[].");
-#if ( PRNTlevel>=1 )	
+#if ( PRNTlevel>=1 )
 	mem_use += 6.0*k*iword + ldaspa*sp_ienv_dist(3)*zword;
 #endif
 	k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
@@ -505,26 +505,26 @@ zdistribute(fact_t fact, int_t n, SuperM
 	    ABORT("Malloc fails for Lrowind_bc_ptr[].");
 	Lrowind_bc_ptr[k-1] = NULL;
 
-	if ( !(Lindval_loc_bc_ptr = 
+	if ( !(Lindval_loc_bc_ptr =
 				(int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
 		ABORT("Malloc fails for Lindval_loc_bc_ptr[].");
 	Lindval_loc_bc_ptr[k-1] = NULL;
 
-	if ( !(Linv_bc_ptr = 
+	if ( !(Linv_bc_ptr =
 				(doublecomplex**)SUPERLU_MALLOC(k * sizeof(doublecomplex*))) ) {
 		fprintf(stderr, "Malloc fails for Linv_bc_ptr[].");
-	}  
-	if ( !(Uinv_bc_ptr = 
+	}
+	if ( !(Uinv_bc_ptr =
 				(doublecomplex**)SUPERLU_MALLOC(k * sizeof(doublecomplex*))) ) {
 		fprintf(stderr, "Malloc fails for Uinv_bc_ptr[].");
-	}  
+	}
 	Linv_bc_ptr[k-1] = NULL;
-	Uinv_bc_ptr[k-1] = NULL;	
-	
-	if ( !(Unnz = 
+	Uinv_bc_ptr[k-1] = NULL;
+
+	if ( !(Unnz =
 			(int_t*)SUPERLU_MALLOC(k * sizeof(int_t))) )
-	ABORT("Malloc fails for Unnz[].");	
-	
+	ABORT("Malloc fails for Unnz[].");
+
 	/* These lists of processes will be used for triangular solves. */
 	if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
 	    ABORT("Malloc fails for fsendx_plist[].");
@@ -555,7 +555,7 @@ zdistribute(fact_t fact, int_t n, SuperM
 		fsupc = FstBlockC( jb );
 		nsupc = SuperSize( jb );
 		ljb = LBj( jb, grid ); /* Local block number */
-		
+
 		/* Scatter A into SPA. */
 		for (j = fsupc, dense_col = dense; j < FstBlockC( jb+1 ); ++j){
 		    for (i = xa_begin[j]; i < xa_end[j]; ++i) {
@@ -600,7 +600,7 @@ zdistribute(fact_t fact, int_t n, SuperM
 			    index = Ufstnz_br_ptr[lb];
 			    uval = Unzval_br_ptr[lb];
 			    fsupc1 = FstBlockC( gb+1 );
-			    if (rb_marker[lb] <= jb) { /* First time see 
+			    if (rb_marker[lb] <= jb) { /* First time see
 							  the block       */
 				rb_marker[lb] = jb + 1;
 				Urb_indptr[lb] = Urb_fstnz[lb];;
@@ -685,15 +685,15 @@ zdistribute(fact_t fact, int_t n, SuperM
 		} /* for i ... */
 
 		if ( nrbl ) { /* Do not ensure the blocks are sorted! */
-		    /* Set up the initial pointers for each block in 
+		    /* Set up the initial pointers for each block in
 		       index[] and nzval[]. */
 		    /* Add room for descriptors */
 		    len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
-			if ( !(index = intMalloc_dist(len1)) ) 
-				ABORT("Malloc fails for index[]");												 			 
+			if ( !(index = intMalloc_dist(len1)) )
+				ABORT("Malloc fails for index[]");
 			if (!(lusup = (doublecomplex*)SUPERLU_MALLOC(len*nsupc * sizeof(doublecomplex))))
-				ABORT("Malloc fails for lusup[]");			
-			if ( !(Lindval_loc_bc_ptr[ljb] = intCalloc_dist(nrbl*3) )) 
+				ABORT("Malloc fails for lusup[]");
+			if ( !(Lindval_loc_bc_ptr[ljb] = intCalloc_dist(nrbl*3) ))
 				ABORT("Malloc fails for Lindval_loc_bc_ptr[ljb][]");
 			if (!(Linv_bc_ptr[ljb] = (doublecomplex*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(doublecomplex))))
 				ABORT("Malloc fails for Linv_bc_ptr[ljb][]");
@@ -712,10 +712,10 @@ zdistribute(fact_t fact, int_t n, SuperM
 			len = Lrb_length[lb];
 			Lindval_loc_bc_ptr[ljb][k] = lb;
 			Lindval_loc_bc_ptr[ljb][k+nrbl] = next_lind;
-			Lindval_loc_bc_ptr[ljb][k+nrbl*2] = next_lval;				
+			Lindval_loc_bc_ptr[ljb][k+nrbl*2] = next_lval;
 			Lrb_length[lb] = 0;  /* Reset vector of block length */
 			index[next_lind++] = gb; /* Descriptor */
-			index[next_lind++] = len; 
+			index[next_lind++] = len;
 			Lrb_indptr[lb] = next_lind;
 			Lrb_valptr[lb] = next_lval;
 			next_lind += len;
@@ -743,7 +743,7 @@ zdistribute(fact_t fact, int_t n, SuperM
 			}
 		    } /* for i ... */
 			Lrowind_bc_ptr[ljb] = index;
-			Lnzval_bc_ptr[ljb] = lusup; 
+			Lnzval_bc_ptr[ljb] = lusup;
 
 
 			/* sort Lindval_loc_bc_ptr[ljb], Lrowind_bc_ptr[ljb] and Lnzval_bc_ptr[ljb] here*/
@@ -753,15 +753,15 @@ zdistribute(fact_t fact, int_t n, SuperM
 					uu=nrbl-2;
 					lloc = &Lindval_loc_bc_ptr[ljb][1];
 				}else{
-					uu=nrbl-1;	
+					uu=nrbl-1;
 					lloc = Lindval_loc_bc_ptr[ljb];
-				}	
-				quickSortM(lloc,0,uu,nrbl,0,3);	
+				}
+				quickSortM(lloc,0,uu,nrbl,0,3);
 			}
 
 
-			if ( !(index_srt = intMalloc_dist(len1)) ) 
-				ABORT("Malloc fails for index_srt[]");				
+			if ( !(index_srt = intMalloc_dist(len1)) )
+				ABORT("Malloc fails for index_srt[]");
 			if (!(lusup_srt = (doublecomplex*)SUPERLU_MALLOC(len*nsupc * sizeof(doublecomplex))))
 				ABORT("Malloc fails for lusup_srt[]");
 
@@ -776,26 +776,26 @@ zdistribute(fact_t fact, int_t n, SuperM
 					index_srt[idx_indx++] = index[Lindval_loc_bc_ptr[ljb][i+nrbl]+jj];
 				}
 
-				Lindval_loc_bc_ptr[ljb][i+nrbl] = idx_indx - LB_DESCRIPTOR - nbrow; 
+				Lindval_loc_bc_ptr[ljb][i+nrbl] = idx_indx - LB_DESCRIPTOR - nbrow;
 
 				for (jj=0;jj<nbrow;jj++){
 					k=idx_lusup;
 					k1=Lindval_loc_bc_ptr[ljb][i+nrbl*2]+jj;
-					for (j = 0; j < nsupc; ++j) {				
+					for (j = 0; j < nsupc; ++j) {
 						lusup_srt[k] = lusup[k1];
 						k += len;
 						k1 += len;
-					}	
+					}
 					idx_lusup++;
-				}				
-				Lindval_loc_bc_ptr[ljb][i+nrbl*2] = idx_lusup - nbrow;	
+				}
+				Lindval_loc_bc_ptr[ljb][i+nrbl*2] = idx_lusup - nbrow;
 			}
 
 			SUPERLU_FREE(lusup);
 			SUPERLU_FREE(index);
 
 			Lrowind_bc_ptr[ljb] = index_srt;
-			Lnzval_bc_ptr[ljb] = lusup_srt; 			
+			Lnzval_bc_ptr[ljb] = lusup_srt;
 
 			// if(ljb==0)
 			// for (jj=0;jj<nrbl*3;jj++){
@@ -804,15 +804,15 @@ zdistribute(fact_t fact, int_t n, SuperM
 			// }
 			// for (jj=0;jj<nrbl;jj++){
 			// printf("iam %5d Lindval %5d\n",iam, index[Lindval_loc_bc_ptr[ljb][jj+nrbl]]);
-			// fflush(stdout);			
+			// fflush(stdout);
 
-			// }	
+			// }
 		} else {
 		    Lrowind_bc_ptr[ljb] = NULL;
 		    Lnzval_bc_ptr[ljb] = NULL;
 			Linv_bc_ptr[ljb] = NULL;
 			Uinv_bc_ptr[ljb] = NULL;
-			Lindval_loc_bc_ptr[ljb] = NULL;			
+			Lindval_loc_bc_ptr[ljb] = NULL;
 		} /* if nrbl ... */
 #if ( PROFlevel>=1 )
 		t_l += SuperLU_timer_() - t;
@@ -822,7 +822,7 @@ zdistribute(fact_t fact, int_t n, SuperM
 	} /* for jb ... */
 
 	/////////////////////////////////////////////////////////////////
-	
+
 	/* Set up additional pointers for the index and value arrays of U.
 	   nub is the number of local block columns. */
 	nub = CEILING( nsupers, grid->npcol); /* Number of local block columns. */
@@ -836,7 +836,7 @@ zdistribute(fact_t fact, int_t n, SuperM
 		ABORT("Malloc fails for Ucb_valptr[]");
 	nlb = CEILING( nsupers, grid->nprow ); /* Number of local block rows. */
 
-	/* Count number of row blocks in a block column. 
+	/* Count number of row blocks in a block column.
 	   One pass of the skeleton graph of U. */
 	for (lk = 0; lk < nlb; ++lk) {
 		usub1 = Ufstnz_br_ptr[lk];
@@ -875,20 +875,20 @@ zdistribute(fact_t fact, int_t n, SuperM
 
 				Ucb_indptr[ljb][Urbs1[ljb]].indpos = i;
 				Ucb_valptr[ljb][Urbs1[ljb]] = j;
-				
+
 				++Urbs1[ljb];
 				j += usub1[i+1];
 				i += UB_DESCRIPTOR + SuperSize( k );
 			}
 		}
-	}				
-	
+	}
+
 
-/* Count the nnzs per block column */	
+/* Count the nnzs per block column */
 	for (lb = 0; lb < nub; ++lb) {
 		Unnz[lb] = 0;
 		k = lb * grid->npcol + mycol;/* Global block number, column-wise. */
-		knsupc = SuperSize( k );	
+		knsupc = SuperSize( k );
 		for (ub = 0; ub < Urbs[lb]; ++ub) {
 			ik = Ucb_indptr[lb][ub].lbnum; /* Local block number, row-wise. */
 			i = Ucb_indptr[lb][ub].indpos; /* Start of the block in usub[]. */
@@ -902,40 +902,40 @@ zdistribute(fact_t fact, int_t n, SuperM
 				}
 			} /* for jj ... */
 		}
-	}			
-	
+	}
+
 	/////////////////////////////////////////////////////////////////
 
 #if ( PROFlevel>=1 )
 		t = SuperLU_timer_();
-#endif				
+#endif
 	/* construct the Bcast tree for L ... */
 
 	k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
 	if ( !(LBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) )
 		ABORT("Malloc fails for LBtree_ptr[].");
 	if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) )
-		ABORT("Calloc fails for ActiveFlag[].");	
+		ABORT("Calloc fails for ActiveFlag[].");
 	if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) )
-		ABORT("Malloc fails for ranks[].");	
+		ABORT("Malloc fails for ranks[].");
 	if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) )
-		ABORT("Malloc fails for SeedSTD_BC[].");	
+		ABORT("Malloc fails for SeedSTD_BC[].");
+
 
-		
 	for (i=0;i<k;i++){
-		SeedSTD_BC[i]=rand();		
+		SeedSTD_BC[i]=rand();
 	}
 
-	MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_BC[0],k,MPI_DOUBLE,MPI_MAX,grid->cscp.comm);					  
+	MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_BC[0],k,MPI_DOUBLE,MPI_MAX,grid->cscp.comm);
 
 	for (ljb = 0; ljb <k ; ++ljb) {
 		LBtree_ptr[ljb]=NULL;
-	}			
-	
+	}
+
 
 	if ( !(ActiveFlagAll = intMalloc_dist(grid->nprow*k)) )
-		ABORT("Calloc fails for ActiveFlag[].");				
-	for (j=0;j<grid->nprow*k;++j)ActiveFlagAll[j]=3*nsupers;	
+		ABORT("Calloc fails for ActiveFlag[].");
+	for (j=0;j<grid->nprow*k;++j)ActiveFlagAll[j]=3*nsupers;
 	for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */
 		jb = mycol+ljb*grid->npcol;  /* not sure */
 		if(jb<nsupers){
@@ -951,10 +951,10 @@ zdistribute(fact_t fact, int_t n, SuperM
 			ActiveFlagAll[pr+ljb*grid->nprow]=SUPERLU_MIN(ActiveFlagAll[pr+ljb*grid->nprow],gb);
 		} /* for j ... */
 		}
-	}			
-	
+	}
+
 	for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */
-		
+
 		jb = mycol+ljb*grid->npcol;  /* not sure */
 		if(jb<nsupers){
 		pc = PCOL( jb, grid );
@@ -963,19 +963,19 @@ zdistribute(fact_t fact, int_t n, SuperM
 		for (j=0;j<grid->nprow;++j)ActiveFlag[j+grid->nprow]=j;
 		for (j=0;j<grid->nprow;++j)ranks[j]=-1;
 
-		Root=-1; 
-		Iactive = 0;				
+		Root=-1;
+		Iactive = 0;
 		for (j=0;j<grid->nprow;++j){
 			if(ActiveFlag[j]!=3*nsupers){
 			gb = ActiveFlag[j];
 			pr = PROW( gb, grid );
 			if(gb==jb)Root=pr;
-			if(myrow==pr)Iactive=1;		
-			}					
+			if(myrow==pr)Iactive=1;
+			}
 		}
-		
 
-		quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,0,2);	
+
+		quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,0,2);
 
 		if(Iactive==1){
 			// printf("jb %5d damn\n",jb);
@@ -988,7 +988,7 @@ zdistribute(fact_t fact, int_t n, SuperM
 					ranks[rank_cnt]=ActiveFlag[j+grid->nprow];
 					++rank_cnt;
 				}
-			}		
+			}
 
 			if(rank_cnt>1){
 
@@ -998,7 +998,7 @@ zdistribute(fact_t fact, int_t n, SuperM
 				// rseed=rand();
 				// rseed=1.0;
 				msgsize = SuperSize( jb );
-				LBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'z');  	
+				LBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'z');
 				BcTree_SetTag(LBtree_ptr[ljb],BC_L,'z');
 
 				// printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt);
@@ -1009,15 +1009,15 @@ zdistribute(fact_t fact, int_t n, SuperM
 				// fflush(stdout);
 				// }
 
-				// #if ( PRNTlevel>=1 )		
+				// #if ( PRNTlevel>=1 )
 				if(Root==myrow){
 					rank_cnt_ref=1;
 					for (j = 0; j < grid->nprow; ++j) {
-						if ( fsendx_plist[ljb][j] != EMPTY ) {	
-							++rank_cnt_ref;		
+						if ( fsendx_plist[ljb][j] != EMPTY ) {
+							++rank_cnt_ref;
 						}
 					}
-					assert(rank_cnt==rank_cnt_ref);		
+					assert(rank_cnt==rank_cnt_ref);
 
 					// printf("Partial Bcast Procs: col%7d np%4d\n",jb,rank_cnt);
 
@@ -1026,27 +1026,27 @@ zdistribute(fact_t fact, int_t n, SuperM
 					// // printf("\n");
 				}
 				// #endif
-			}	
+			}
 		}
 		}
 	}
 
-	
+
 	SUPERLU_FREE(ActiveFlag);
 	SUPERLU_FREE(ActiveFlagAll);
 	SUPERLU_FREE(ranks);
 	SUPERLU_FREE(SeedSTD_BC);
-	
-	
+
+
 #if ( PROFlevel>=1 )
 t = SuperLU_timer_() - t;
 if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t);
-#endif			
+#endif
 
 
 #if ( PROFlevel>=1 )
 		t = SuperLU_timer_();
-#endif			
+#endif
 	/* construct the Reduce tree for L ... */
 	/* the following is used as reference */
 	nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */
@@ -1075,24 +1075,24 @@ if ( !iam) printf(".. Construct Bcast tr
 	if ( !(LRtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) )
 		ABORT("Malloc fails for LRtree_ptr[].");
 	if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) )
-		ABORT("Calloc fails for ActiveFlag[].");	
+		ABORT("Calloc fails for ActiveFlag[].");
 	if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) )
-		ABORT("Malloc fails for ranks[].");	
+		ABORT("Malloc fails for ranks[].");
 
 	// if ( !(idxs = intCalloc_dist(nsupers)) )
-		// ABORT("Calloc fails for idxs[].");	
+		// ABORT("Calloc fails for idxs[].");
 
 	// if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) )
 		// ABORT("Malloc fails for nzrows[].");
 
 	if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) )
-		ABORT("Malloc fails for SeedSTD_RD[].");	
+		ABORT("Malloc fails for SeedSTD_RD[].");
 
 	for (i=0;i<k;i++){
-		SeedSTD_RD[i]=rand();		
+		SeedSTD_RD[i]=rand();
 	}
 
-	MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_RD[0],k,MPI_DOUBLE,MPI_MAX,grid->rscp.comm);					  
+	MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_RD[0],k,MPI_DOUBLE,MPI_MAX,grid->rscp.comm);
 
 
 	// for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */
@@ -1118,11 +1118,11 @@ if ( !iam) printf(".. Construct Bcast tr
 		LRtree_ptr[lib]=NULL;
 	}
 
-	
+
 	if ( !(ActiveFlagAll = intMalloc_dist(grid->npcol*k)) )
-		ABORT("Calloc fails for ActiveFlagAll[].");				
-	for (j=0;j<grid->npcol*k;++j)ActiveFlagAll[j]=-3*nsupers;	
-				
+		ABORT("Calloc fails for ActiveFlagAll[].");
+	for (j=0;j<grid->npcol*k;++j)ActiveFlagAll[j]=-3*nsupers;
+
 	for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */
 		fsupc = FstBlockC( jb );
 		pc = PCOL( jb, grid );
@@ -1137,7 +1137,7 @@ if ( !iam) printf(".. Construct Bcast tr
 		}
 	}
 
-	
+
 	for (lib=0;lib<k;++lib){
 		ib = myrow+lib*grid->nprow;  /* not sure */
 		if(ib<nsupers){
@@ -1145,19 +1145,19 @@ if ( !iam) printf(".. Construct Bcast tr
 			for (j=0;j<grid->npcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];;
 			for (j=0;j<grid->npcol;++j)ActiveFlag[j+grid->npcol]=j;
 			for (j=0;j<grid->npcol;++j)ranks[j]=-1;
-			Root=-1; 
-			Iactive = 0;				
+			Root=-1;
+			Iactive = 0;
 
 			for (j=0;j<grid->npcol;++j){
 				if(ActiveFlag[j]!=-3*nsupers){
 				jb = ActiveFlag[j];
 				pc = PCOL( jb, grid );
 				if(jb==ib)Root=pc;
-				if(mycol==pc)Iactive=1;		
-				}					
+				if(mycol==pc)Iactive=1;
+				}
 			}
-		
-		
+
+
 			quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,1,2);
 
 			if(Iactive==1){
@@ -1173,7 +1173,7 @@ if ( !iam) printf(".. Construct Bcast tr
 				if(rank_cnt>1){
 
 					for (ii=0;ii<rank_cnt;ii++)   // use global ranks rather than local ranks
-						ranks[ii] = PNUM( pr, ranks[ii], grid );		
+						ranks[ii] = PNUM( pr, ranks[ii], grid );
 
 					// rseed=rand();
 					// rseed=1.0;
@@ -1181,7 +1181,7 @@ if ( !iam) printf(".. Construct Bcast tr
 
 					// if(ib==0){
 
-					LRtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'z');  	
+					LRtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'z');
 					RdTree_SetTag(LRtree_ptr[lib], RD_L,'z');
 					// }
 
@@ -1193,7 +1193,7 @@ if ( !iam) printf(".. Construct Bcast tr
 					// if(iam==15 || iam==3){
 					// printf("iam %5d rtree lk %5d tag %5d root %5d\n",iam,lib,ib,RdTree_IsRoot(LRtree_ptr[lib],'z'));
 					// fflush(stdout);
-					// }		
+					// }
 
 
 					// #if ( PRNTlevel>=1 )
@@ -1204,10 +1204,10 @@ if ( !iam) printf(".. Construct Bcast tr
 					// // // for(j=0;j<rank_cnt;++j)printf("%4d",ranks[j]);
 					// // printf("\n");
 					// }
-					// #endif		
+					// #endif
 				}
-			}				
-		}	
+			}
+		}
 	}
 
 	SUPERLU_FREE(mod_bit);
@@ -1216,9 +1216,9 @@ if ( !iam) printf(".. Construct Bcast tr
 
 	SUPERLU_FREE(ActiveFlag);
 	SUPERLU_FREE(ActiveFlagAll);
-	SUPERLU_FREE(ranks);	
-	// SUPERLU_FREE(idxs);	 
-	SUPERLU_FREE(SeedSTD_RD);	
+	SUPERLU_FREE(ranks);
+	// SUPERLU_FREE(idxs);
+	SUPERLU_FREE(SeedSTD_RD);
 	// for(i=0;i<nsupers;++i){
 		// if(nzrows[i])SUPERLU_FREE(nzrows[i]);
 	// }
@@ -1229,11 +1229,11 @@ if ( !iam) printf(".. Construct Bcast tr
 #if ( PROFlevel>=1 )
 t = SuperLU_timer_() - t;
 if ( !iam) printf(".. Construct Reduce tree for L: %.2f\t\n", t);
-#endif					
+#endif
 
 #if ( PROFlevel>=1 )
 	t = SuperLU_timer_();
-#endif	
+#endif
 
 	/* construct the Bcast tree for U ... */
 
@@ -1241,27 +1241,27 @@ if ( !iam) printf(".. Construct Reduce t
 	if ( !(UBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) )
 		ABORT("Malloc fails for UBtree_ptr[].");
 	if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) )
-		ABORT("Calloc fails for ActiveFlag[].");	
+		ABORT("Calloc fails for ActiveFlag[].");
 	if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) )
-		ABORT("Malloc fails for ranks[].");	
+		ABORT("Malloc fails for ranks[].");
 	if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) )
-		ABORT("Malloc fails for SeedSTD_BC[].");	
+		ABORT("Malloc fails for SeedSTD_BC[].");
 
 	for (i=0;i<k;i++){
-		SeedSTD_BC[i]=rand();		
+		SeedSTD_BC[i]=rand();
 	}
 
-	MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_BC[0],k,MPI_DOUBLE,MPI_MAX,grid->cscp.comm);					  
+	MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_BC[0],k,MPI_DOUBLE,MPI_MAX,grid->cscp.comm);
 
 
 	for (ljb = 0; ljb <k ; ++ljb) {
 		UBtree_ptr[ljb]=NULL;
-	}	
+	}
 
 	if ( !(ActiveFlagAll = intMalloc_dist(grid->nprow*k)) )
-		ABORT("Calloc fails for ActiveFlagAll[].");				
-	for (j=0;j<grid->nprow*k;++j)ActiveFlagAll[j]=-3*nsupers;	
-	
+		ABORT("Calloc fails for ActiveFlagAll[].");
+	for (j=0;j<grid->nprow*k;++j)ActiveFlagAll[j]=-3*nsupers;
+
 	for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */
 		jb = mycol+ljb*grid->npcol;  /* not sure */
 		if(jb<nsupers){
@@ -1278,21 +1278,21 @@ if ( !iam) printf(".. Construct Reduce t
 				pr = PROW( gb, grid );
 				ActiveFlagAll[pr+ljb*grid->nprow]=SUPERLU_MAX(ActiveFlagAll[pr+ljb*grid->nprow],gb);
 			// printf("gb:%5d jb: %5d nsupers: %5d\n",gb,jb,nsupers);
-			// fflush(stdout);								
+			// fflush(stdout);
 				//if(gb==jb)Root=pr;
 			}
-			
-			
+
+
 		}
 		pr = PROW( jb, grid ); // take care of diagonal node stored as L
 		// printf("jb %5d current: %5d",jb,ActiveFlagAll[pr+ljb*grid->nprow]);
 		// fflush(stdout);
-		ActiveFlagAll[pr+ljb*grid->nprow]=SUPERLU_MAX(ActiveFlagAll[pr+ljb*grid->nprow],jb);	
+		ActiveFlagAll[pr+ljb*grid->nprow]=SUPERLU_MAX(ActiveFlagAll[pr+ljb*grid->nprow],jb);
 		}
-	}	
-		
-		
-		
+	}
+
+
+
 	for (ljb = 0; ljb < k; ++ljb) { /* for each block column ... */
 		jb = mycol+ljb*grid->npcol;  /* not sure */
 		if(jb<nsupers){
@@ -1303,18 +1303,18 @@ if ( !iam) printf(".. Construct Reduce t
 		for (j=0;j<grid->nprow;++j)ActiveFlag[j+grid->nprow]=j;
 		for (j=0;j<grid->nprow;++j)ranks[j]=-1;
 
-		Root=-1; 
-		Iactive = 0;				
+		Root=-1;
+		Iactive = 0;
 		for (j=0;j<grid->nprow;++j){
 			if(ActiveFlag[j]!=-3*nsupers){
 			gb = ActiveFlag[j];
 			pr = PROW( gb, grid );
 			if(gb==jb)Root=pr;
-			if(myrow==pr)Iactive=1;		
+			if(myrow==pr)Iactive=1;
 			}
-		}						
-		
-		quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,1,2);	
+		}
+
+		quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,1,2);
 	// printf("jb: %5d Iactive %5d\n",jb,Iactive);
 	// fflush(stdout);
 		if(Iactive==1){
@@ -1328,7 +1328,7 @@ if ( !iam) printf(".. Construct Reduce t
 					ranks[rank_cnt]=ActiveFlag[j+grid->nprow];
 					++rank_cnt;
 				}
-			}		
+			}
 	// printf("jb: %5d rank_cnt %5d\n",jb,rank_cnt);
 	// fflush(stdout);
 			if(rank_cnt>1){
@@ -1338,42 +1338,42 @@ if ( !iam) printf(".. Construct Reduce t
 				// rseed=rand();
 				// rseed=1.0;
 				msgsize = SuperSize( jb );
-				UBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'z');  	
+				UBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'z');
 				BcTree_SetTag(UBtree_ptr[ljb],BC_U,'z');
 
 				// printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt);
 				// fflush(stdout);
-				
+
 				if(Root==myrow){
 				rank_cnt_ref=1;
 				for (j = 0; j < grid->nprow; ++j) {
 					// printf("ljb %5d j %5d nprow %5d\n",ljb,j,grid->nprow);
 					// fflush(stdout);
-					if ( bsendx_plist[ljb][j] != EMPTY ) {	
-						++rank_cnt_ref;		
+					if ( bsendx_plist[ljb][j] != EMPTY ) {
+						++rank_cnt_ref;
 					}
 				}
 				// printf("ljb %5d rank_cnt %5d rank_cnt_ref %5d\n",ljb,rank_cnt,rank_cnt_ref);
-				// fflush(stdout);								
-				assert(rank_cnt==rank_cnt_ref);		
-				}						
+				// fflush(stdout);
+				assert(rank_cnt==rank_cnt_ref);
+				}
 			}
 		}
 		}
-	}	
+	}
 	SUPERLU_FREE(ActiveFlag);
 	SUPERLU_FREE(ActiveFlagAll);
-	SUPERLU_FREE(ranks);				
-	SUPERLU_FREE(SeedSTD_BC);				
-		
+	SUPERLU_FREE(ranks);
+	SUPERLU_FREE(SeedSTD_BC);
+
 #if ( PROFlevel>=1 )
 t = SuperLU_timer_() - t;
 if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t);
-#endif					
+#endif
 
 #if ( PROFlevel>=1 )
 		t = SuperLU_timer_();
-#endif					
+#endif
 	/* construct the Reduce tree for U ... */
 	/* the following is used as reference */
 	nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */
@@ -1402,46 +1402,46 @@ if ( !iam) printf(".. Construct Bcast tr
 	if ( !(URtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) )
 		ABORT("Malloc fails for URtree_ptr[].");
 	if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) )
-		ABORT("Calloc fails for ActiveFlag[].");	
+		ABORT("Calloc fails for ActiveFlag[].");
 	if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) )
-		ABORT("Malloc fails for ranks[].");	
+		ABORT("Malloc fails for ranks[].");
 
 	// if ( !(idxs = intCalloc_dist(nsupers)) )
-		// ABORT("Calloc fails for idxs[].");	
+		// ABORT("Calloc fails for idxs[].");
 
 	// if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) )
 		// ABORT("Malloc fails for nzrows[].");
 
 	if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) )
-		ABORT("Malloc fails for SeedSTD_RD[].");	
+		ABORT("Malloc fails for SeedSTD_RD[].");
 
 	for (i=0;i<k;i++){
-		SeedSTD_RD[i]=rand();		
+		SeedSTD_RD[i]=rand();
 	}
 
-	MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_RD[0],k,MPI_DOUBLE,MPI_MAX,grid->rscp.comm);					  
+	MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_RD[0],k,MPI_DOUBLE,MPI_MAX,grid->rscp.comm);
 
 
 	// for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */
 		// fsupc = FstBlockC( jb );
-		// len=0;  
+		// len=0;
 		// for (j = fsupc; j < FstBlockC( jb+1 ); ++j) {
 			// istart = xusub[j];
 			// /* NOTE: Only the first nonzero index of the segment
 			   // is stored in usub[]. */
-			// len +=  xusub[j+1] - xusub[j];  
-		// }	
-				
+			// len +=  xusub[j+1] - xusub[j];
+		// }
+
 		// idxs[jb] = len-1;
 
 		// if(len>0){
 			// if ( !(nzrows[jb] = intMalloc_dist(len)) )
 				// ABORT("Malloc fails for nzrows[jb]");
-			
+
 			// fsupc = FstBlockC( jb );
-			
-			// len=0; 
-			
+
+			// len=0;
+
 			// for (j = fsupc; j < FstBlockC( jb+1 ); ++j) {
 				// istart = xusub[j];
 				// /* NOTE: Only the first nonzero index of the segment
@@ -1451,28 +1451,28 @@ if ( !iam) printf(".. Construct Bcast tr
 					// nzrows[jb][len]=irow;
 					// len++;
 				// }
-			// }	
+			// }
 			// quickSort(nzrows[jb],0,len-1,0);
 		// }
 		// else{
 			// nzrows[jb] = NULL;
 		// }
 	// }
-	
+
 
 	for (lib = 0; lib <k ; ++lib) {
 		URtree_ptr[lib]=NULL;
 	}
 
-	
+
 	if ( !(ActiveFlagAll = intMalloc_dist(grid->npcol*k)) )
-		ABORT("Calloc fails for ActiveFlagAll[].");				
-	for (j=0;j<grid->npcol*k;++j)ActiveFlagAll[j]=3*nsupers;	
-				
+		ABORT("Calloc fails for ActiveFlagAll[].");
+	for (j=0;j<grid->npcol*k;++j)ActiveFlagAll[j]=3*nsupers;
+
 	for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */
 		fsupc = FstBlockC( jb );
 		pc = PCOL( jb, grid );
-		
+
 		fsupc = FstBlockC( jb );
 		for (j = fsupc; j < FstBlockC( jb+1 ); ++j) {
 			istart = xusub[j];
@@ -1485,17 +1485,17 @@ if ( !iam) printf(".. Construct Bcast tr
 				if ( myrow == pr ) { /* Block row ib in my process row */
 					lib = LBi( ib, grid ); /* Local block number */
 					ActiveFlagAll[pc+lib*grid->npcol]=SUPERLU_MIN(ActiveFlagAll[pc+lib*grid->npcol],jb);
-				}						
+				}
 			}
 		}
-		
+
 		pr = PROW( jb, grid );
 		if ( myrow == pr ) { /* Block row ib in my process row */
 			lib = LBi( jb, grid ); /* Local block number */
 			ActiveFlagAll[pc+lib*grid->npcol]=SUPERLU_MIN(ActiveFlagAll[pc+lib*grid->npcol],jb);
-		}					
+		}
 	}
-		
+
 
 	for (lib=0;lib<k;++lib){
 		ib = myrow+lib*grid->nprow;  /* not sure */
@@ -1504,18 +1504,18 @@ if ( !iam) printf(".. Construct Bcast tr
 			for (j=0;j<grid->npcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];;
 			for (j=0;j<grid->npcol;++j)ActiveFlag[j+grid->npcol]=j;
 			for (j=0;j<grid->npcol;++j)ranks[j]=-1;
-			Root=-1; 
-			Iactive = 0;				
+			Root=-1;
+			Iactive = 0;
 
 			for (j=0;j<grid->npcol;++j){
 				if(ActiveFlag[j]!=3*nsupers){
 				jb = ActiveFlag[j];
 				pc = PCOL( jb, grid );
 				if(jb==ib)Root=pc;
-				if(mycol==pc)Iactive=1;		
-				}					
+				if(mycol==pc)Iactive=1;
+				}
 			}
-			
+
 			quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,0,2);
 
 			if(Iactive==1){
@@ -1531,7 +1531,7 @@ if ( !iam) printf(".. Construct Bcast tr
 				if(rank_cnt>1){
 
 					for (ii=0;ii<rank_cnt;ii++)   // use global ranks rather than local ranks
-						ranks[ii] = PNUM( pr, ranks[ii], grid );		
+						ranks[ii] = PNUM( pr, ranks[ii], grid );
 
 					// rseed=rand();
 					// rseed=1.0;
@@ -1539,7 +1539,7 @@ if ( !iam) printf(".. Construct Bcast tr
 
 					// if(ib==0){
 
-					URtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'z');  	
+					URtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'z');
 					RdTree_SetTag(URtree_ptr[lib], RD_U,'z');
 					// }
 
@@ -1553,10 +1553,10 @@ if ( !iam) printf(".. Construct Bcast tr
 					// // for(j=0;j<rank_cnt;++j)printf("%4d",ranks[j]);
 					// printf("\n");
 					}
-					// #endif		
+					// #endif
 				}
 			}
-		}						
+		}
 	}
 	SUPERLU_FREE(mod_bit);
 	SUPERLU_FREE(brecv);
@@ -1564,24 +1564,24 @@ if ( !iam) printf(".. Construct Bcast tr
 
 	SUPERLU_FREE(ActiveFlag);
 	SUPERLU_FREE(ActiveFlagAll);
-	SUPERLU_FREE(ranks);	
-	// SUPERLU_FREE(idxs);	
-	SUPERLU_FREE(SeedSTD_RD);	
+	SUPERLU_FREE(ranks);
+	// SUPERLU_FREE(idxs);
+	SUPERLU_FREE(SeedSTD_RD);
 	// for(i=0;i<nsupers;++i){
 		// if(nzrows[i])SUPERLU_FREE(nzrows[i]);
 	// }
-	// SUPERLU_FREE(nzrows);				
-		
+	// SUPERLU_FREE(nzrows);
+
 #if ( PROFlevel>=1 )
 t = SuperLU_timer_() - t;
 if ( !iam) printf(".. Construct Reduce tree for U: %.2f\t\n", t);
-#endif						
-		
+#endif
+
 	////////////////////////////////////////////////////////
-	
-	
+
+
 	Llu->Lrowind_bc_ptr = Lrowind_bc_ptr;
-	Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr;  
+	Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr;
 	Llu->Lnzval_bc_ptr = Lnzval_bc_ptr;
 	Llu->Ufstnz_br_ptr = Ufstnz_br_ptr;
 	Llu->Unzval_br_ptr = Unzval_br_ptr;
@@ -1604,11 +1604,11 @@ if ( !iam) printf(".. Construct Reduce t
 	Llu->URtree_ptr = URtree_ptr;
 	Llu->UBtree_ptr = UBtree_ptr;
 	Llu->Linv_bc_ptr = Linv_bc_ptr;
-	Llu->Uinv_bc_ptr = Uinv_bc_ptr;	
-	Llu->Urbs = Urbs; 
-	Llu->Ucb_indptr = Ucb_indptr; 
-	Llu->Ucb_valptr = Ucb_valptr; 	
-	
+	Llu->Uinv_bc_ptr = Uinv_bc_ptr;
+	Llu->Urbs = Urbs;
+	Llu->Ucb_indptr = Ucb_indptr;
+	Llu->Ucb_valptr = Ucb_valptr;
+
 #if ( PRNTlevel>=1 )
 	if ( !iam ) printf(".. # L blocks " IFMT "\t# U blocks " IFMT "\n",
 			   nLblocks, nUblocks);
@@ -1629,8 +1629,7 @@ if ( !iam) printf(".. Construct Reduce t
 	    ABORT("Malloc fails for mod_bit[].");
 
 	/* Find the maximum buffer size. */
-	MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, 
-		      MPI_MAX, grid->comm);
+	MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, MPI_MAX, grid->comm);
 
 #if ( PROFlevel>=1 )
 	if ( !iam ) printf(".. 1st distribute time:\n "
diff -pruN 6.1.0+dfsg1-1/SRC/zldperm_dist.c 6.1.1+dfsg1-1/SRC/zldperm_dist.c
--- 6.1.0+dfsg1-1/SRC/zldperm_dist.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/zldperm_dist.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,15 +1,15 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
-/*! @file 
+/*! @file
  * \brief Finds a row permutation so that the matrix has large entries on the diagonal
  *
  * <pre>
@@ -43,7 +43,7 @@ extern void mc64ad_dist(int_t*, int_t*,
  *              permuted matrix has as many entries on its diagonal as
  *              possible. The values on the diagonal are of arbitrary size.
  *              HSL subroutine MC21A/AD is used for this.
- *        = 2 : Compute a row permutation of the matrix so that the smallest 
+ *        = 2 : Compute a row permutation of the matrix so that the smallest
  *              value on the diagonal of the permuted matrix is maximized.
  *        = 3 : Compute a row permutation of the matrix so that the smallest
  *              value on the diagonal of the permuted matrix is maximized.
@@ -53,9 +53,9 @@ extern void mc64ad_dist(int_t*, int_t*,
  *              of the diagonal entries of the permuted matrix is maximized.
  *        = 5 : Compute a row permutation of the matrix so that the product
  *              of the diagonal entries of the permuted matrix is maximized
- *              and vectors to scale the matrix so that the nonzero diagonal 
- *              entries of the permuted matrix are one in absolute value and 
- *              all the off-diagonal entries are less than or equal to one in 
+ *              and vectors to scale the matrix so that the nonzero diagonal
+ *              entries of the permuted matrix are one in absolute value and
+ *              all the off-diagonal entries are less than or equal to one in
  *              absolute value.
  *        Restriction: 1 <= JOB <= 5.
  *
@@ -82,10 +82,10 @@ extern void mc64ad_dist(int_t*, int_t*,
  *        original matrix is in row j of the permuted matrix.
  *
  * u      (output) double*, of size n
- *        If job = 5, the natural logarithms of the row scaling factors. 
+ *        If job = 5, the natural logarithms of the row scaling factors.
  *
  * v      (output) double*, of size n
- *        If job = 5, the natural logarithms of the column scaling factors. 
+ *        If job = 5, the natural logarithms of the column scaling factors.
  *        The scaled matrix B has entries b_ij = a_ij * exp(u_i + v_j).
  * </pre>
  */
@@ -93,7 +93,7 @@ extern void mc64ad_dist(int_t*, int_t*,
 int
 zldperm_dist(int_t job, int_t n, int_t nnz, int_t colptr[], int_t adjncy[],
 	doublecomplex nzval[], int_t *perm, double u[], double v[])
-{ 
+{
     int_t i, liw, ldw, num;
     int_t *iw, icntl[10], info[10];
     double *dw;
@@ -107,7 +107,7 @@ zldperm_dist(int_t job, int_t n, int_t n
     if ( !(iw = intMalloc_dist(liw)) ) ABORT("Malloc fails for iw[]");
     ldw = 3*n + nnz;
     if ( !(dw = doubleMalloc_dist(ldw)) ) ABORT("Malloc fails for dw[]");
-	    
+
     /* Increment one to get 1-based indexing. */
     for (i = 0; i <= n; ++i) ++colptr[i];
     for (i = 0; i < nnz; ++i) ++adjncy[i];
@@ -116,8 +116,8 @@ zldperm_dist(int_t job, int_t n, int_t n
     PrintInt10("colptr", n+1, colptr);
     PrintInt10("adjncy", nnz, adjncy);
 #endif
-	
-    /* 
+
+    /*
      * NOTE:
      * =====
      *
diff -pruN 6.1.0+dfsg1-1/SRC/zlook_ahead_update.c 6.1.1+dfsg1-1/SRC/zlook_ahead_update.c
--- 6.1.0+dfsg1-1/SRC/zlook_ahead_update.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/zlook_ahead_update.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,16 +1,16 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
 /************************************************************************/
-/*! @file 
+/*! @file
  * \brief Look-ahead update of the Schur complement.
  *
  * <pre>
@@ -21,7 +21,7 @@ at the top-level directory.
  * Modified:
  *  September 18, 2017
  *  June 1, 2018  add parallel AWPM pivoting; add back arrive_at_ublock()
- *   
+ *
  */
 
 #include <assert.h>  /* assertion doesn't work if NDEBUG is defined */
@@ -139,7 +139,7 @@ while (j < nub && perm_u[2 * j] <= k0 +
             luptr += temp_nbrow;  /* move to next block */
         }
 
-#ifdef _OPENMP        
+#ifdef _OPENMP
         int_t thread_id = omp_get_thread_num ();
 #else
         int_t thread_id = 0;
@@ -147,7 +147,7 @@ while (j < nub && perm_u[2 * j] <= k0 +
         doublecomplex * tempv = bigV + ldt*ldt*thread_id;
 
         int *indirect_thread  = indirect + ldt * thread_id;
-        int *indirect2_thread = indirect2 + ldt * thread_id;        
+        int *indirect2_thread = indirect2 + ldt * thread_id;
         ib = lsub[lptr];        /* block number of L(i,k) */
         temp_nbrow = lsub[lptr + 1];    /* Number of full rows. */
 	/* assert (temp_nbrow <= nbrow); */
@@ -173,7 +173,7 @@ while (j < nub && perm_u[2 * j] <= k0 +
 	    tt_end = SuperLU_timer_();
 	    LookAheadGEMMTimer += tt_end - tt_start;
 	    tt_start = tt_end;
-	} 
+	}
 #endif
         /* Now scattering the output. */
         if (ib < jb) {    /* A(i,j) is in U. */
@@ -185,7 +185,7 @@ while (j < nub && perm_u[2 * j] <= k0 +
         } else {          /* A(i,j) is in L. */
             zscatter_l (ib, ljb, nsupc, iukp, xsup, klst, temp_nbrow, lptr,
                        temp_nbrow, usub, lsub, tempv,
-                       indirect_thread, indirect2_thread, 
+                       indirect_thread, indirect2_thread,
                        Lrowind_bc_ptr, Lnzval_bc_ptr, grid);
         }
 
@@ -228,7 +228,7 @@ while (j < nub && perm_u[2 * j] <= k0 +
         PZGSTRF2(options, kk0, kk, thresh, Glu_persist, grid, Llu,
                   U_diag_blk_send_req, tag_ub, stat, info);
 
-        pdgstrf2_timer += SuperLU_timer_() - tt1; 
+        pdgstrf2_timer += SuperLU_timer_() - tt1;
 
         /* stat->time7 += SuperLU_timer_() - ttt1; */
 
diff -pruN 6.1.0+dfsg1-1/SRC/zmemory_dist.c 6.1.1+dfsg1-1/SRC/zmemory_dist.c
--- 6.1.0+dfsg1-1/SRC/zmemory_dist.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/zmemory_dist.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,9 +1,9 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
@@ -29,7 +29,7 @@ extern SuperLU_LU_stack_t stack;
 void *zuser_malloc_dist(int_t bytes, int_t which_end)
 {
     void *buf;
-    
+
     if ( SuperLU_StackFull(bytes) ) return (NULL);
 
     if ( which_end == HEAD ) {
@@ -39,7 +39,7 @@ void *zuser_malloc_dist(int_t bytes, int
 	stack.top2 -= bytes;
 	buf = (char*) stack.array + stack.top2;
     }
-    
+
     stack.used += bytes;
     return buf;
 }
@@ -154,7 +154,7 @@ zallocateA_dist(int_t n, int_t nnz, doub
 doublecomplex *doublecomplexMalloc_dist(int_t n)
 {
     doublecomplex *buf;
-    buf = (doublecomplex *) SUPERLU_MALLOC( SUPERLU_MAX(1, n) * sizeof(doublecomplex) ); 
+    buf = (doublecomplex *) SUPERLU_MALLOC( SUPERLU_MAX(1, n) * sizeof(doublecomplex) );
     return (buf);
 }
 
diff -pruN 6.1.0+dfsg1-1/SRC/zmyblas2_dist.c 6.1.1+dfsg1-1/SRC/zmyblas2_dist.c
--- 6.1.0+dfsg1-1/SRC/zmyblas2_dist.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/zmyblas2_dist.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,15 +1,15 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
-/*! @file 
+/*! @file
  * \brief Level 2 BLAS operations: solves and matvec, written in C
  *
  * <pre>
@@ -31,8 +31,8 @@ at the top-level directory.
 /*! \brief
  *
  * <pre>
- * Solves a dense UNIT lower triangular system. The unit lower 
- * triangular matrix is stored in a 2D array M(1:nrow,1:ncol). 
+ * Solves a dense UNIT lower triangular system. The unit lower
+ * triangular matrix is stored in a 2D array M(1:nrow,1:ncol).
  * The solution will be returned in the rhs vector.
  * </pre>
  */
@@ -71,7 +71,7 @@ void zlsolve ( int ldm, int ncol, double
       	rhs[++firstcol] = x2;
       	rhs[++firstcol] = x3;
       	++firstcol;
-    
+
       	for (k = firstcol; k < ncol; k++) {
 	    zz_mult(&temp, &x0, Mki0); Mki0++;
 	    z_sub(&rhs[k], &rhs[k], &temp);
@@ -96,15 +96,15 @@ void zlsolve ( int ldm, int ncol, double
 
       	rhs[++firstcol] = x1;
       	++firstcol;
-    
+
       	for (k = firstcol; k < ncol; k++) {
 	    zz_mult(&temp, &x0, Mki0); Mki0++;
 	    z_sub(&rhs[k], &rhs[k], &temp);
 	    zz_mult(&temp, &x1, Mki1); Mki1++;
 	    z_sub(&rhs[k], &rhs[k], &temp);
-	} 
+	}
     }
-    
+
 }
 
 /*! \brief
@@ -133,7 +133,7 @@ zusolve (
 	slud_z_div(&xj, &rhs[jcol], &M[jcol + jcol*ldm]); /* M(jcol, jcol) */
 
 	rhs[jcol] = xj;
-	
+
 	for (irow = 0; irow < jcol; irow++) {
 	    zz_mult(&temp, &xj, &M[irow+jcol*ldm]); /* M(irow, jcol) */
 	    z_sub(&rhs[irow], &rhs[irow], &temp);
@@ -151,11 +151,11 @@ zusolve (
  * <pre>
  * Performs a dense matrix-vector multiply: Mxvec = Mxvec + M * vec.
  * The input matrix is M(1:nrow,1:ncol); The product is returned in Mxvec[].
- * </pre> 
+ * </pre>
  */
 void zmatvec (
 	int ldm,	/* in -- leading dimension of M */
-	int nrow,	/* in */ 
+	int nrow,	/* in */
 	int ncol,	/* in */
 	doublecomplex *M,	/* in */
 	doublecomplex *vec,	/* in */
@@ -179,7 +179,7 @@ void zmatvec (
 	vi0 = vec[firstcol++];
 	vi1 = vec[firstcol++];
 	vi2 = vec[firstcol++];
-	vi3 = vec[firstcol++];	
+	vi3 = vec[firstcol++];
 	for (k = 0; k < nrow; k++) {
 	    zz_mult(&temp, &vi0, Mki0); Mki0++;
 	    z_add(&Mxvec[k], &Mxvec[k], &temp);
@@ -203,6 +203,6 @@ void zmatvec (
 	}
 	M0 += ldm;
     }
-    return;	
+    return;
 }
 
diff -pruN 6.1.0+dfsg1-1/SRC/zreadhb.c 6.1.1+dfsg1-1/SRC/zreadhb.c
--- 6.1.0+dfsg1-1/SRC/zreadhb.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/zreadhb.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,15 +1,15 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
-/*! @file 
+/*! @file
  * \brief Read a DOUBLE COMPLEX PRECISION matrix stored in Harwell-Boeing format
  *
  * <pre>
@@ -37,67 +37,67 @@ static int ParseFloatFormat(char *, int_
  * <pre>
  * Purpose
  * =======
- * 
- * Read a DOUBLE COMPLEX PRECISION matrix stored in Harwell-Boeing format 
+ *
+ * Read a DOUBLE COMPLEX PRECISION matrix stored in Harwell-Boeing format
  * as described below.
- * 
- * Line 1 (A72,A8) 
- *  	Col. 1 - 72   Title (TITLE) 
- *	Col. 73 - 80  Key (KEY) 
- * 
- * Line 2 (5I14) 
- * 	Col. 1 - 14   Total number of lines excluding header (TOTCRD) 
- * 	Col. 15 - 28  Number of lines for pointers (PTRCRD) 
- * 	Col. 29 - 42  Number of lines for row (or variable) indices (INDCRD) 
- * 	Col. 43 - 56  Number of lines for numerical values (VALCRD) 
- *	Col. 57 - 70  Number of lines for right-hand sides (RHSCRD) 
- *                    (including starting guesses and solution vectors 
- *		       if present) 
- *           	      (zero indicates no right-hand side data is present) 
- *
- * Line 3 (A3, 11X, 4I14) 
- *   	Col. 1 - 3    Matrix type (see below) (MXTYPE) 
- * 	Col. 15 - 28  Number of rows (or variables) (NROW) 
- * 	Col. 29 - 42  Number of columns (or elements) (NCOL) 
- *	Col. 43 - 56  Number of row (or variable) indices (NNZERO) 
- *	              (equal to number of entries for assembled matrices) 
- * 	Col. 57 - 70  Number of elemental matrix entries (NELTVL) 
- *	              (zero in the case of assembled matrices) 
- * Line 4 (2A16, 2A20) 
- * 	Col. 1 - 16   Format for pointers (PTRFMT) 
- *	Col. 17 - 32  Format for row (or variable) indices (INDFMT) 
- *	Col. 33 - 52  Format for numerical values of coefficient matrix (VALFMT) 
- * 	Col. 53 - 72 Format for numerical values of right-hand sides (RHSFMT) 
- *
- * Line 5 (A3, 11X, 2I14) Only present if there are right-hand sides present 
- *    	Col. 1 	      Right-hand side type: 
- *	         	  F for full storage or M for same format as matrix 
- *    	Col. 2        G if a starting vector(s) (Guess) is supplied. (RHSTYP) 
- *    	Col. 3        X if an exact solution vector(s) is supplied. 
- *	Col. 15 - 28  Number of right-hand sides (NRHS) 
- *	Col. 29 - 42  Number of row indices (NRHSIX) 
- *          	      (ignored in case of unassembled matrices) 
- *
- * The three character type field on line 3 describes the matrix type. 
- * The following table lists the permitted values for each of the three 
- * characters. As an example of the type field, RSA denotes that the matrix 
- * is real, symmetric, and assembled. 
- *
- * First Character: 
- *	R Real matrix 
- *	C Complex matrix 
- *	P Pattern only (no numerical values supplied) 
- *
- * Second Character: 
- *	S Symmetric 
- *	U Unsymmetric 
- *	H Hermitian 
- *	Z Skew symmetric 
- *	R Rectangular 
- *
- * Third Character: 
- *	A Assembled 
- *	E Elemental matrices (unassembled) 
+ *
+ * Line 1 (A72,A8)
+ *  	Col. 1 - 72   Title (TITLE)
+ *	Col. 73 - 80  Key (KEY)
+ *
+ * Line 2 (5I14)
+ * 	Col. 1 - 14   Total number of lines excluding header (TOTCRD)
+ * 	Col. 15 - 28  Number of lines for pointers (PTRCRD)
+ * 	Col. 29 - 42  Number of lines for row (or variable) indices (INDCRD)
+ * 	Col. 43 - 56  Number of lines for numerical values (VALCRD)
+ *	Col. 57 - 70  Number of lines for right-hand sides (RHSCRD)
+ *                    (including starting guesses and solution vectors
+ *		       if present)
+ *           	      (zero indicates no right-hand side data is present)
+ *
+ * Line 3 (A3, 11X, 4I14)
+ *   	Col. 1 - 3    Matrix type (see below) (MXTYPE)
+ * 	Col. 15 - 28  Number of rows (or variables) (NROW)
+ * 	Col. 29 - 42  Number of columns (or elements) (NCOL)
+ *	Col. 43 - 56  Number of row (or variable) indices (NNZERO)
+ *	              (equal to number of entries for assembled matrices)
+ * 	Col. 57 - 70  Number of elemental matrix entries (NELTVL)
+ *	              (zero in the case of assembled matrices)
+ * Line 4 (2A16, 2A20)
+ * 	Col. 1 - 16   Format for pointers (PTRFMT)
+ *	Col. 17 - 32  Format for row (or variable) indices (INDFMT)
+ *	Col. 33 - 52  Format for numerical values of coefficient matrix (VALFMT)
+ * 	Col. 53 - 72 Format for numerical values of right-hand sides (RHSFMT)
+ *
+ * Line 5 (A3, 11X, 2I14) Only present if there are right-hand sides present
+ *    	Col. 1 	      Right-hand side type:
+ *	         	  F for full storage or M for same format as matrix
+ *    	Col. 2        G if a starting vector(s) (Guess) is supplied. (RHSTYP)
+ *    	Col. 3        X if an exact solution vector(s) is supplied.
+ *	Col. 15 - 28  Number of right-hand sides (NRHS)
+ *	Col. 29 - 42  Number of row indices (NRHSIX)
+ *          	      (ignored in case of unassembled matrices)
+ *
+ * The three character type field on line 3 describes the matrix type.
+ * The following table lists the permitted values for each of the three
+ * characters. As an example of the type field, RSA denotes that the matrix
+ * is real, symmetric, and assembled.
+ *
+ * First Character:
+ *	R Real matrix
+ *	C Complex matrix
+ *	P Pattern only (no numerical values supplied)
+ *
+ * Second Character:
+ *	S Symmetric
+ *	U Unsymmetric
+ *	H Hermitian
+ *	Z Skew symmetric
+ *	R Rectangular
+ *
+ * Third Character:
+ *	A Assembled
+ *	E Elemental matrices (unassembled)
  * </pre>
  */
 
@@ -133,12 +133,12 @@ zreadhb_dist(int iam, FILE *fp, int_t *n
 #if ( DEBUGlevel>=1 )
     if ( !iam ) printf("Matrix type %s\n", type);
 #endif
-    
-    fscanf(fp, "%14c", buf); *nrow = atoi(buf); 
-    fscanf(fp, "%14c", buf); *ncol = atoi(buf); 
-    fscanf(fp, "%14c", buf); *nonz = atoi(buf); 
-    fscanf(fp, "%14c", buf); tmp = atoi(buf);   
-    
+
+    fscanf(fp, "%14c", buf); *nrow = atoi(buf);
+    fscanf(fp, "%14c", buf); *ncol = atoi(buf);
+    fscanf(fp, "%14c", buf); *nonz = atoi(buf);
+    fscanf(fp, "%14c", buf); tmp = atoi(buf);
+
     if (tmp != 0)
 	if ( !iam ) printf("This is not an assembled matrix!\n");
     if (*nrow != *ncol)
@@ -158,7 +158,7 @@ zreadhb_dist(int iam, FILE *fp, int_t *n
     fscanf(fp, "%20c", buf);
     DumpLine(fp);
 
-    /* Line 5: right-hand side */    
+    /* Line 5: right-hand side */
     if ( rhscrd ) DumpLine(fp); /* skip RHSFMT */
 
 #if ( DEBUGlevel>=1 )
@@ -169,7 +169,7 @@ zreadhb_dist(int iam, FILE *fp, int_t *n
 	printf("valnum " IFMT ", valsize " IFMT "\n", valnum, valsize);
     }
 #endif
-    
+
     ReadVector(fp, *ncol+1, *colptr, colnum, colsize);
 #if ( DEBUGlevel>=1 )
     if ( !iam )	printf("read colptr[" IFMT "] = " IFMT "\n", *ncol, (*colptr)[*ncol]);
@@ -202,20 +202,20 @@ static int ParseIntFormat(char *buf, int
 
     tmp = buf;
     while (*tmp++ != '(') ;
-    *num = atoi(tmp); 
+    *num = atoi(tmp);
     while (*tmp != 'I' && *tmp != 'i') ++tmp;
     ++tmp;
-    *size = atoi(tmp); 
+    *size = atoi(tmp);
     return 0;
 }
 
 static int ParseFloatFormat(char *buf, int_t *num, int_t *size)
 {
     char *tmp, *period;
-    
+
     tmp = buf;
     while (*tmp++ != '(') ;
-    *num = atoi(tmp); 
+    *num = atoi(tmp);
     while (*tmp != 'E' && *tmp != 'e' && *tmp != 'D' && *tmp != 'd'
 	   && *tmp != 'F' && *tmp != 'f') {
        /* May find kP before nE/nD/nF, like (1P6F13.6). In this case the
@@ -231,7 +231,7 @@ static int ParseFloatFormat(char *buf, i
     period = tmp;
     while (*period != '.' && *period != ')') ++period ;
     *period = '\0';
-    *size = atoi(tmp); 
+    *size = atoi(tmp);
 
     return 0;
 }
@@ -241,14 +241,14 @@ ReadVector(FILE *fp, int_t n, int_t *whe
 {
     register int_t i, j, item;
     char tmp, buf[100];
-    
+
     i = 0;
     while (i < n) {
 	fgets(buf, 100, fp);    /* read a line at a time */
 	for (j=0; j<perline && i<n; j++) {
 	    tmp = buf[(j+1)*persize];     /* save the char at that place */
 	    buf[(j+1)*persize] = 0;       /* null terminate */
-	    item = atoi(&buf[j*persize]); 
+	    item = atoi(&buf[j*persize]);
 	    buf[(j+1)*persize] = tmp;     /* recover the char at that place */
 	    where[i++] = item - 1;
 	}
@@ -257,14 +257,14 @@ ReadVector(FILE *fp, int_t n, int_t *whe
 
 /* Read complex numbers as pairs of (real, imaginary) */
 void
-zReadValues(FILE *fp, int_t n, doublecomplex *destination, 
+zReadValues(FILE *fp, int_t n, doublecomplex *destination,
              int_t perline, int_t persize)
 {
     register int_t i, j, k, s;
     register int_t pair;
     register double realpart;
     char tmp, buf[100];
-    
+
     i = 0;
     pair = 0;
     while (i < n) {
diff -pruN 6.1.0+dfsg1-1/SRC/zreadMM.c 6.1.1+dfsg1-1/SRC/zreadMM.c
--- 6.1.0+dfsg1-1/SRC/zreadMM.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/zreadMM.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,17 +1,17 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
 
-/*! @file 
- * \brief 
+/*! @file
+ * \brief
  * Contributed by Francois-Henry Rouet.
  *
  */
@@ -54,7 +54,7 @@ zreadMM_dist(FILE *fp, int_t *m, int_t *
      *    Triplet in the rest of lines: row    col    value
      */
 
-     /* 1/ read header */ 
+     /* 1/ read header */
      cs = fgets(line,512,fp);
      for (p=line; *p!='\0'; *p=tolower(*p),p++);
 
@@ -62,7 +62,7 @@ zreadMM_dist(FILE *fp, int_t *m, int_t *
        printf("Invalid header (first line does not contain 5 tokens)\n");
        exit;
      }
- 
+
      if(strcmp(banner,"%%matrixmarket")) {
        printf("Invalid header (first token is not \"%%%%MatrixMarket\")\n");
        exit(-1);
@@ -164,7 +164,7 @@ zreadMM_dist(FILE *fp, int_t *m, int_t *
 
 	if (row[nz] < 0 || row[nz] >= *m || col[nz] < 0 || col[nz] >= *n
 	    /*|| val[nz] == 0.*/) {
-	    fprintf(stderr, "nz " IFMT ", (" IFMT ", " IFMT ") = {%e\t%e} out of bound, removed\n", 
+	    fprintf(stderr, "nz " IFMT ", (" IFMT ", " IFMT ") = {%e\t%e} out of bound, removed\n",
 		    nz, row[nz], col[nz], val[nz].r, val[nz].i);
 	    exit(-1);
 	} else {
@@ -177,7 +177,7 @@ zreadMM_dist(FILE *fp, int_t *m, int_t *
 	          val[nz] = val[nz-1];
 	          ++xa[col[nz]];
 	        }
-            }	
+            }
 	    ++nz;
 	}
     }
@@ -187,7 +187,7 @@ zreadMM_dist(FILE *fp, int_t *m, int_t *
       printf("new_nonz after symmetric expansion:\t" IFMT "\n", *nonz);
       fflush(stdout);
     }
-    
+
 
     /* Initialize the array of column pointers */
     k = 0;
@@ -198,7 +198,7 @@ zreadMM_dist(FILE *fp, int_t *m, int_t *
 	jsize = xa[j];
 	xa[j] = k;
     }
-    
+
     /* Copy the triplets into the column oriented storage */
     for (nz = 0; nz < *nonz; ++nz) {
 	j = col[nz];
diff -pruN 6.1.0+dfsg1-1/SRC/zreadrb.c 6.1.1+dfsg1-1/SRC/zreadrb.c
--- 6.1.0+dfsg1-1/SRC/zreadrb.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/zreadrb.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,9 +1,9 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
@@ -22,7 +22,7 @@ at the top-level directory.
  * Purpose
  * =======
  *
- * Read a DOUBLE COMPLEX PRECISION matrix stored in Rutherford-Boeing format 
+ * Read a DOUBLE COMPLEX PRECISION matrix stored in Rutherford-Boeing format
  * as described below.
  *
  * Line 1 (A72, A8)
@@ -143,7 +143,7 @@ static int ReadVector(FILE *fp, int_t n,
         for (j=0; j<perline && i<n; j++) {
             tmp = buf[(j+1)*persize];     /* save the char at that place */
             buf[(j+1)*persize] = 0;       /* null terminate */
-            item = atoi(&buf[j*persize]); 
+            item = atoi(&buf[j*persize]);
             buf[(j+1)*persize] = tmp;     /* recover the char at that place */
             where[i++] = item - 1;
         }
@@ -158,7 +158,7 @@ static int zReadValues(FILE *fp, int n,
     register int i, j, k, s, pair;
     register double realpart;
     char tmp, buf[100];
-    
+
     i = pair = 0;
     while (i < n) {
 	fgets(buf, 100, fp);    /* read a line at a time */
@@ -242,7 +242,7 @@ FormFullA(int_t n, int_t *nonz, doubleco
 	ABORT("SUPERLU_MALLOC fails for a_rowind[]");
     if ( !(a_val = (doublecomplex*) SUPERLU_MALLOC( new_nnz * sizeof(doublecomplex)) ) )
 	ABORT("SUPERLU_MALLOC fails for a_val[]");
-    
+
     a_colptr[0] = 0;
     k = 0;
     for (j = 0; j < n; ++j) {
@@ -259,7 +259,7 @@ FormFullA(int_t n, int_t *nonz, doubleco
 	a_val[k] = al_val[i];
 	++k;
       }
-      
+
       a_colptr[j+1] = k;
     }
 
diff -pruN 6.1.0+dfsg1-1/SRC/zreadtriple.c 6.1.1+dfsg1-1/SRC/zreadtriple.c
--- 6.1.0+dfsg1-1/SRC/zreadtriple.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/zreadtriple.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,16 +1,16 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
-/*! @file 
- * \brief 
+/*! @file
+ * \brief
  *
  */
 #include <stdio.h>
@@ -38,7 +38,7 @@ zreadtriple_dist(FILE *fp, int_t *m, int
     doublecomplex *a, *val;
     int_t    *asub, *xa, *row, *col;
     int_t    zero_base = 0;
-    
+
     /* 	File format:
      *    First line:  #rows    #non-zero
      *    Triplet in the rest of lines:
@@ -95,7 +95,7 @@ zreadtriple_dist(FILE *fp, int_t *m, int
 
 	if (row[nz] < 0 || row[nz] >= *m || col[nz] < 0 || col[nz] >= *n
 	    /*|| val[nz] == 0.*/) {
-	    fprintf(stderr, "nz " IFMT ", (" IFMT ", " IFMT ") = {%e\t%e} out of bound, removed\n", 
+	    fprintf(stderr, "nz " IFMT ", (" IFMT ", " IFMT ") = {%e\t%e} out of bound, removed\n",
 		    nz, row[nz], col[nz], val[nz].r, val[nz].i);
 	    exit(-1);
 	} else {
@@ -108,7 +108,7 @@ zreadtriple_dist(FILE *fp, int_t *m, int
 	      val[nz] = val[nz-1];
 	      ++xa[col[nz]];
 	    }
-#endif	
+#endif
 	    ++nz;
 	}
     }
@@ -117,7 +117,7 @@ zreadtriple_dist(FILE *fp, int_t *m, int
 #ifdef EXPAND_SYM
     printf("new_nonz after symmetric expansion:\t%d\n", *nonz);
 #endif
-    
+
 
     /* Initialize the array of column pointers */
     k = 0;
@@ -128,7 +128,7 @@ zreadtriple_dist(FILE *fp, int_t *m, int
 	jsize = xa[j];
 	xa[j] = k;
     }
-    
+
     /* Copy the triplets into the column oriented storage */
     for (nz = 0; nz < *nonz; ++nz) {
 	j = col[nz];
diff -pruN 6.1.0+dfsg1-1/SRC/zreadtriple_noheader.c 6.1.1+dfsg1-1/SRC/zreadtriple_noheader.c
--- 6.1.0+dfsg1-1/SRC/zreadtriple_noheader.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/zreadtriple_noheader.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,16 +1,16 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
-/*! @file 
- * \brief 
+/*! @file
+ * \brief
  *
  */
 #include <stdio.h>
@@ -66,7 +66,7 @@ zreadtriple_noheader(FILE *fp, int_t *m,
         ret_val = fscanf(fp, "%d%d%lf%lf\n", &i, &j, &vali.r, &vali.i);
 #endif
     }
-    
+
     if ( minn == 0 ) { /* zero-based indexing */
 	zero_base = 1;
 	++(*n);
@@ -117,7 +117,7 @@ zreadtriple_noheader(FILE *fp, int_t *m,
 
 	if (row[nz] < 0 || row[nz] >= *m || col[nz] < 0 || col[nz] >= *n
 	    /*|| val[nz] == 0.*/) {
-	    fprintf(stderr, "nz %d, (%d, %d) = %e out of bound, removed\n", 
+	    fprintf(stderr, "nz %d, (%d, %d) = %e out of bound, removed\n",
 		    nz, row[nz], col[nz], val[nz]);
 	    exit(-1);
 	} else {
@@ -130,7 +130,7 @@ zreadtriple_noheader(FILE *fp, int_t *m,
 	      val[nz] = val[nz-1];
 	      ++xa[col[nz]];
 	    }
-#endif	
+#endif
 	    ++nz;
 	}
     }
@@ -139,7 +139,7 @@ zreadtriple_noheader(FILE *fp, int_t *m,
 #ifdef EXPAND_SYM
     printf("new_nonz after symmetric expansion:\t%d\n", *nonz);
 #endif
-    
+
 
     /* Initialize the array of column pointers */
     k = 0;
@@ -150,7 +150,7 @@ zreadtriple_noheader(FILE *fp, int_t *m,
 	jsize = xa[j];
 	xa[j] = k;
     }
-    
+
     /* Copy the triplets into the column oriented storage */
     for (nz = 0; nz < *nonz; ++nz) {
 	j = col[nz];
diff -pruN 6.1.0+dfsg1-1/SRC/zscatter.c 6.1.1+dfsg1-1/SRC/zscatter.c
--- 6.1.0+dfsg1-1/SRC/zscatter.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/zscatter.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,25 +1,25 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
-/*! @file 
+/*! @file
  * \brief Scatter the computed blocks into LU destination.
  *
  * <pre>
- * -- Distributed SuperLU routine (version 5.2) --
+ * -- Distributed SuperLU routine (version 6.1.1) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * October 1, 2014
  *
- * Modified: 
+ * Modified:
  *   September 18, 2017, enable SIMD vectorized scatter operation.
- *   
+ *
  */
 #include <math.h>
 #include "superlu_zdefs.h"
@@ -125,7 +125,7 @@ zscatter_l (
            int_t ** Lrowind_bc_ptr, doublecomplex **Lnzval_bc_ptr,
            gridinfo_t * grid)
 {
-    
+
     int_t rel, i, segsize, jj;
     doublecomplex *nzval;
     int_t *index = Lrowind_bc_ptr[ljb];
@@ -133,23 +133,23 @@ zscatter_l (
     int_t lptrj = BC_HEADER;
     int_t luptrj = 0;
     int_t ijb = index[lptrj];
-    
+
     while (ijb != ib)  /* Search for destination block L(i,j) */
     {
         luptrj += index[lptrj + 1];
         lptrj += LB_DESCRIPTOR + index[lptrj + 1];
         ijb = index[lptrj];
     }
-    
+
     /*
      * Build indirect table. This is needed because the indices are not sorted
      * in the L blocks.
      */
     int_t fnz = FstBlockC (ib);
-    int_t dest_nbrow; 
+    int_t dest_nbrow;
     lptrj += LB_DESCRIPTOR;
     dest_nbrow=index[lptrj - 1];
-    
+
 #if (_OPENMP>=201307)
 #pragma omp simd
 #endif
@@ -165,7 +165,7 @@ zscatter_l (
     /* can be precalculated? */
     for (i = 0; i < temp_nbrow; ++i) { /* Source index is a subset of dest. */
         rel = lsub[lptr + i] - fnz;
-        indirect2[i] =indirect_thread[rel]; 
+        indirect2[i] =indirect_thread[rel];
     }
 
     nzval = Lnzval_bc_ptr[ljb] + luptrj; /* Destination block L(i,j) */
@@ -185,7 +185,7 @@ zscatter_l (
         }
         nzval += ldv;
     }
-    
+
 } /* zscatter_l */
 
 
@@ -299,12 +299,12 @@ gemm_division_cpu_gpu(
     /*input */
     int nbrow,              /*number of row in A matrix */
     int ldu,                /*number of k in dgemm */
-    int nstreams, 
+    int nstreams,
     int* full_u_cols,       /*array containing prefix sum of work load */
     int num_blks            /*Number of work load */
 )
 {
-    int Ngem = sp_ienv(7);  /*get_mnk_dgemm ();*/
+    int Ngem = sp_ienv_dist(7);  /*get_mnk_dgemm ();*/
     int min_gpu_col = get_cublas_nb ();
 
     // Ngem = 1000000000;
@@ -312,7 +312,7 @@ gemm_division_cpu_gpu(
        cpu is to gpu dgemm should be ideally 0:1 ratios to hide the total cost
        However since there is gpu latency of around 20,000 ns implying about
        200000 floating point calculation be done in that time so ~200,000/(2*nbrow*ldu)
-       should be done in cpu to hide the latency; we Ngem =200,000/2 
+       should be done in cpu to hide the latency; we Ngem =200,000/2
      */
     int i, j;
 
@@ -431,7 +431,7 @@ gemm_division_new (int * num_streams_use
                    int num_blks  /*Number of work load */
     )
 {
-    int Ngem = sp_ienv(7); /*get_mnk_dgemm ();*/
+    int Ngem = sp_ienv_dist(7); /*get_mnk_dgemm ();*/
     int min_gpu_col = get_cublas_nb ();
 
     // Ngem = 1000000000;
@@ -439,7 +439,7 @@ gemm_division_new (int * num_streams_use
        cpu is to gpu dgemm should be ideally 0:1 ratios to hide the total cost
        However since there is gpu latency of around 20,000 ns implying about
        200000 floating point calculation be done in that time so ~200,000/(2*nbrow*ldu)
-       should be done in cpu to hide the latency; we Ngem =200,000/2 
+       should be done in cpu to hide the latency; we Ngem =200,000/2
      */
     int_t i, j;
 
diff -pruN 6.1.0+dfsg1-1/SRC/zSchCompUdt-2Ddynamic.c 6.1.1+dfsg1-1/SRC/zSchCompUdt-2Ddynamic.c
--- 6.1.0+dfsg1-1/SRC/zSchCompUdt-2Ddynamic.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/zSchCompUdt-2Ddynamic.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,15 +1,15 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
-/*! @file 
+/*! @file
  * \brief This file contains the main loop of pdgstrf which involves rank k
  *        update of the Schur complement.
  *        Uses 2D partitioning for the scatter phase.
@@ -21,7 +21,7 @@ at the top-level directory.
  *
  * Modified:
  *   September 14, 2017
- *   - First gather U-panel, then depending on "ldu" (excluding leading zeros), 
+ *   - First gather U-panel, then depending on "ldu" (excluding leading zeros),
  *     gather only trailing columns of the L-panel corresponding to the nonzero
  *     of U-rows.
  *   - Padding zeros for nice dimensions of GEMM.
@@ -29,9 +29,9 @@ at the top-level directory.
  *  June 1, 2018  add parallel AWPM pivoting; add back arrive_at_ublock()
  */
 
-#define SCHEDULE_STRATEGY guided 
+#define SCHEDULE_STRATEGY guided
 
-/* 
+/*
  * Buffers:
  *     [ lookAhead_L_buff | Remain_L_buff ] : stores the gathered L-panel
  *                                            (A matrix in C := A*B )
@@ -57,17 +57,17 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
      tt_start = SuperLU_timer_();
 
      /* Sherry -- can this loop be threaded?? */
-     /* Loop through all blocks in L(:,k) to set up pointers to the start 
+     /* Loop through all blocks in L(:,k) to set up pointers to the start
       * of each block in the data arrays.
       *   - lookAheadFullRow[i] := number of nonzero rows from block 0 to i
       *   - lookAheadStRow[i] := number of nonzero rows before block i
-      *   - lookAhead_lptr[i] := point to the start of block i in L's index[] 
+      *   - lookAhead_lptr[i] := point to the start of block i in L's index[]
       *   - (ditto Remain_Info[i])
       */
      for (int i = 0; i < nlb; ++i) {
 	 ib = lsub[lptr];            /* Block number of L(i,k). */
 	 temp_nbrow = lsub[lptr+1];  /* Number of full rows. */
-        
+
 	 int look_up_flag = 1; /* assume ib is outside look-up window */
 	 for (int j = k0+1; j < SUPERLU_MIN (k0 + num_look_aheads+2, nsupers );
 	      ++j) {
@@ -76,35 +76,35 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
                      break;            /* Sherry -- can exit the loop?? */
                  }
 	 }
-	 
+
 	 if ( look_up_flag == 0 ) { /* ib is within look-up window */
 	     if (lookAheadBlk==0) {
 		 lookAheadFullRow[lookAheadBlk] = temp_nbrow;
 	     } else {
-		 lookAheadFullRow[lookAheadBlk] = 
-		     temp_nbrow + lookAheadFullRow[lookAheadBlk-1];   
+		 lookAheadFullRow[lookAheadBlk] =
+		     temp_nbrow + lookAheadFullRow[lookAheadBlk-1];
 	     }
 	     lookAheadStRow[lookAheadBlk] = cum_nrow;
 	     lookAhead_lptr[lookAheadBlk] = lptr;
-	     lookAhead_ib[lookAheadBlk] = ib; 
+	     lookAhead_ib[lookAheadBlk] = ib;
 	     lookAheadBlk++;
 	 } else { /* ib is not in look-up window */
 	     if ( RemainBlk==0 ) {
 		 Remain_info[RemainBlk].FullRow = temp_nbrow;
 	     } else {
-		 Remain_info[RemainBlk].FullRow = 
-		     temp_nbrow + Remain_info[RemainBlk-1].FullRow;   
+		 Remain_info[RemainBlk].FullRow =
+		     temp_nbrow + Remain_info[RemainBlk-1].FullRow;
 	     }
              RemainStRow[RemainBlk] = cum_nrow;
              // Remain_lptr[RemainBlk] = lptr;
 	     Remain_info[RemainBlk].lptr = lptr;
-	     // Remain_ib[RemainBlk] = ib; 
-	     Remain_info[RemainBlk].ib = ib; 
+	     // Remain_ib[RemainBlk] = ib;
+	     Remain_info[RemainBlk].ib = ib;
 	     RemainBlk++;
 	 }
-	 
+
          cum_nrow += temp_nbrow;
-	 
+
 	 lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
 	 lptr += temp_nbrow;     /* Move to next block */
 	 luptr += temp_nbrow;
@@ -139,7 +139,7 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
 	 ncols = 0; /* Total number of nonzero columns in U(k,:) */
 	 int temp_ncols = 0;
 
-	 /* jj0 contains the look-ahead window that was updated in 
+	 /* jj0 contains the look-ahead window that was updated in
 	    dlook_ahead_update.c. Now the search can continue from that point,
 	    not to start from block 0. */
 #if 0 // Sherry comment out 5/21/208
@@ -150,8 +150,8 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
 #endif
 
 	 /* if ( iam==0 ) printf("--- k0 %d, k %d, jj0 %d, nub %d\n", k0, k, jj0, nub);*/
-	     
-         /* 
+
+         /*
 	  * Loop through all blocks in U(k,:) to set up pointers to the start
           * of each block in the data arrays, store them in Ublock_info[j]
           * for block U(k,j).
@@ -176,7 +176,7 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
 
 	     /* if ( iam==0 )
 		 printf("j %d: Ublock_info[j].iukp %d, Ublock_info[j].rukp %d,"
-			"Ublock_info[j].jb %d, nsupc %d\n", 
+			"Ublock_info[j].jb %d, nsupc %d\n",
 			j, Ublock_info[j].iukp, Ublock_info[j].rukp,
 			Ublock_info[j].jb, nsupc); */
 
@@ -207,7 +207,7 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
 	 for ( j = jj0+1; j < nub; ++j) {
 	     Ublock_info[j].full_u_cols += Ublock_info[j-1].full_u_cols;
 	 }
-            
+
 	 /* Padding zeros to make {m,n,k} multiple of vector length. */
 	 jj = 8; //n;
 	 if (gemm_padding > 0 && Rnbrow > jj && ncols > jj && ldu > jj) {
@@ -216,11 +216,11 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
 	     //gemm_n_pad = ncols;
 	     //gemm_k_pad = ldu + (ldu % GEMM_PADLEN);
 	     gemm_k_pad = ldu;
-	     
+
 	     for (i = Rnbrow; i < gemm_m_pad; ++i)  // padding A matrix
 		 for (j = 0; j < gemm_k_pad; ++j)
 		     Remain_L_buff[i + j*gemm_m_pad] = zero;
-	     for (i = 0; i < Rnbrow; ++i)         
+	     for (i = 0; i < Rnbrow; ++i)
 		 for (j = ldu; j < gemm_k_pad; ++j)
 		     Remain_L_buff[i + j*gemm_m_pad] = zero;
 	     for (i = ldu; i < gemm_k_pad; ++i)     // padding B matrix
@@ -234,7 +234,7 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
 	     gemm_n_pad = ncols;
 	     gemm_k_pad = ldu;
 	 }
-     
+
 	 tempu = bigU; /* buffer the entire row block U(k,:) */
 
          /* Gather U(k,:) into buffer bigU[] to prepare for GEMM */
@@ -260,7 +260,7 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
 	    jb = Ublock_info[j].jb;
 	    nsupc = SuperSize (jb );
 #endif
-            /* Copy from U(k,j) to tempu[], padding zeros.  */            
+            /* Copy from U(k,j) to tempu[], padding zeros.  */
             for (jj = iukp; jj < iukp+nsupc; ++jj) {
                 segsize = klst - usub[jj];
                 if ( segsize ) {
@@ -270,7 +270,7 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
 #if (_OPENMP>=201307)
 #pragma omp simd
 #endif
-		    for (i = 0; i < segsize; ++i) 
+		    for (i = 0; i < segsize; ++i)
                     	tempu[i+lead_zero] = uval[rukp+i];
                     rukp += segsize;
                     tempu += gemm_k_pad;
@@ -309,12 +309,12 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
 	     StRowDest   = lookAheadFullRow[i-1];
 	     temp_nbrow  = lookAheadFullRow[i]-lookAheadFullRow[i-1];
 	 }
-	 
+
 	 int StRowSource = lookAheadStRow[i];
-	 
+
 	 /* Now copying one block into L lookahead buffer */
 	 /* #pragma omp parallel for (gives slow down) */
-	 // for (int j = 0; j < knsupc; ++j) { 
+	 // for (int j = 0; j < knsupc; ++j) {
 	 for (j = knsupc-ldu; j < knsupc; ++j) { /* skip leading columns
 						    corresponding to zero U rows */
 #if 1
@@ -385,7 +385,7 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
       * Perform GEMM (look-ahead L part, and remain L part) followed by Scatter
       *************************************************************************/
      tempu = bigU;  /* setting to the start of padded U(k,:) */
-    
+
      if ( Lnbrow>0 && ldu>0 && ncols>0 ) { /* Both L(:,k) and U(k,:) nonempty */
 	 /***************************************************************
 	  * Updating blocks in look-ahead window of the LU(look-ahead-rows,:)
@@ -403,7 +403,7 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
 #pragma omp parallel default (shared) private(thread_id)
 	 {
 	   thread_id = omp_get_thread_num();
- 
+
 	   /* Ideally, should organize the loop as:
 	      for (j = 0; j < nub; ++j) {
 	          for (lb = 0; lb < lookAheadBlk; ++lb) {
@@ -427,7 +427,7 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
 	   int* indirect_thread    = indirect;
 	   int* indirect2_thread   = indirect2;
 #endif
-	   /* Each thread is assigned one loop index ij, responsible for 
+	   /* Each thread is assigned one loop index ij, responsible for
 	      block update L(lb,k) * U(k,j) -> tempv[]. */
 	   for (int ij = 0; ij < lookAheadBlk*(nub-jj0); ++ij) {
 	       /* jj0 starts after look-ahead window. */
@@ -448,7 +448,7 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
                 st_col = Ublock_info[j-1].full_u_cols;
             } else {
                 ncols  = Ublock_info[j].full_u_cols;
-                st_col = 0;   
+                st_col = 0;
             }
 
             /* Getting L block L(i,k) information */
@@ -473,7 +473,7 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
 	    gemm_max_k = SUPERLU_MAX(gemm_max_k, ldu);
 #endif
 
-#if defined (USE_VENDOR_BLAS)            
+#if defined (USE_VENDOR_BLAS)
             zgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
 		   //&lookAhead_L_buff[(knsupc-ldu)*Lnbrow+cum_nrow], &Lnbrow,
 		   &lookAhead_L_buff[cum_nrow], &Lnbrow,
@@ -509,7 +509,7 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
 	    __itt_resume(); // start VTune, again use 2 underscores
 #endif
                 zscatter_l (
-				 ib, ljb, 
+				 ib, ljb,
 				 nsupc, iukp, xsup,
  				 klst, temp_nbrow,
 				 lptr, temp_nbrow,
@@ -526,7 +526,7 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
             }
 
 #if ( PRNTlevel>=1 )
-	    if (thread_id == 0) 
+	    if (thread_id == 0)
 		LookAheadScatterTimer += SuperLU_timer_() - tt_start;
 #endif
 	   } /* end omp for ij = ... */
@@ -596,7 +596,7 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
 #pragma omp parallel default(shared) private(thread_id)
 	{
 	    thread_id = omp_get_thread_num();
- 
+
 	    /* Ideally, should organize the loop as:
                for (j = 0; j < jj_cpu; ++j) {
 	           for (lb = 0; lb < RemainBlk; ++lb) {
@@ -620,7 +620,7 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
 	    int* indirect_thread = indirect;
 	    int* indirect2_thread = indirect2;
 #endif
-	    /* Each thread is assigned one loop index ij, responsible for 
+	    /* Each thread is assigned one loop index ij, responsible for
 	       block update L(lb,k) * U(k,j) -> tempv[]. */
 	    for (int ij = 0; ij < RemainBlk*(jj_cpu-jj0); ++ij) {
 		/* jj_cpu := nub, jj0 starts after look-ahead window. */
@@ -641,7 +641,7 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
 		    st_col = Ublock_info[j-1].full_u_cols;
 		} else {
 		    ncols = Ublock_info[j].full_u_cols;
-		    st_col = 0;   
+		    st_col = 0;
 		}
 
 		/* Getting L block L(i,k) information */
@@ -650,9 +650,9 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
 		int temp_nbrow = lsub[lptr+1];
 		lptr += LB_DESCRIPTOR;
 		int cum_nrow = (lb==0 ? 0 : Remain_info[lb-1].FullRow);
-		
+
 		/* tempv1 points to block(i,j) in bigV : LDA == Rnbrow */
-		//double* tempv1 = bigV + (st_col * Rnbrow + cum_nrow); Sherry 
+		//double* tempv1 = bigV + (st_col * Rnbrow + cum_nrow); Sherry
 		doublecomplex* tempv1 = bigV + (st_col * gemm_m_pad + cum_nrow); /* Sherry */
 
 		// printf("[%d] .. before scatter: ib %d, jb %d, temp_nbrow %d, Rnbrow %d\n", iam, ib, jb, temp_nbrow, Rnbrow); fflush(stdout);
@@ -683,13 +683,13 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,
 			       grid
 			       );
 		}
-		
+
 	    } /* end omp for (int ij =...) */
-	    
+
 #ifdef _OPENMP
 	} /* end omp parallel region */
 #endif
-	
+
 #if ( PRNTlevel>=1 )
 	RemainScatterTimer += SuperLU_timer_() - tt_start;
 #endif
diff -pruN 6.1.0+dfsg1-1/SRC/zSchCompUdt-cuda.c 6.1.1+dfsg1-1/SRC/zSchCompUdt-cuda.c
--- 6.1.0+dfsg1-1/SRC/zSchCompUdt-cuda.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/zSchCompUdt-cuda.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,15 +1,15 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
-/*! @file 
+/*! @file
  * \brief This file contains the main loop of pzgstrf which involves
  *        rank k update of the Schur complement.
  *        Uses CUDA GPU.
@@ -47,24 +47,24 @@ if ( msg0 && msg2 ) {  /* L(:,k) and U(k
 
     lptr = lptr0;
     luptr = luptr0;
-    
+
     nbrow= lsub[1];
     if (myrow==krow) nbrow = lsub[1]-lsub[3];
 
     if (nbrow>0) {
-        
+
         int ncol_max = SUPERLU_MIN(buffer_size/nbrow,bigu_size/ldt);
         int num_streams_used,        /*number of streams that will be used*/
         ncpu_blks;                     /*Number of CPU dgemm blks*/
 
-        int jjj, jjj_st,jjj_global;        
+        int jjj, jjj_st,jjj_global;
         for (j = jj0; j < nub; ++j) {
             arrive_at_ublock( j,&iukp,&rukp,&jb,&ljb,&nsupc,
 	    		      iukp0,rukp0,usub,perm_u,xsup,grid );
 
-            ncols =0 ;  //initialize at 0 
+            ncols =0 ;  //initialize at 0
             jj = iukp;
-            int temp_ldu=0; 
+            int temp_ldu=0;
             for (; jj < iukp+nsupc; ++jj) {
                 segsize = klst - usub[jj];
                 if ( segsize ) {
@@ -78,8 +78,8 @@ if ( msg0 && msg2 ) {  /* L(:,k) and U(k
         } /* end for j = jj0..nub */
 
         jjj = jj0; /* initialization */
-            
-        // #pragma omp barrier 
+
+        // #pragma omp barrier
         while ( jjj < nub ) {
             jjj_st=jjj;
 #ifdef _OPENMP
@@ -88,21 +88,21 @@ if ( msg0 && msg2 ) {  /* L(:,k) and U(k
             {
                 ldu = blk_ldu[jjj_st];
                 for (j = jjj_st; j < nub ; ++j) {
-                    
+
                     /* prefix sum */
                     if (j != jjj_st) full_u_cols[j] += full_u_cols[j-1];
 
-                    ldu = SUPERLU_MAX(ldu, blk_ldu[j]);   
+                    ldu = SUPERLU_MAX(ldu, blk_ldu[j]);
 
                     /* break condition */
                     /* the number of columns that can be processed is limited by buffer size*/
                     if (full_u_cols[j]+((j+1==nub)?0:full_u_cols[j+1]) > ncol_max) {
                         break;
                     }
-                } /* end for j=jjj_st to nub */  
+                } /* end for j=jjj_st to nub */
 
                 jjj_global = SUPERLU_MIN(nub, j+1); /* Maximum value of jjj will be nub */
-                
+
                 // TAU_STATIC_TIMER_START("work_divison");
                 /* Divide CPU-GPU gemm here */
                 gemm_division_cpu_gpu(
@@ -127,8 +127,8 @@ if ( msg0 && msg2 ) {  /* L(:,k) and U(k
                 if(nbrow * full_u_cols[jjj_st] > buffer_size)
                     printf("%d buffer_size %d\n",nbrow*full_u_cols[jjj_st],buffer_size );
             }
-            
-            // #pragma omp barrier 
+
+            // #pragma omp barrier
             /* gathering circuit */
             assert(jjj_st<nub);
             assert(jjj-1<nub);
@@ -160,25 +160,25 @@ if ( msg0 && msg2 ) {  /* L(:,k) and U(k
 
                 rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
 
-            } /* end for j=jjj_st to jjj */  
+            } /* end for j=jjj_st to jjj */
 
 	    if ( num_streams_used > 0 ) {
 #ifdef PI_DEBUG
 		printf("nbrow %d *ldu %d  =%d < ldt %d * max_row_size %d =%d \n",nbrow,ldu,nbrow*ldu,ldt,max_row_size,ldt*max_row_size );
 		assert(nbrow*ldu<=ldt*max_row_size);
-#endif 
+#endif
 		cudaMemcpy2DAsync(dA, nbrow*sizeof(doublecomplex),
 				  &lusup[luptr+(knsupc-ldu)*nsupr],
 				  nsupr*sizeof(doublecomplex), nbrow*sizeof(doublecomplex),
 				  ldu, cudaMemcpyHostToDevice, streams[0]);
 	    }
-                
+
 	    for (int i = 0; i < num_streams_used; ++i) {
-		int st = (i==0) ? ncpu_blks+jjj_st : jjj_st+stream_end_col[i-1]; 
+		int st = (i==0) ? ncpu_blks+jjj_st : jjj_st+stream_end_col[i-1];
 		int st_col = full_u_cols[st-1];
 		int num_col_stream = full_u_cols[jjj_st+stream_end_col[i]-1]-full_u_cols[st-1];
 		tempu = bigU;
-                    
+
 		doublecomplex *tempv1 = bigV + full_u_cols[st-1]*nbrow;
 
 		/* Following is for testing purpose */
@@ -188,18 +188,18 @@ if ( msg0 && msg2 ) {  /* L(:,k) and U(k
 		int c_offset  = st_col * nbrow;
 		size_t B_stream_size = ldu * num_col_stream * sizeof(doublecomplex);
 		size_t C_stream_size = nbrow * num_col_stream * sizeof(doublecomplex);
-		
+
 		assert(ldu*(st_col+num_col_stream) < bigu_size);
 		assert(nbrow*(st_col+num_col_stream) < buffer_size);
-		
+
 		cudaMemcpyAsync(dB+b_offset, tempu+b_offset, B_stream_size,
 				cudaMemcpyHostToDevice, streams[stream_id]);
-		
+
 		cublasCheckErrors(
 				  cublasSetStream(handle[stream_id],
 						  streams[stream_id])
 				  );
-		
+
 		cublasCheckErrors(
 				  cublasZgemm(handle[stream_id],
 					      CUBLAS_OP_N, CUBLAS_OP_N,
@@ -207,36 +207,36 @@ if ( msg0 && msg2 ) {  /* L(:,k) and U(k
  					      (const cuDoubleComplex*) &alpha,
 					      (const cuDoubleComplex*) dA,
 					      nbrow,
-					      (const cuDoubleComplex*) &dB[b_offset], 
+					      (const cuDoubleComplex*) &dB[b_offset],
 					      ldu,
 					      (const cuDoubleComplex*) &beta,
 					      (cuDoubleComplex*)&dC[c_offset],
                                               nbrow)
 				  );
-		
+
 		checkCuda( cudaMemcpyAsync(tempv1, dC+c_offset,
 					   C_stream_size,
 					   cudaMemcpyDeviceToHost,
 					   streams[stream_id]) );
-#else 
-		if ( num_col_stream > 0 ) {   
+#else
+		if ( num_col_stream > 0 ) {
 		    my_zgemm_("N", "N", &nbrow, &num_col_stream, &ldu,
 			      &alpha, &lusup[luptr+(knsupc-ldu)*nsupr],
 			      &nsupr, tempu+ldu*st_col, &ldu, &beta,
 			      tempv1, &nbrow, 1, 1);
 		}
-		
-#endif 
-		
+
+#endif
+
 	    } /* end for i = 1 to num_streams used */
-	    
+
 	    int num_col = full_u_cols[jjj_st+ncpu_blks-1];
 	    int st_col = 0;        /*special case for cpu */
 	    tempv = bigV + nbrow * st_col;
 	    tempu = bigU;
-	    
+
 	    double tstart = SuperLU_timer_();
-#if defined (USE_VENDOR_BLAS)            
+#if defined (USE_VENDOR_BLAS)
 	    zgemm_("N", "N", &nbrow, &num_col, &ldu, &alpha,
 		  &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr,
 		  tempu+ldu*st_col, &ldu, &beta, tempv, &nbrow, 1, 1);
@@ -247,12 +247,12 @@ if ( msg0 && msg2 ) {  /* L(:,k) and U(k
 #endif
 	    gemm_timer += SuperLU_timer_() -tstart;
 	    stat->ops[FACT] += 2 * nbrow * ldu * full_u_cols[jjj-1];
-	    
+
 	    // printf("after zgemm \n");
-	    
+
             /* Now scattering blocks handled by cpu */
             int temp_ncol;
-	    
+
             /* scatter first blocks which cpu has computated*/
             tstart = SuperLU_timer_();
 
@@ -267,14 +267,14 @@ if ( msg0 && msg2 ) {  /* L(:,k) and U(k
 #endif
             {
                 int thread_id = omp_get_thread_num();
-        
+
                 int* indirect_thread = indirect + ldt*thread_id;
                 int* indirect2_thread = indirect2 + ldt*thread_id;
                 doublecomplex* tempv1;
-                
+
                 if (ncpu_blks< omp_get_num_threads()) {
                     // TAU_STATIC_TIMER_START("SPECIAL_CPU_SCATTER");
-                    
+
                     for (j = jjj_st; j < jjj_st+ncpu_blks; ++j) {
                         /* code */
                         #ifdef PI_DEBUG
@@ -341,7 +341,7 @@ if ( msg0 && msg2 ) {  /* L(:,k) and U(k
                                 printf("cpu scatter \n");
                                 printf("A(%d,%d) goes to L block %d \n", ib,jb,ljb);
 #endif
-                                
+
                                 tempv = tempv1+cum_nrow;
 
                                 zscatter_l (
@@ -370,7 +370,7 @@ if ( msg0 && msg2 ) {  /* L(:,k) and U(k
                         /* code */
                         #ifdef PI_DEBUG
                             printf("scattering %d  block column\n",j);
-                        #endif 
+                        #endif
 
                         /* == processing each of the remaining columns == */
                         if(j==jjj_st) tempv1 = bigV;
@@ -394,7 +394,7 @@ if ( msg0 && msg2 ) {  /* L(:,k) and U(k
 			    if(j==jjj_st) {
 				temp_ncol = full_u_cols[j];
 			    } else {
-				temp_ncol = full_u_cols[j]- full_u_cols[j-1];  
+				temp_ncol = full_u_cols[j]- full_u_cols[j-1];
 			    }
 			    printf("%d %d %d \n",temp_nbrow, temp_ncol,ldu);
 #endif
@@ -435,7 +435,7 @@ if ( msg0 && msg2 ) {  /* L(:,k) and U(k
 			    lptr += temp_nbrow;
 			    luptr += temp_nbrow;
 			    cum_nrow += temp_nbrow;
-			
+
 			} /* for lb ... */
 
 			luptr=luptr0;
@@ -443,7 +443,7 @@ if ( msg0 && msg2 ) {  /* L(:,k) and U(k
 		}     /* else if (ncpu_blks >= omp_get_num_threads()) */
 	    }         /* parallel region */
 
-	    scatter_timer += SuperLU_timer_() - tstart; 
+	    scatter_timer += SuperLU_timer_() - tstart;
 #ifdef _OPENMP
 #pragma omp parallel							\
     private(j,iukp,rukp, tempu, tempv, cum_nrow, jb, nsupc,ljb,		\
@@ -455,7 +455,7 @@ if ( msg0 && msg2 ) {  /* L(:,k) and U(k
 #endif
             {
                 int thread_id = omp_get_thread_num();
-        
+
                 int* indirect_thread = indirect + ldt*thread_id;
                 int* indirect2_thread = indirect2 + ldt*thread_id;
                 doublecomplex* tempv1;
@@ -467,12 +467,12 @@ if ( msg0 && msg2 ) {  /* L(:,k) and U(k
                     assert(jjj_st1>jjj_st) ;
 
                     /* now scatter it */
-#pragma omp for schedule( SCHEDULE_STRATEGY ) nowait 
+#pragma omp for schedule( SCHEDULE_STRATEGY ) nowait
                     for (j = jjj_st1; j < jjj_end; ++j) {
                         /* code */
 #ifdef PI_DEBUG
 			printf("scattering %d  block column\n",j);
-#endif 
+#endif
                         /* == processing each of the remaining columns == */
 
                         if(j==jjj_st) tempv1 = bigV;
@@ -495,7 +495,7 @@ if ( msg0 && msg2 ) {  /* L(:,k) and U(k
 			    if(j==jjj_st) {
 				temp_ncol = full_u_cols[j];
 			    } else {
-				temp_ncol = full_u_cols[j]- full_u_cols[j-1];  
+				temp_ncol = full_u_cols[j]- full_u_cols[j-1];
 			    }
 			    printf("%d %d %d \n",temp_nbrow, temp_ncol,ldu);
 #endif
@@ -535,19 +535,19 @@ if ( msg0 && msg2 ) {  /* L(:,k) and U(k
                             lptr += temp_nbrow;
                             luptr += temp_nbrow;
                             cum_nrow += temp_nbrow;
-			    
+
                         } /* for lb ... */
 
                         luptr=luptr0;
                     } /* for j = jjj_st ... */
-                    
+
                 } /* end for i = 0 to nstreams */
                 // TAU_STATIC_TIMER_STOP("GPU_SCATTER");
                 // TAU_STATIC_TIMER_STOP("INSIDE_OMP");
             } /* end pragma omp parallel */
             // TAU_STATIC_TIMER_STOP("OUTSIDE_OMP");
         }  /* end while(jjj<nub) */
- 
+
     } /* if nbrow>0 */
 
  }   /* if msg1 and msg 2 */
diff -pruN 6.1.0+dfsg1-1/SRC/zutil_dist.c 6.1.1+dfsg1-1/SRC/zutil_dist.c
--- 6.1.0+dfsg1-1/SRC/zutil_dist.c	2018-12-09 20:29:35.000000000 +0000
+++ 6.1.1+dfsg1-1/SRC/zutil_dist.c	2019-02-08 16:30:10.000000000 +0000
@@ -1,19 +1,19 @@
 /*! \file
 Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
 
-All rights reserved. 
+All rights reserved.
 
 The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
-/*! @file 
+/*! @file
  * \brief Several matrix utilities
  *
  * <pre>
- * -- Distributed SuperLU routine (version 2.0) --
+ * -- Distributed SuperLU routine (version 6.1.1) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * March 15, 2003
  *
@@ -23,7 +23,7 @@ at the top-level directory.
 #include "superlu_zdefs.h"
 
 void
-zCreate_CompCol_Matrix_dist(SuperMatrix *A, int_t m, int_t n, int_t nnz, 
+zCreate_CompCol_Matrix_dist(SuperMatrix *A, int_t m, int_t n, int_t nnz,
 			    doublecomplex *nzval, int_t *rowind, int_t *colptr,
 			    Stype_t stype, Dtype_t dtype, Mtype_t mtype)
 {
@@ -70,7 +70,7 @@ zCreate_CompRowLoc_Matrix_dist(SuperMatr
 /*! \brief Convert a row compressed storage into a column compressed storage.
  */
 void
-zCompRow_to_CompCol_dist(int_t m, int_t n, int_t nnz, 
+zCompRow_to_CompCol_dist(int_t m, int_t n, int_t nnz,
                          doublecomplex *a, int_t *colind, int_t *rowptr,
                          doublecomplex **at, int_t **rowind, int_t **colptr)
 {
@@ -82,7 +82,7 @@ zCompRow_to_CompCol_dist(int_t m, int_t
     *rowind = intMalloc_dist(nnz);
     *colptr = intMalloc_dist(n+1);
     marker = intCalloc_dist(n);
-    
+
     /* Get counts of each column of A, and set up column pointers */
     for (i = 0; i < m; ++i)
 	for (j = rowptr[i]; j < rowptr[i+1]; ++j) ++marker[colind[j]];
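[Note on the hunks above: zCompRow_to_CompCol_dist converts a compressed-row (CSR) matrix into compressed-column (CSC) storage and, as the intMalloc_dist calls show, allocates the output arrays itself. The following is a minimal, hedged C sketch of how it might be called; it is not part of the patch, and the variable names are illustrative only.]

#include "superlu_zdefs.h"

/* Sketch: convert a 2x2 CSR matrix with 3 nonzeros to CSC form.
 * The routine allocates *at, *rowind and *colptr internally. */
void csr_to_csc_sketch(void)
{
    int_t m = 2, n = 2, nnz = 3;
    doublecomplex a[3] = { {1.0, 0.0}, {2.0, 0.0}, {3.0, 0.0} };
    int_t colind[3] = { 0, 1, 1 };   /* column index of each stored entry            */
    int_t rowptr[3] = { 0, 2, 3 };   /* row i owns entries a[rowptr[i]..rowptr[i+1]) */
    doublecomplex *at;
    int_t *rowind, *colptr;

    zCompRow_to_CompCol_dist(m, n, nnz, a, colind, rowptr,
                             &at, &rowind, &colptr);
    /* ... use the CSC arrays, then release them (e.g. with SUPERLU_FREE) ... */
}
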
@@ -133,7 +133,7 @@ void zPrint_CompCol_Matrix_dist(SuperMat
     NCformat     *Astore;
     register int i;
     doublecomplex       *dp;
-    
+
     printf("\nCompCol matrix: ");
     printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
     Astore = (NCformat *) A->Store;
@@ -144,10 +144,10 @@ void zPrint_CompCol_Matrix_dist(SuperMat
         for (i = 0; i < Astore->nnz; ++i) printf("%f\t%f\n", dp[i].r, dp[i].i);
     }
     printf("\nrowind:\n");
-    for (i = 0; i < Astore->nnz; ++i) 
+    for (i = 0; i < Astore->nnz; ++i)
         printf("%lld  ", (long long) Astore->rowind[i]);
     printf("\ncolptr:\n");
-    for (i = 0; i <= A->ncol; ++i) 
+    for (i = 0; i <= A->ncol; ++i)
         printf("%lld  ", (long long) Astore->colptr[i]);
     printf("\nend CompCol matrix.\n");
 }
@@ -157,12 +157,12 @@ void zPrint_Dense_Matrix_dist(SuperMatri
     DNformat     *Astore;
     register int i;
     doublecomplex       *dp;
-    
+
     printf("\nDense matrix: ");
     printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
     Astore = (DNformat *) A->Store;
     dp = (doublecomplex *) Astore->nzval;
-    printf("nrow %lld, ncol %lld, lda %lld\n", 
+    printf("nrow %lld, ncol %lld, lda %lld\n",
         (long long) A->nrow, (long long) A->ncol, (long long) Astore->lda);
     printf("\nnzval: ");
     for (i = 0; i < A->nrow; ++i) printf("%f\t%f\n", dp[i].r, dp[i].i);
@@ -174,14 +174,14 @@ int zPrint_CompRowLoc_Matrix_dist(SuperM
     NRformat_loc  *Astore;
     int_t  nnz_loc, m_loc;
     doublecomplex  *dp;
-    
+
     printf("\n==== CompRowLoc matrix: ");
     printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
     Astore = (NRformat_loc *) A->Store;
-    printf("nrow %ld, ncol %ld\n", 
+    printf("nrow %ld, ncol %ld\n",
             (long int) A->nrow, (long int) A->ncol);
     nnz_loc = Astore->nnz_loc; m_loc = Astore->m_loc;
-    printf("nnz_loc %ld, m_loc %ld, fst_row %ld\n", (long int) nnz_loc, 
+    printf("nnz_loc %ld, m_loc %ld, fst_row %ld\n", (long int) nnz_loc,
             (long int) m_loc, (long int) Astore->fst_row);
     PrintInt10("rowptr", m_loc+1, Astore->rowptr);
     PrintInt10("colind", nnz_loc, Astore->colind);
@@ -196,7 +196,7 @@ int file_zPrint_CompRowLoc_Matrix_dist(F
     NRformat_loc     *Astore;
     int_t  nnz_loc, m_loc;
     doublecomplex       *dp;
-    
+
     fprintf(fp, "\n==== CompRowLoc matrix: ");
     fprintf(fp, "Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
     Astore = (NRformat_loc *) A->Store;
@@ -218,7 +218,7 @@ zCreate_Dense_Matrix_dist(SuperMatrix *X
 			  Mtype_t mtype)
 {
     DNformat    *Xstore;
-    
+
     X->Stype = stype;
     X->Dtype = dtype;
     X->Mtype = mtype;
@@ -245,14 +245,14 @@ zCopy_Dense_Matrix_dist(int_t M, int_t N
  * </pre>
  */
     int    i, j;
-    
+
     for (j = 0; j < N; ++j)
         for (i = 0; i < M; ++i)
             Y[i + j*ldy] = X[i + j*ldx];
 }
 
 void
-zCreate_SuperNode_Matrix_dist(SuperMatrix *L, int_t m, int_t n, int_t nnz, 
+zCreate_SuperNode_Matrix_dist(SuperMatrix *L, int_t m, int_t n, int_t nnz,
 			      doublecomplex *nzval, int_t *nzval_colptr,
 			      int_t *rowind, int_t *rowind_colptr,
 			      int_t *col_to_sup, int_t *sup_to_col,
@@ -285,7 +285,7 @@ zCreate_SuperNode_Matrix_dist(SuperMatri
  *  and shape as A.
  *  The clone operation would copy all the non-pointer structure members like
  *  nrow, ncol, Stype, Dtype, Mtype from A and allocate a new nested Store
- *  structure. It would also copy nnz_loc, m_loc, fst_row from A->Store 
+ *  structure. It would also copy nnz_loc, m_loc, fst_row from A->Store
  *  into B->Store. It does not copy the matrix entries, row pointers,
  *  or column indices.
  */
@@ -316,16 +316,14 @@ void zClone_CompRowLoc_Matrix_dist(Super
     return;
 }
 
-/* \brief Copy: Call the clone operation and then copies all entries,
- *  row pointers, and column indices of a matrix into another matrix of
- *  the same type, B_{i,j}=A_{i,j}, for i,j=1,...,n
+/* \brief Copy: copies all entries, row pointers, and column indices of
+ *  a matrix into another matrix of the same type,
+ *  B_{i,j}=A_{i,j}, for i,j=1,...,n
  */
 void zCopy_CompRowLoc_Matrix_dist(SuperMatrix *A, SuperMatrix *B)
 {
     NRformat_loc  *Astore, *Bstore;
 
-    zClone_CompRowLoc_Matrix_dist(A, B);
-
     Astore = (NRformat_loc *) A->Store;
     Bstore = (NRformat_loc *) B->Store;
 
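[Note on the hunk above: this is the substantive change in zutil_dist.c; zCopy_CompRowLoc_Matrix_dist no longer performs the clone itself, so cloning the structure and copying the values become two separate calls. A hedged sketch of the resulting calling sequence, not part of the patch, assuming zClone_CompRowLoc_Matrix_dist keeps the (A, B) argument order suggested by the hunk header:]

    SuperMatrix A, B;
    /* ... A created with zCreate_CompRowLoc_Matrix_dist ... */
    zClone_CompRowLoc_Matrix_dist(&A, &B);  /* allocate B with A's structure, no values   */
    zCopy_CompRowLoc_Matrix_dist(&A, &B);   /* copy entries, row pointers, column indices */
    /* up to 6.1.0 the copy routine made the clone call internally (see the removed line) */
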
@@ -425,7 +423,7 @@ zFillRHS_dist(char *trans, int_t nrhs, d
 
 /*! \brief Fills a doublecomplex precision array with a given value.
  */
-void 
+void
 zfill_dist(doublecomplex *a, int_t alen, doublecomplex dval)
 {
     register int_t i;
@@ -434,7 +432,7 @@ zfill_dist(doublecomplex *a, int_t alen,
 
 
 
-/*! \brief Check the inf-norm of the error vector 
+/*! \brief Check the inf-norm of the error vector
  */
 void zinf_norm_error_dist(int_t n, int_t nrhs, doublecomplex *x, int_t ldx,
 			  doublecomplex *xtrue, int_t ldxtrue,
@@ -462,7 +460,7 @@ void zinf_norm_error_dist(int_t n, int_t
 void PrintDoublecomplex(char *name, int_t len, doublecomplex *x)
 {
     register int_t i;
-    
+
     printf("%10s:\tReal\tImag\n", name);
     for (i = 0; i < len; ++i)
 	printf("\t" IFMT "\t%.4f\t%.4f\n", i, x[i].r, x[i].i);
@@ -471,7 +469,7 @@ void PrintDoublecomplex(char *name, int_
 int file_PrintDoublecomplex(FILE *fp, char *name, int_t len, doublecomplex *x)
 {
     register int_t i;
-    
+
     fprintf(fp, "%10s:\tReal\tImag\n", name);
     for (i = 0; i < len; ++i)
 	fprintf(fp, "\t" IFMT "\t%.4f\t%.4f\n", i, x[i].r, x[i].i);
@@ -506,7 +504,7 @@ void zPrintLblocks(int iam, int_t nsuper
 		   iam, gb, lb, nsupc, nb);
 	    for (c = 0, k = BC_HEADER, r = 0; c < nb; ++c) {
 		len = index[k+1];
-		printf("[%d] row-block %d: block # " IFMT "\tlength %d\n", 
+		printf("[%d] row-block %d: block # " IFMT "\tlength %d\n",
 		       iam, c, index[k], len);
 		PrintInt10("lsub", len, &index[k+LB_DESCRIPTOR]);
 		for (j = 0; j < nsupc; ++j) {
@@ -523,7 +521,7 @@ void zPrintLblocks(int iam, int_t nsuper
     printf("nfrecvx " IFMT "\n", Llu->nfrecvx);
     k = CEILING( nsupers, grid->nprow );
     PrintInt10("fmod", k, Llu->fmod);
-    
+
 } /* ZPRINTLBLOCKS */
 
 
@@ -574,8 +572,8 @@ void zDumpLblocks(int iam, int_t nsupers
     int_t *index;
     doublecomplex *nzval;
 	char filename[256];
-	FILE *fp, *fopen();	
- 
+	FILE *fp, *fopen();
+
 	// assert(grid->npcol*grid->nprow==1);
 
 	// count nonzeros in the first pass
@@ -595,27 +593,27 @@ void zDumpLblocks(int iam, int_t nsupers
 	    nsupc = SuperSize( gb );
 	    for (c = 0, k = BC_HEADER, r = 0; c < nb; ++c) {
 		len = index[k+1];
-		
+
 		for (j = 0; j < nsupc; ++j) {
 		for (i=0; i<len; ++i){
-		
+
 		if(index[k+LB_DESCRIPTOR+i]+1>=xsup[gb]+j+1){
-			nnzL ++; 
-			nmax = SUPERLU_MAX(n,index[k+LB_DESCRIPTOR+i]+1);  
+			nnzL ++;
+			nmax = SUPERLU_MAX(n,index[k+LB_DESCRIPTOR+i]+1);
 			n = nmax;
 		}
-		
+
 		}
 		}
 		k += LB_DESCRIPTOR + len;
 		r += len;
 	    }
-	}	
-    }	
+	}
+    }
 	MPI_Allreduce(MPI_IN_PLACE,&nnzL,1,mpi_int_t,MPI_SUM,grid->comm);
-	MPI_Allreduce(MPI_IN_PLACE,&n,1,mpi_int_t,MPI_MAX,grid->comm);	
-	
-	snprintf(filename, sizeof(filename), "%s-%d", "L", iam);    
+	MPI_Allreduce(MPI_IN_PLACE,&n,1,mpi_int_t,MPI_MAX,grid->comm);
+
+	snprintf(filename, sizeof(filename), "%s-%d", "L", iam);
     printf("Dumping L factor to --> %s\n", filename);
  	if ( !(fp = fopen(filename, "w")) ) {
 			ABORT("File open failed");
@@ -624,7 +622,7 @@ void zDumpLblocks(int iam, int_t nsupers
 	if(grid->iam==0){
 		fprintf(fp, "%d %d %d\n", n,n,nnzL);
 	}
-	
+
      ncb = nsupers / grid->npcol;
     extra = nsupers % grid->npcol;
     mycol = MYCOL( iam, grid );
@@ -639,29 +637,29 @@ void zDumpLblocks(int iam, int_t nsupers
 	    nsupc = SuperSize( gb );
 	    for (c = 0, k = BC_HEADER, r = 0; c < nb; ++c) {
 		len = index[k+1];
-		
+
 		for (j = 0; j < nsupc; ++j) {
 		for (i=0; i<len; ++i){
 			fprintf(fp, IFMT IFMT " %e\n", index[k+LB_DESCRIPTOR+i]+1, xsup[gb]+j+1, (double)iam);
-#if 0		
+#if 0
 			fprintf(fp, IFMT IFMT " %e %e\n", index[k+LB_DESCRIPTOR+i]+1, xsup[gb]+j+1, nzval[r +i+ j*nsupr].r,nzval[r +i+ j*nsupr].i);
-#endif		
+#endif
 		}
 		}
 		k += LB_DESCRIPTOR + len;
 		r += len;
 	    }
-	}	
+	}
     }
  	fclose(fp);
- 	
+
 } /* zDumpLblocks */
 
 
 
 /*! \brief Print the blocks in the factored matrix U.
  */
-void zPrintUblocks(int iam, int_t nsupers, gridinfo_t *grid, 
+void zPrintUblocks(int iam, int_t nsupers, gridinfo_t *grid,
 		  Glu_persist_t *Glu_persist, LocalLU_t *Llu)
 {
     register int c, extra, jb, k, lb, len, nb, nrb, nsupc;
@@ -686,7 +684,7 @@ void zPrintUblocks(int iam, int_t nsuper
 	    for (c = 0, k = BR_HEADER; c < nb; ++c) {
 		jb = index[k];
 		len = index[k+1];
-		printf("[%d] col-block %d: block # %d\tlength " IFMT "\n", 
+		printf("[%d] col-block %d: block # %d\tlength " IFMT "\n",
 		       iam, c, jb, index[k+1]);
 		nsupc = SuperSize( jb );
 		PrintInt10("fstnz", nsupc, &index[k+UB_DESCRIPTOR]);
