Actual source code: bvlapack.c
1: /*
2: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
3: SLEPc - Scalable Library for Eigenvalue Problem Computations
4: Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain
6: This file is part of SLEPc.
7: SLEPc is distributed under a 2-clause BSD license (see LICENSE).
8: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
9: */
10: /*
11: BV private kernels that use the LAPACK
12: */
14: #include <slepc/private/bvimpl.h>
15: #include <slepcblaslapack.h>
17: /*
18: Reduction operation to compute sqrt(x**2+y**2) when normalizing vectors
19: */
20: SLEPC_EXTERN void MPIAPI SlepcPythag(void *in,void *inout,PetscMPIInt *len,MPI_Datatype *datatype)
21: {
22: PetscBLASInt i,n=*len;
23: PetscReal *x = (PetscReal*)in,*y = (PetscReal*)inout;
25: PetscFunctionBegin;
26: if (PetscUnlikely(*datatype!=MPIU_REAL)) {
27: (void)(*PetscErrorPrintf)("Only implemented for MPIU_REAL data type");
28: MPI_Abort(PETSC_COMM_WORLD,1);
29: }
30: for (i=0;i<n;i++) y[i] = SlepcAbs(x[i],y[i]);
31: PetscFunctionReturnVoid();
32: }
34: /*
35: Compute ||A|| for an mxn matrix
36: */
37: PetscErrorCode BVNorm_LAPACK_Private(BV bv,PetscInt m_,PetscInt n_,const PetscScalar *A,PetscInt lda_,NormType type,PetscReal *nrm,PetscBool mpi)
38: {
39: PetscBLASInt m,n,lda,i,j;
40: PetscMPIInt len;
41: PetscReal lnrm,*rwork=NULL,*rwork2=NULL;
43: PetscFunctionBegin;
44: PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
45: PetscCall(PetscBLASIntCast(m_,&m));
46: PetscCall(PetscBLASIntCast(n_,&n));
47: PetscCall(PetscBLASIntCast(lda_,&lda));
48: if (type==NORM_FROBENIUS || type==NORM_2) {
49: lnrm = LAPACKlange_("F",&m,&n,(PetscScalar*)A,&lda,rwork);
50: if (mpi) PetscCallMPI(MPIU_Allreduce(&lnrm,nrm,1,MPIU_REAL,MPIU_LAPY2,PetscObjectComm((PetscObject)bv)));
51: else *nrm = lnrm;
52: PetscCall(PetscLogFlops(2.0*m*n));
53: } else if (type==NORM_1) {
54: if (mpi) {
55: PetscCall(BVAllocateWork_Private(bv,2*n_));
56: rwork = (PetscReal*)bv->work;
57: rwork2 = rwork+n_;
58: PetscCall(PetscArrayzero(rwork,n_));
59: PetscCall(PetscArrayzero(rwork2,n_));
60: for (j=0;j<n_;j++) {
61: for (i=0;i<m_;i++) {
62: rwork[j] += PetscAbsScalar(A[i+j*lda_]);
63: }
64: }
65: PetscCall(PetscMPIIntCast(n_,&len));
66: PetscCallMPI(MPIU_Allreduce(rwork,rwork2,len,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)bv)));
67: *nrm = 0.0;
68: for (j=0;j<n_;j++) if (rwork2[j] > *nrm) *nrm = rwork2[j];
69: } else {
70: *nrm = LAPACKlange_("O",&m,&n,(PetscScalar*)A,&lda,rwork);
71: }
72: PetscCall(PetscLogFlops(1.0*m*n));
73: } else if (type==NORM_INFINITY) {
74: PetscCall(BVAllocateWork_Private(bv,m_));
75: rwork = (PetscReal*)bv->work;
76: lnrm = LAPACKlange_("I",&m,&n,(PetscScalar*)A,&lda,rwork);
77: if (mpi) PetscCallMPI(MPIU_Allreduce(&lnrm,nrm,1,MPIU_REAL,MPIU_MAX,PetscObjectComm((PetscObject)bv)));
78: else *nrm = lnrm;
79: PetscCall(PetscLogFlops(1.0*m*n));
80: }
81: PetscCall(PetscFPTrapPop());
82: PetscFunctionReturn(PETSC_SUCCESS);
83: }
85: /*
86: Normalize the columns of an mxn matrix A
87: */
88: PetscErrorCode BVNormalize_LAPACK_Private(BV bv,PetscInt m_,PetscInt n_,const PetscScalar *A,PetscInt lda_,PetscScalar *eigi,PetscBool mpi)
89: {
90: PetscBLASInt m,lda,j,k,info,zero=0;
91: PetscMPIInt len;
92: PetscReal *norms,*rwork=NULL,*rwork2=NULL,done=1.0;
94: PetscFunctionBegin;
95: PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
96: PetscCall(PetscBLASIntCast(m_,&m));
97: PetscCall(PetscBLASIntCast(lda_,&lda));
98: PetscCall(BVAllocateWork_Private(bv,2*n_));
99: rwork = (PetscReal*)bv->work;
100: rwork2 = rwork+n_;
101: /* compute local norms */
102: for (j=0;j<n_;j++) {
103: k = 1;
104: if (!PetscDefined(USE_COMPLEX) && eigi && eigi[j] != 0.0) k = 2;
105: rwork[j] = LAPACKlange_("F",&m,&k,(PetscScalar*)(A+j*lda_),&lda,rwork2);
106: if (k==2) { rwork[j+1] = rwork[j]; j++; }
107: }
108: /* reduction to get global norms */
109: if (mpi) {
110: PetscCall(PetscMPIIntCast(n_,&len));
111: PetscCall(PetscArrayzero(rwork2,n_));
112: PetscCallMPI(MPIU_Allreduce(rwork,rwork2,len,MPIU_REAL,MPIU_LAPY2,PetscObjectComm((PetscObject)bv)));
113: norms = rwork2;
114: } else norms = rwork;
115: /* scale columns */
116: for (j=0;j<n_;j++) {
117: k = 1;
118: #if !defined(PETSC_USE_COMPLEX)
119: if (eigi && eigi[j] != 0.0) k = 2;
120: #endif
121: PetscCallBLAS("LAPACKlascl",LAPACKlascl_("G",&zero,&zero,norms+j,&done,&m,&k,(PetscScalar*)(A+j*lda_),&lda,&info));
122: SlepcCheckLapackInfo("lascl",info);
123: if (k==2) j++;
124: }
125: PetscCall(PetscLogFlops(3.0*m*n_));
126: PetscCall(PetscFPTrapPop());
127: PetscFunctionReturn(PETSC_SUCCESS);
128: }
130: /*
131: Compute the upper Cholesky factor in R and its inverse in S.
132: If S == R then the inverse overwrites the Cholesky factor.
133: */
134: PetscErrorCode BVMatCholInv_LAPACK_Private(BV bv,Mat R,Mat S)
135: {
136: PetscInt i,k,l,n,m,ld,lds;
137: PetscScalar *pR,*pS;
138: PetscBLASInt info,n_ = 0,m_ = 0,ld_,lds_;
140: PetscFunctionBegin;
141: l = bv->l;
142: k = bv->k;
143: PetscCall(MatGetSize(R,&m,NULL));
144: n = k-l;
145: PetscCall(PetscBLASIntCast(m,&m_));
146: PetscCall(PetscBLASIntCast(n,&n_));
147: ld = m;
148: ld_ = m_;
149: PetscCall(MatDenseGetArray(R,&pR));
151: if (S==R) {
152: PetscCall(BVAllocateWork_Private(bv,m*k));
153: pS = bv->work;
154: lds = ld;
155: lds_ = ld_;
156: } else {
157: PetscCall(MatDenseGetArray(S,&pS));
158: PetscCall(MatGetSize(S,&lds,NULL));
159: PetscCall(PetscBLASIntCast(lds,&lds_));
160: }
162: /* save a copy of matrix in S */
163: for (i=l;i<k;i++) PetscCall(PetscArraycpy(pS+i*lds+l,pR+i*ld+l,n));
165: /* compute upper Cholesky factor in R */
166: PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
167: PetscCallBLAS("LAPACKpotrf",LAPACKpotrf_("U",&n_,pR+l*ld+l,&ld_,&info));
168: PetscCall(PetscLogFlops((1.0*n*n*n)/3.0));
170: if (info) { /* LAPACKpotrf failed, retry on diagonally perturbed matrix */
171: for (i=l;i<k;i++) {
172: PetscCall(PetscArraycpy(pR+i*ld+l,pS+i*lds+l,n));
173: pR[i+i*ld] += 50.0*PETSC_MACHINE_EPSILON;
174: }
175: PetscCallBLAS("LAPACKpotrf",LAPACKpotrf_("U",&n_,pR+l*ld+l,&ld_,&info));
176: SlepcCheckLapackInfo("potrf",info);
177: PetscCall(PetscLogFlops((1.0*n*n*n)/3.0));
178: }
180: /* compute S = inv(R) */
181: if (S==R) {
182: PetscCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pR+l*ld+l,&ld_,&info));
183: } else {
184: PetscCall(PetscArrayzero(pS+l*lds,(k-l)*k));
185: for (i=l;i<k;i++) PetscCall(PetscArraycpy(pS+i*lds+l,pR+i*ld+l,n));
186: PetscCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pS+l*lds+l,&lds_,&info));
187: }
188: SlepcCheckLapackInfo("trtri",info);
189: PetscCall(PetscFPTrapPop());
190: PetscCall(PetscLogFlops(0.33*n*n*n));
192: /* Zero out entries below the diagonal */
193: for (i=l;i<k-1;i++) {
194: PetscCall(PetscArrayzero(pR+i*ld+i+1,(k-i-1)));
195: if (S!=R) PetscCall(PetscArrayzero(pS+i*lds+i+1,(k-i-1)));
196: }
197: PetscCall(MatDenseRestoreArray(R,&pR));
198: if (S!=R) PetscCall(MatDenseRestoreArray(S,&pS));
199: PetscFunctionReturn(PETSC_SUCCESS);
200: }
202: /*
203: Compute the inverse of an upper triangular matrix R, store it in S.
204: If S == R then the inverse overwrites R.
205: */
206: PetscErrorCode BVMatTriInv_LAPACK_Private(BV bv,Mat R,Mat S)
207: {
208: PetscInt i,k,l,n,m,ld,lds;
209: PetscScalar *pR,*pS;
210: PetscBLASInt info,n_,m_ = 0,ld_,lds_;
212: PetscFunctionBegin;
213: l = bv->l;
214: k = bv->k;
215: PetscCall(MatGetSize(R,&m,NULL));
216: n = k-l;
217: PetscCall(PetscBLASIntCast(m,&m_));
218: PetscCall(PetscBLASIntCast(n,&n_));
219: ld = m;
220: ld_ = m_;
221: PetscCall(MatDenseGetArray(R,&pR));
223: if (S==R) {
224: PetscCall(BVAllocateWork_Private(bv,m*k));
225: pS = bv->work;
226: lds = ld;
227: lds_ = ld_;
228: } else {
229: PetscCall(MatDenseGetArray(S,&pS));
230: PetscCall(MatGetSize(S,&lds,NULL));
231: PetscCall(PetscBLASIntCast(lds,&lds_));
232: }
234: /* compute S = inv(R) */
235: PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
236: if (S==R) {
237: PetscCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pR+l*ld+l,&ld_,&info));
238: } else {
239: PetscCall(PetscArrayzero(pS+l*lds,(k-l)*k));
240: for (i=l;i<k;i++) PetscCall(PetscArraycpy(pS+i*lds+l,pR+i*ld+l,n));
241: PetscCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pS+l*lds+l,&lds_,&info));
242: }
243: SlepcCheckLapackInfo("trtri",info);
244: PetscCall(PetscFPTrapPop());
245: PetscCall(PetscLogFlops(0.33*n*n*n));
247: PetscCall(MatDenseRestoreArray(R,&pR));
248: if (S!=R) PetscCall(MatDenseRestoreArray(S,&pS));
249: PetscFunctionReturn(PETSC_SUCCESS);
250: }
252: /*
253: Compute the matrix to be used for post-multiplying the basis in the SVQB
254: block orthogonalization method.
255: On input R = V'*V, on output S = D*U*Lambda^{-1/2} where (U,Lambda) is
256: the eigendecomposition of D*R*D with D=diag(R)^{-1/2}.
257: If S == R then the result overwrites R.
258: */
259: PetscErrorCode BVMatSVQB_LAPACK_Private(BV bv,Mat R,Mat S)
260: {
261: PetscInt i,j,k,l,n,m,ld,lds;
262: PetscScalar *pR,*pS,*D,*work,a;
263: PetscReal *eig,dummy;
264: PetscBLASInt info,lwork,n_,m_ = 0,ld_,lds_;
265: #if defined(PETSC_USE_COMPLEX)
266: PetscReal *rwork,rdummy;
267: #endif
269: PetscFunctionBegin;
270: l = bv->l;
271: k = bv->k;
272: PetscCall(MatGetSize(R,&m,NULL));
273: PetscCall(MatDenseGetLDA(R,&ld));
274: n = k-l;
275: PetscCall(PetscBLASIntCast(m,&m_));
276: PetscCall(PetscBLASIntCast(n,&n_));
277: ld_ = m_;
278: PetscCall(MatDenseGetArray(R,&pR));
280: if (S==R) {
281: pS = pR;
282: lds = ld;
283: lds_ = ld_;
284: } else {
285: PetscCall(MatDenseGetArray(S,&pS));
286: PetscCall(MatDenseGetLDA(S,&lds));
287: PetscCall(PetscBLASIntCast(lds,&lds_));
288: }
289: PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
291: /* workspace query and memory allocation */
292: lwork = -1;
293: #if defined(PETSC_USE_COMPLEX)
294: PetscCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS,&lds_,&dummy,&a,&lwork,&rdummy,&info));
295: PetscCall(PetscBLASIntCast((PetscInt)PetscRealPart(a),&lwork));
296: PetscCall(PetscMalloc4(n,&eig,n,&D,lwork,&work,PetscMax(1,3*n-2),&rwork));
297: #else
298: PetscCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS,&lds_,&dummy,&a,&lwork,&info));
299: PetscCall(PetscBLASIntCast((PetscInt)a,&lwork));
300: PetscCall(PetscMalloc3(n,&eig,n,&D,lwork,&work));
301: #endif
303: /* copy and scale matrix */
304: for (i=l;i<k;i++) D[i-l] = 1.0/PetscSqrtReal(PetscRealPart(pR[i+i*ld]));
305: for (i=l;i<k;i++) for (j=l;j<k;j++) pS[i+j*lds] = pR[i+j*ld]*D[i-l];
306: for (j=l;j<k;j++) for (i=l;i<k;i++) pS[i+j*lds] *= D[j-l];
308: /* compute eigendecomposition */
309: #if defined(PETSC_USE_COMPLEX)
310: PetscCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS+l*lds+l,&lds_,eig,work,&lwork,rwork,&info));
311: #else
312: PetscCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS+l*lds+l,&lds_,eig,work,&lwork,&info));
313: #endif
314: SlepcCheckLapackInfo("syev",info);
316: if (S!=R) { /* R = U' */
317: for (i=l;i<k;i++) for (j=l;j<k;j++) pR[i+j*ld] = pS[j+i*lds];
318: }
320: /* compute S = D*U*Lambda^{-1/2} */
321: for (i=l;i<k;i++) for (j=l;j<k;j++) pS[i+j*lds] *= D[i-l];
322: for (j=l;j<k;j++) for (i=l;i<k;i++) pS[i+j*lds] /= PetscSqrtReal(eig[j-l]);
324: if (S!=R) { /* compute R = inv(S) = Lambda^{1/2}*U'/D */
325: for (i=l;i<k;i++) for (j=l;j<k;j++) pR[i+j*ld] *= PetscSqrtReal(eig[i-l]);
326: for (j=l;j<k;j++) for (i=l;i<k;i++) pR[i+j*ld] /= D[j-l];
327: }
329: #if defined(PETSC_USE_COMPLEX)
330: PetscCall(PetscFree4(eig,D,work,rwork));
331: #else
332: PetscCall(PetscFree3(eig,D,work));
333: #endif
334: PetscCall(PetscLogFlops(9.0*n*n*n));
335: PetscCall(PetscFPTrapPop());
337: PetscCall(MatDenseRestoreArray(R,&pR));
338: if (S!=R) PetscCall(MatDenseRestoreArray(S,&pS));
339: PetscFunctionReturn(PETSC_SUCCESS);
340: }
342: /*
343: QR factorization of an mxn matrix via parallel TSQR
344: */
345: PetscErrorCode BVOrthogonalize_LAPACK_TSQR(BV bv,PetscInt m_,PetscInt n_,PetscScalar *Q,PetscInt ldq_,PetscScalar *R,PetscInt ldr)
346: {
347: PetscInt level,plevel,nlevels,lda,worklen;
348: PetscBLASInt m,n,ldq,i,j,k,l,nb,sz,lwork,info;
349: PetscScalar *tau,*work,*A=NULL,*QQ=NULL,*Qhalf,*C=NULL,one=1.0,zero=0.0;
350: PetscMPIInt rank,size,count,stride,powtwo,s = 0;
351: MPI_Datatype tmat;
353: PetscFunctionBegin;
354: PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
355: PetscCall(PetscBLASIntCast(m_,&m));
356: PetscCall(PetscBLASIntCast(n_,&n));
357: PetscCall(PetscBLASIntCast(ldq_,&ldq));
358: k = PetscMin(m,n);
359: nb = 16;
360: lda = 2*n;
361: PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)bv),&size));
362: PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)bv),&rank));
363: nlevels = (PetscInt)PetscCeilReal(PetscLog2Real((PetscReal)size));
364: PetscCall(PetscMPIIntCast(PetscPowInt(2,(PetscInt)PetscFloorReal(PetscLog2Real((PetscReal)size))),&powtwo));
365: worklen = n+n*nb;
366: if (nlevels) worklen += n*lda+n*lda*nlevels+n*lda;
367: PetscCall(BVAllocateWork_Private(bv,worklen));
368: tau = bv->work;
369: work = bv->work+n;
370: PetscCall(PetscBLASIntCast(n*nb,&lwork));
371: if (nlevels) {
372: A = bv->work+n+n*nb;
373: QQ = bv->work+n+n*nb+n*lda;
374: C = bv->work+n+n*nb+n*lda+n*lda*nlevels;
375: }
377: /* Compute QR */
378: PetscCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&m,&n,Q,&ldq,tau,work,&lwork,&info));
379: SlepcCheckLapackInfo("geqrf",info);
381: /* Extract R */
382: if (R || nlevels) {
383: for (j=0;j<n;j++) {
384: for (i=0;i<=PetscMin(j,m-1);i++) {
385: if (nlevels) A[i+j*lda] = Q[i+j*ldq];
386: else R[i+j*ldr] = Q[i+j*ldq];
387: }
388: for (i=PetscMin(j,m-1)+1;i<n;i++) {
389: if (nlevels) A[i+j*lda] = 0.0;
390: else R[i+j*ldr] = 0.0;
391: }
392: }
393: }
395: /* Compute orthogonal matrix in Q */
396: PetscCallBLAS("LAPACKorgqr",LAPACKorgqr_(&m,&k,&k,Q,&ldq,tau,work,&lwork,&info));
397: SlepcCheckLapackInfo("orgqr",info);
399: if (nlevels) {
401: PetscCall(PetscMPIIntCast(n,&count));
402: PetscCall(PetscMPIIntCast(lda,&stride));
403: PetscCall(PetscBLASIntCast(lda,&l));
404: PetscCallMPI(MPI_Type_vector(count,count,stride,MPIU_SCALAR,&tmat));
405: PetscCallMPI(MPI_Type_commit(&tmat));
407: for (level=nlevels;level>=1;level--) {
409: plevel = PetscPowInt(2,level);
410: PetscCall(PetscMPIIntCast(plevel*PetscFloorReal(rank/(PetscReal)plevel)+(rank+PetscPowInt(2,level-1))%plevel,&s));
412: /* Stack triangular matrices */
413: if (rank<s && s<size) { /* send top part, receive bottom part */
414: PetscCallMPI(MPI_Sendrecv(A,1,tmat,s,111,A+n,1,tmat,s,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE));
415: } else if (s<size) { /* copy top to bottom, receive top part */
416: PetscCallMPI(MPI_Sendrecv(A,1,tmat,rank,111,A+n,1,tmat,rank,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE));
417: PetscCallMPI(MPI_Sendrecv(A+n,1,tmat,s,111,A,1,tmat,s,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE));
418: }
419: if (level<nlevels && size!=powtwo) { /* for cases when size is not a power of 2 */
420: if (rank<size-powtwo) { /* send bottom part */
421: PetscCallMPI(MPI_Send(A+n,1,tmat,rank+powtwo,111,PetscObjectComm((PetscObject)bv)));
422: } else if (rank>=powtwo) { /* receive bottom part */
423: PetscCallMPI(MPI_Recv(A+n,1,tmat,rank-powtwo,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE));
424: }
425: }
426: /* Compute QR and build orthogonal matrix */
427: if (level<nlevels || (level==nlevels && s<size)) {
428: PetscCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&l,&n,A,&l,tau,work,&lwork,&info));
429: SlepcCheckLapackInfo("geqrf",info);
430: PetscCall(PetscArraycpy(QQ+(level-1)*n*lda,A,n*lda));
431: PetscCallBLAS("LAPACKorgqr",LAPACKorgqr_(&l,&n,&n,QQ+(level-1)*n*lda,&l,tau,work,&lwork,&info));
432: SlepcCheckLapackInfo("orgqr",info);
433: for (j=0;j<n;j++) {
434: for (i=j+1;i<n;i++) A[i+j*lda] = 0.0;
435: }
436: } else if (level==nlevels) { /* only one triangular matrix, set Q=I */
437: PetscCall(PetscArrayzero(QQ+(level-1)*n*lda,n*lda));
438: for (j=0;j<n;j++) QQ[j+j*lda+(level-1)*n*lda] = 1.0;
439: }
440: }
442: /* Extract R */
443: if (R) {
444: for (j=0;j<n;j++) {
445: for (i=0;i<=j;i++) R[i+j*ldr] = A[i+j*lda];
446: for (i=j+1;i<n;i++) R[i+j*ldr] = 0.0;
447: }
448: }
450: /* Accumulate orthogonal matrices */
451: for (level=1;level<=nlevels;level++) {
452: plevel = PetscPowInt(2,level);
453: PetscCall(PetscMPIIntCast(plevel*PetscFloorReal(rank/(PetscReal)plevel)+(rank+PetscPowInt(2,level-1))%plevel,&s));
454: Qhalf = (rank<s)? QQ+(level-1)*n*lda: QQ+(level-1)*n*lda+n;
455: if (level<nlevels) {
456: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&l,&n,&n,&one,QQ+level*n*lda,&l,Qhalf,&l,&zero,C,&l));
457: PetscCall(PetscArraycpy(QQ+level*n*lda,C,n*lda));
458: } else {
459: for (i=0;i<m/l;i++) {
460: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&l,&n,&n,&one,Q+i*l,&ldq,Qhalf,&l,&zero,C,&l));
461: for (j=0;j<n;j++) PetscCall(PetscArraycpy(Q+i*l+j*ldq,C+j*l,l));
462: }
463: sz = m%l;
464: if (sz) {
465: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&sz,&n,&n,&one,Q+(m/l)*l,&ldq,Qhalf,&l,&zero,C,&l));
466: for (j=0;j<n;j++) PetscCall(PetscArraycpy(Q+(m/l)*l+j*ldq,C+j*l,sz));
467: }
468: }
469: }
471: PetscCallMPI(MPI_Type_free(&tmat));
472: }
474: PetscCall(PetscLogFlops(3.0*m*n*n));
475: PetscCall(PetscFPTrapPop());
476: PetscFunctionReturn(PETSC_SUCCESS);
477: }
479: /*
480: Reduction operation to compute [~,Rout]=qr([Rin1;Rin2]) in the TSQR algorithm;
481: all matrices are upper triangular stored in packed format
482: */
483: SLEPC_EXTERN void MPIAPI SlepcGivensPacked(void *in,void *inout,PetscMPIInt *len,MPI_Datatype *datatype)
484: {
485: PetscBLASInt n,i,j,k,one=1;
486: PetscMPIInt tsize;
487: PetscScalar v,s,*R2=(PetscScalar*)in,*R1=(PetscScalar*)inout;
488: PetscReal c;
490: PetscFunctionBegin;
491: PetscCallMPIAbort(PETSC_COMM_SELF,MPI_Type_size(*datatype,&tsize)); /* we assume len=1 */
492: tsize /= sizeof(PetscScalar);
493: n = (-1+(PetscBLASInt)PetscSqrtReal(1+8*tsize))/2;
494: for (j=0;j<n;j++) {
495: for (i=0;i<=j;i++) {
496: LAPACKlartg_(R1+(2*n-j-1)*j/2+j,R2+(2*n-i-1)*i/2+j,&c,&s,&v);
497: R1[(2*n-j-1)*j/2+j] = v;
498: k = n-j-1;
499: if (k) BLASrot_(&k,R1+(2*n-j-1)*j/2+j+1,&one,R2+(2*n-i-1)*i/2+j+1,&one,&c,&s);
500: }
501: }
502: PetscFunctionReturnVoid();
503: }
505: /*
506: Computes the R factor of the QR factorization of an mxn matrix via parallel TSQR
507: */
508: PetscErrorCode BVOrthogonalize_LAPACK_TSQR_OnlyR(BV bv,PetscInt m_,PetscInt n_,PetscScalar *Q,PetscInt ldq_,PetscScalar *R,PetscInt ldr)
509: {
510: PetscInt worklen;
511: PetscBLASInt m,n,ldq,i,j,s,nb,lwork,info;
512: PetscScalar *tau,*work,*A=NULL,*R1=NULL,*R2=NULL;
513: PetscMPIInt size,count;
514: MPI_Datatype tmat;
516: PetscFunctionBegin;
517: PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
518: PetscCall(PetscBLASIntCast(m_,&m));
519: PetscCall(PetscBLASIntCast(n_,&n));
520: PetscCall(PetscBLASIntCast(ldq_,&ldq));
521: nb = 16;
522: s = n+n*(n-1)/2; /* length of packed triangular matrix */
523: PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)bv),&size));
524: worklen = n+n*nb+2*s+ldq*n;
525: PetscCall(BVAllocateWork_Private(bv,worklen));
526: tau = bv->work;
527: work = bv->work+n;
528: R1 = bv->work+n+n*nb;
529: R2 = bv->work+n+n*nb+s;
530: A = bv->work+n+n*nb+2*s;
531: PetscCall(PetscBLASIntCast(n*nb,&lwork));
532: PetscCall(PetscArraycpy(A,Q,ldq*n));
534: /* Compute QR */
535: PetscCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&m,&n,A,&ldq,tau,work,&lwork,&info));
536: SlepcCheckLapackInfo("geqrf",info);
538: if (size==1) {
539: /* Extract R */
540: for (j=0;j<n;j++) {
541: for (i=0;i<=PetscMin(j,m-1);i++) R[i+j*ldr] = A[i+j*ldq];
542: for (i=PetscMin(j,m-1)+1;i<n;i++) R[i+j*ldr] = 0.0;
543: }
544: } else {
545: /* Use MPI reduction operation to obtain global R */
546: PetscCall(PetscMPIIntCast(s,&count));
547: PetscCallMPI(MPI_Type_contiguous(count,MPIU_SCALAR,&tmat));
548: PetscCallMPI(MPI_Type_commit(&tmat));
549: for (i=0;i<n;i++) {
550: for (j=i;j<n;j++) R1[(2*n-i-1)*i/2+j] = (i<m)?A[i+j*ldq]:0.0;
551: }
552: PetscCallMPI(MPIU_Allreduce(R1,R2,1,tmat,MPIU_TSQR,PetscObjectComm((PetscObject)bv)));
553: for (i=0;i<n;i++) {
554: for (j=0;j<i;j++) R[i+j*ldr] = 0.0;
555: for (j=i;j<n;j++) R[i+j*ldr] = R2[(2*n-i-1)*i/2+j];
556: }
557: PetscCallMPI(MPI_Type_free(&tmat));
558: }
560: PetscCall(PetscLogFlops(3.0*m*n*n));
561: PetscCall(PetscFPTrapPop());
562: PetscFunctionReturn(PETSC_SUCCESS);
563: }