Fast Auxiliary Space Preconditioning 2.7.7 Aug/28/2022
Loading...
Searching...
No Matches
BlaSparseBSR.c
Go to the documentation of this file.
1
15#include <math.h>
16
17#ifdef _OPENMP
18#include <omp.h>
19#endif
20
21#include "fasp.h"
22#include "fasp_functs.h"
23
24static double dense_matrix_max_internal(double* A, int m, int n);
25static void generate_S_theta_bsr(dBSRmat* A, iCSRmat* S, REAL theta);
26
27/*---------------------------------*/
28/*-- Public Functions --*/
29/*---------------------------------*/
30
49 const INT ROW, const INT COL, const INT NNZ, const INT nb, const INT storage_manner)
50{
51 dBSRmat A;
52
53 if (ROW > 0) {
54 A.IA = (INT*)fasp_mem_calloc(ROW + 1, sizeof(INT));
55 } else {
56 A.IA = NULL;
57 }
58
59 if (NNZ > 0) {
60 A.JA = (INT*)fasp_mem_calloc(NNZ, sizeof(INT));
61 } else {
62 A.JA = NULL;
63 }
64
65 if (nb > 0 && NNZ > 0) {
66 A.val = (REAL*)fasp_mem_calloc(NNZ * nb * nb, sizeof(REAL));
67 } else {
68 A.val = NULL;
69 }
70
71 A.storage_manner = storage_manner;
72 A.ROW = ROW;
73 A.COL = COL;
74 A.NNZ = NNZ;
75 A.nb = nb;
76
77 return A;
78}
79
96void fasp_dbsr_alloc(const INT ROW,
97 const INT COL,
98 const INT NNZ,
99 const INT nb,
100 const INT storage_manner,
101 dBSRmat* A)
102{
103 if (ROW > 0) {
104 A->IA = (INT*)fasp_mem_calloc(ROW + 1, sizeof(INT));
105 } else {
106 A->IA = NULL;
107 }
108
109 if (NNZ > 0) {
110 A->JA = (INT*)fasp_mem_calloc(NNZ, sizeof(INT));
111 } else {
112 A->JA = NULL;
113 }
114
115 if (nb > 0) {
116 A->val = (REAL*)fasp_mem_calloc(NNZ * nb * nb, sizeof(REAL));
117 } else {
118 A->val = NULL;
119 }
120
121 A->storage_manner = storage_manner;
122 A->ROW = ROW;
123 A->COL = COL;
124 A->NNZ = NNZ;
125 A->nb = nb;
126
127 return;
128}
129
141{
142 if (A == NULL) return;
143
144 fasp_mem_free(A->IA);
145 A->IA = NULL;
146 fasp_mem_free(A->JA);
147 A->JA = NULL;
148 fasp_mem_free(A->val);
149 A->val = NULL;
150
151 A->ROW = 0;
152 A->COL = 0;
153 A->NNZ = 0;
154 A->nb = 0;
155 A->storage_manner = 0;
156}
157
169void fasp_dbsr_cp(const dBSRmat* A, dBSRmat* B)
170{
171 B->ROW = A->ROW;
172 B->COL = A->COL;
173 B->NNZ = A->NNZ;
174 B->nb = A->nb;
176
177 memcpy(B->IA, A->IA, (A->ROW + 1) * sizeof(INT));
178 memcpy(B->JA, A->JA, (A->NNZ) * sizeof(INT));
179 memcpy(B->val, A->val, (A->NNZ) * (A->nb) * (A->nb) * sizeof(REAL));
180}
181
198{
199 const INT row = A->ROW;
200 const INT nnz = A->NNZ;
201 const INT nb = A->nb, nb2 = nb * nb;
202
203 INT i, j, k;
204 INT ibegin, iend = A->IA[0];
205 SHORT status = FASP_SUCCESS;
206 k = 0;
207 for (i = 0; i < row; ++i) {
208 ibegin = iend;
209 iend = A->IA[i + 1];
210 for (j = ibegin; j < iend; ++j)
211 if (dense_matrix_max_internal(&A->val[j * nb2], nb, nb) > dtol ||
212 i == A->JA[j]) {
213 A->JA[k] = A->JA[j];
214 // A->val[k] = A->val[j];
215 memcpy(A->val + k * nb2, A->val + j * nb2, (nb2) * sizeof(REAL));
216 ++k;
217 } /* end of j */
218 A->IA[i + 1] = k;
219 } /* end of i */
220
221 if (k <= nnz) {
222 A->NNZ = k;
223 A->JA = (INT*)fasp_mem_realloc(A->JA, k * sizeof(INT));
224 A->val = (REAL*)fasp_mem_realloc(A->val, k * nb2 * sizeof(REAL));
225 } else {
226 printf("### WARNING: Size of compressed matrix is bigger than original!\n");
227 status = ERROR_UNKNOWN;
228 }
229
230 return (status);
231}
232
247{
248 const INT n = A->ROW, m = A->COL, nnz = A->NNZ, nb = A->nb;
249
250 INT status = FASP_SUCCESS;
251 INT i, j, k, p, inb, jnb, nb2;
252
253 AT->ROW = m;
254 AT->COL = n;
255 AT->NNZ = nnz;
256 AT->nb = nb;
258
259 AT->IA = (INT*)fasp_mem_calloc(m + 1, sizeof(INT));
260 AT->JA = (INT*)fasp_mem_calloc(nnz, sizeof(INT));
261 nb2 = nb * nb;
262
263 if (A->val) {
264 AT->val = (REAL*)fasp_mem_calloc(nnz * nb2, sizeof(REAL));
265 } else {
266 AT->val = NULL;
267 }
268
269 // first pass: find the number of nonzeros in the first m-1 columns of A
270 // Note: these numbers are stored in the array AT.IA from 1 to m-1
271 fasp_iarray_set(m + 1, AT->IA, 0);
272
273 for (j = 0; j < nnz; ++j) {
274 i = A->JA[j]; // column number of A = row number of A'
275 if (i < m - 1) AT->IA[i + 2]++;
276 }
277
278 for (i = 2; i <= m; ++i) AT->IA[i] += AT->IA[i - 1];
279
280 // second pass: form A'
281 if (A->val) {
282 for (i = 0; i < n; ++i) {
283 INT ibegin = A->IA[i], iend1 = A->IA[i + 1];
284 for (p = ibegin; p < iend1; p++) {
285 j = A->JA[p] + 1;
286 k = AT->IA[j];
287 AT->JA[k] = i;
288 for (inb = 0; inb < nb; inb++)
289 for (jnb = 0; jnb < nb; jnb++)
290 AT->val[nb2 * k + inb * nb + jnb] =
291 A->val[nb2 * p + jnb * nb + inb];
292 AT->IA[j] = k + 1;
293 } // end for p
294 } // end for i
295
296 } else {
297 for (i = 0; i < n; ++i) {
298 INT ibegin = A->IA[i], iend1 = A->IA[i + 1];
299 for (p = ibegin; p < iend1; p++) {
300 j = A->JA[p] + 1;
301 k = AT->IA[j];
302 AT->JA[k] = i;
303 AT->IA[j] = k + 1;
304 } // end for p
305 } // end of i
306
307 } // end if
308
309 return (status);
310}
311
333 const INT* Is,
334 const INT* Js,
335 const INT m,
336 const INT n,
337 dBSRmat* B)
338{
339 INT status = FASP_SUCCESS;
340 INT i, j, k, nnz = 0;
341 INT* col_flag;
342 SHORT use_openmp = FALSE;
343
344 const INT nb = A->nb;
345 const INT nb2 = nb * nb;
346
347#ifdef _OPENMP
348 INT myid, mybegin, stride_i, myend, nthreads;
349 if (n > OPENMP_HOLDS) {
350 use_openmp = TRUE;
351 nthreads = fasp_get_num_threads();
352 }
353#endif
354
355 // create colum flags
356 col_flag = (INT*)fasp_mem_calloc(A->COL, sizeof(INT));
357
358 B->ROW = m;
359 B->COL = n;
360 B->nb = nb;
362
363 B->IA = (INT*)fasp_mem_calloc(m + 1, sizeof(INT));
364
365 if (use_openmp) {
366#ifdef _OPENMP
367 stride_i = n / nthreads;
368#pragma omp parallel private(myid, mybegin, myend, i) num_threads(nthreads)
369 {
370 myid = omp_get_thread_num();
371 mybegin = myid * stride_i;
372 if (myid < nthreads - 1)
373 myend = mybegin + stride_i;
374 else
375 myend = n;
376 for (i = mybegin; i < myend; ++i) {
377 col_flag[Js[i]] = i + 1;
378 }
379 }
380#endif
381 } else {
382 for (i = 0; i < n; ++i) col_flag[Js[i]] = i + 1;
383 }
384
385 // first pass: count nonzeros for sub matrix
386 B->IA[0] = 0;
387 for (i = 0; i < m; ++i) {
388 for (k = A->IA[Is[i]]; k < A->IA[Is[i] + 1]; ++k) {
389 j = A->JA[k];
390 if (col_flag[j] > 0) nnz++;
391 } /* end for k */
392 B->IA[i + 1] = nnz;
393 } /* end for i */
394 B->NNZ = nnz;
395
396 // allocate
397 B->JA = (INT*)fasp_mem_calloc(nnz, sizeof(INT));
398 B->val = (REAL*)fasp_mem_calloc(nnz * nb2, sizeof(REAL));
399
400 // second pass: copy data to B
401 nnz = 0;
402 for (i = 0; i < m; ++i) {
403 for (k = A->IA[Is[i]]; k < A->IA[Is[i] + 1]; ++k) {
404 j = A->JA[k];
405 if (col_flag[j] > 0) {
406 B->JA[nnz] = col_flag[j] - 1;
407 memcpy(B->val + nnz * nb2, A->val + k * nb2, nb2 * sizeof(REAL));
408 nnz++;
409 }
410 } /* end for k */
411 } /* end for i */
412
413 fasp_mem_free(col_flag);
414 col_flag = NULL;
415
416 return (status);
417}
418
436{
437 SHORT status = FASP_SUCCESS;
438 const INT num_rowsA = A->ROW;
439 const INT num_colsA = A->COL;
440 const INT nb = A->nb;
441 const INT nb2 = nb * nb;
442
443 const INT* A_i = A->IA;
444 INT* A_j = A->JA;
445 REAL* A_data = A->val;
446
447 INT i, j, tempi, row_size;
448
449#ifdef _OPENMP
450 // variables for OpenMP
451 INT myid, mybegin, myend, ibegin, iend;
452 INT nthreads = fasp_get_num_threads();
453#endif
454
455 /* the matrix should be square */
456 if (num_rowsA != num_colsA) return ERROR_INPUT_PAR;
457
458#ifdef _OPENMP
459 if (num_rowsA > OPENMP_HOLDS) {
460 REAL* tempd = (REAL*)fasp_mem_calloc(nb2 * nthreads, sizeof(REAL));
461#pragma omp parallel for private(myid, mybegin, myend, i, j, tempi, ibegin, iend)
462 for (myid = 0; myid < nthreads; myid++) {
463 fasp_get_start_end(myid, nthreads, num_rowsA, &mybegin, &myend);
464 for (i = mybegin; i < myend; i++) {
465 ibegin = A_i[i + 1];
466 iend = A_i[i];
467 for (j = ibegin; j < iend; j++) {
468 if (A_j[j] == i) {
469 if (j != ibegin) {
470 // swap index
471 tempi = A_j[ibegin];
472 A_j[ibegin] = A_j[j];
473 A_j[j] = tempi;
474 // swap block
475 memcpy(tempd + myid * nb2, A_data + ibegin * nb2,
476 (nb2) * sizeof(REAL));
477 memcpy(A_data + ibegin * nb2, A_data + j * nb2,
478 (nb2) * sizeof(REAL));
479 memcpy(A_data + j * nb2, tempd + myid * nb2,
480 (nb2) * sizeof(REAL));
481 }
482 break;
483 }
484 /* diagonal element is missing */
485 if (j == iend - 1) {
486 status = -2;
487 break;
488 }
489 }
490 }
491 }
492 fasp_mem_free(tempd);
493 tempd = NULL;
494 } else {
495#endif
496 REAL* tempd = (REAL*)fasp_mem_calloc(nb2, sizeof(REAL));
497 for (i = 0; i < num_rowsA; i++) {
498 row_size = A_i[i + 1] - A_i[i];
499 for (j = 0; j < row_size; j++) {
500 if (A_j[j] == i) {
501 if (j != 0) {
502 // swap index
503 tempi = A_j[0];
504 A_j[0] = A_j[j];
505 A_j[j] = tempi;
506 // swap block
507 memcpy(tempd, A_data, (nb2) * sizeof(REAL));
508 memcpy(A_data, A_data + j * nb2, (nb2) * sizeof(REAL));
509 memcpy(A_data + j * nb2, tempd, (nb2) * sizeof(REAL));
510 }
511 break;
512 }
513 /* diagonal element is missing */
514 if (j == row_size - 1) return -2;
515 }
516 A_j += row_size;
517 A_data += row_size * nb2;
518 }
519 fasp_mem_free(tempd);
520 tempd = NULL;
521#ifdef _OPENMP
522 }
523#endif
524
525 if (status < 0)
526 return status;
527 else
528 return FASP_SUCCESS;
529}
530
544{
545 // members of A
546 const INT ROW = A->ROW;
547 const INT nb = A->nb;
548 const INT nb2 = nb * nb;
549 const INT size = ROW * nb2;
550 const INT* IA = A->IA;
551 const INT* JA = A->JA;
552 REAL* val = A->val;
553
554 dvector diaginv;
555
556 INT i, k;
557
558 // Variables for OpenMP
559 SHORT nthreads = 1, use_openmp = FALSE;
560 INT myid, mybegin, myend;
561
562#ifdef _OPENMP
563 if (ROW > OPENMP_HOLDS) {
564 use_openmp = TRUE;
565 nthreads = fasp_get_num_threads();
566 }
567#endif
568
569 // allocate memory
570 diaginv.row = size;
571 diaginv.val = (REAL*)fasp_mem_calloc(size, sizeof(REAL));
572
573 // get all the diagonal sub-blocks
574 if (use_openmp) {
575#ifdef _OPENMP
576#pragma omp parallel for private(myid, i, mybegin, myend, k)
577#endif
578 for (myid = 0; myid < nthreads; myid++) {
579 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
580 for (i = mybegin; i < myend; ++i) {
581 for (k = IA[i]; k < IA[i + 1]; ++k) {
582 if (JA[k] == i)
583 memcpy(diaginv.val + i * nb2, val + k * nb2,
584 nb2 * sizeof(REAL));
585 }
586 }
587 }
588 } else {
589 for (i = 0; i < ROW; ++i) {
590 for (k = IA[i]; k < IA[i + 1]; ++k) {
591 if (JA[k] == i)
592 memcpy(diaginv.val + i * nb2, val + k * nb2, nb2 * sizeof(REAL));
593 }
594 }
595 }
596 // compute the inverses of all the diagonal sub-blocks
597 if (use_openmp) {
598#ifdef _OPENMP
599#pragma omp parallel for private(myid, i, mybegin, myend)
600#endif
601 for (myid = 0; myid < nthreads; myid++) {
602 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
603 if (nb > 1) {
604 for (i = mybegin; i < myend; ++i) {
605 fasp_smat_inv(diaginv.val + i * nb2, nb);
606 }
607 } else {
608 for (i = mybegin; i < myend; ++i) {
609 // zero-diagonal should be tested previously
610 diaginv.val[i] = 1.0 / diaginv.val[i];
611 }
612 }
613 }
614 } else {
615 if (nb > 1) {
616 for (i = 0; i < ROW; ++i) {
617 fasp_smat_inv(diaginv.val + i * nb2, nb);
618 }
619 } else {
620 for (i = 0; i < ROW; ++i) {
621 // zero-diagonal should be tested previously
622 diaginv.val[i] = 1.0 / diaginv.val[i];
623 }
624 }
625 }
626
627 return (diaginv);
628}
629
646{
647 // members of A
648 const INT ROW = A->ROW;
649 const INT COL = A->COL;
650 const INT NNZ = A->NNZ;
651 const INT nb = A->nb;
652 const INT nb2 = nb * nb;
653 const INT size = ROW * nb2;
654 const INT* IA = A->IA;
655 const INT* JA = A->JA;
656 REAL* val = A->val;
657
658 // create a dBSRmat B
659 dBSRmat B = fasp_dbsr_create(ROW, COL, NNZ, nb, 0);
660 INT* IAb = B.IA;
661 INT* JAb = B.JA;
662 REAL* valb = B.val;
663
664 INT i, j, k, m, l;
665
666 // variables for OpenMP
667 SHORT nthreads = 1, use_openmp = FALSE;
668 INT myid, mybegin, myend;
669
670 // allocate memory
671 REAL* diaginv = (REAL*)fasp_mem_calloc(size, sizeof(REAL));
672
673 if (IAb)
674 memcpy(IAb, IA, (ROW + 1) * sizeof(INT));
675 else
676 goto FINISHED;
677
678 if (JAb)
679 memcpy(JAb, JA, NNZ * sizeof(INT));
680 else
681 goto FINISHED;
682
683#ifdef _OPENMP
684 if (ROW > OPENMP_HOLDS) {
685 use_openmp = TRUE;
686 nthreads = fasp_get_num_threads();
687 }
688#endif
689
690 // get all the diagonal sub-blocks
691 if (use_openmp) {
692#ifdef _OPENMP
693#pragma omp parallel for private(myid, i, mybegin, myend, k)
694#endif
695 for (myid = 0; myid < nthreads; myid++) {
696 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
697 for (i = mybegin; i < myend; ++i) {
698 for (k = IA[i]; k < IA[i + 1]; ++k) {
699 if (JA[k] == i)
700 memcpy(diaginv + i * nb2, val + k * nb2, nb2 * sizeof(REAL));
701 }
702 }
703 }
704 } else {
705 for (i = 0; i < ROW; ++i) {
706 for (k = IA[i]; k < IA[i + 1]; ++k) {
707 if (JA[k] == i)
708 memcpy(diaginv + i * nb2, val + k * nb2, nb2 * sizeof(REAL));
709 }
710 }
711 }
712
713 // compute the inverses of all the diagonal sub-blocks
714 if (use_openmp) {
715#ifdef _OPENMP
716#pragma omp parallel for private(myid, i, mybegin, myend)
717#endif
718 for (myid = 0; myid < nthreads; myid++) {
719 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
720 if (nb > 1) {
721 for (i = mybegin; i < myend; ++i) {
722 fasp_smat_inv(diaginv + i * nb2, nb);
723 }
724 } else {
725 for (i = mybegin; i < myend; ++i) {
726 // zero-diagonal should be tested previously
727 diaginv[i] = 1.0 / diaginv[i];
728 }
729 }
730 }
731 } else {
732 if (nb > 1) {
733 for (i = 0; i < ROW; ++i) {
734 fasp_smat_inv(diaginv + i * nb2, nb);
735 }
736 } else {
737 for (i = 0; i < ROW; ++i) {
738 // zero-diagonal should be tested previously
739 diaginv[i] = 1.0 / diaginv[i];
740 }
741 }
742 }
743
744 // compute D^{-1}*A
745 if (use_openmp) {
746#ifdef _OPENMP
747#pragma omp parallel for private(myid, mybegin, myend, i, k, m, j, l)
748#endif
749 for (myid = 0; myid < nthreads; myid++) {
750 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
751 for (i = mybegin; i < myend; ++i) {
752 for (k = IA[i]; k < IA[i + 1]; ++k) {
753 m = k * nb2;
754 j = JA[k];
755 if (j == i) {
756 // Identity sub-block
757 memset(valb + m, 0X0, nb2 * sizeof(REAL));
758 for (l = 0; l < nb; l++) valb[m + l * nb + l] = 1.0;
759 } else {
760 fasp_blas_smat_mul(diaginv + i * nb2, val + m, valb + m, nb);
761 }
762 }
763 }
764 }
765 } else {
766 for (i = 0; i < ROW; ++i) {
767 for (k = IA[i]; k < IA[i + 1]; ++k) {
768 m = k * nb2;
769 j = JA[k];
770 if (j == i) {
771 // Identity sub-block
772 memset(valb + m, 0X0, nb2 * sizeof(REAL));
773 for (l = 0; l < nb; l++) valb[m + l * nb + l] = 1.0;
774 } else {
775 fasp_blas_smat_mul(diaginv + i * nb2, val + m, valb + m, nb);
776 }
777 }
778 }
779 }
780
781FINISHED:
782 fasp_mem_free(diaginv);
783 diaginv = NULL;
784
785 return (B);
786}
787
804{
805 // members of A
806 const INT ROW = A->ROW;
807 const INT COL = A->COL;
808 const INT NNZ = A->NNZ;
809 const INT nb = A->nb, nbp1 = nb + 1;
810 const INT nb2 = nb * nb;
811
812 INT* IA = A->IA;
813 INT* JA = A->JA;
814 REAL* val = A->val;
815
816 dBSRmat B;
817 INT* IAb = NULL;
818 INT* JAb = NULL;
819 REAL* valb = NULL;
820
821 INT i, k, m, l, ibegin, iend;
822
823 // Variables for OpenMP
824 SHORT nthreads = 1, use_openmp = FALSE;
825 INT myid, mybegin, myend;
826
827#ifdef _OPENMP
828 if (ROW > OPENMP_HOLDS) {
829 use_openmp = TRUE;
830 nthreads = fasp_get_num_threads();
831 }
832#endif
833
834 // Create a dBSRmat 'B'
835 B = fasp_dbsr_create(ROW, COL, NNZ, nb, 0);
836 IAb = B.IA;
837 JAb = B.JA;
838 valb = B.val;
839
840 if (IAb)
841 memcpy(IAb, IA, (ROW + 1) * sizeof(INT));
842 else
843 goto FINISHED;
844
845 if (JAb)
846 memcpy(JAb, JA, NNZ * sizeof(INT));
847 else
848 goto FINISHED;
849
850 // compute D^{-1}*A
851 if (use_openmp) {
852#ifdef _OPENMP
853#pragma omp parallel for private(myid, i, mybegin, myend, ibegin, iend, k, m, l)
854#endif
855 for (myid = 0; myid < nthreads; myid++) {
856 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
857 for (i = mybegin; i < myend; ++i) {
858 ibegin = IA[i];
859 iend = IA[i + 1];
860 for (k = ibegin; k < iend; ++k) {
861 m = k * nb2;
862 if (JA[k] != i) {
863 fasp_blas_smat_mul(diaginv + i * nb2, val + m, valb + m, nb);
864 } else {
865 // Identity sub-block
866 memset(valb + m, 0X0, nb2 * sizeof(REAL));
867 for (l = 0; l < nb; l++) valb[m + l * nbp1] = 1.0;
868 }
869 }
870 }
871 }
872 } else {
873 // compute D^{-1}*A
874 for (i = 0; i < ROW; ++i) {
875 ibegin = IA[i];
876 iend = IA[i + 1];
877 for (k = ibegin; k < iend; ++k) {
878 m = k * nb2;
879 if (JA[k] != i) {
880 fasp_blas_smat_mul(diaginv + i * nb2, val + m, valb + m, nb);
881 } else {
882 // Identity sub-block
883 memset(valb + m, 0X0, nb2 * sizeof(REAL));
884 for (l = 0; l < nb; l++) valb[m + l * nbp1] = 1.0;
885 }
886 }
887 }
888 }
889
890FINISHED:
891 return (B);
892}
893
912{
913 dBSRmat B;
914 // members of A
915 INT ROW = A->ROW;
916 INT ROW_plus_one = ROW + 1;
917 INT COL = A->COL;
918 INT NNZ = A->NNZ;
919 INT nb = A->nb;
920 INT* IA = A->IA;
921 INT* JA = A->JA;
922 REAL* val = A->val;
923
924 INT* IAb = NULL;
925 INT* JAb = NULL;
926 REAL* valb = NULL;
927
928 INT nb2 = nb * nb;
929 INT i, j, k, m;
930
931 SHORT use_openmp = FALSE;
932
933#ifdef _OPENMP
934 INT myid, mybegin, myend, stride_i, nthreads = 1;
935 if (ROW > OPENMP_HOLDS) {
936 use_openmp = TRUE;
937 nthreads = fasp_get_num_threads();
938 }
939#endif
940
941 // Create a dBSRmat 'B'
942 B = fasp_dbsr_create(ROW, COL, NNZ, nb, 0);
943
944 IAb = B.IA;
945 JAb = B.JA;
946 valb = B.val;
947
948 fasp_iarray_cp(ROW_plus_one, IA, IAb);
949 fasp_iarray_cp(NNZ, JA, JAb);
950
951 switch (nb) {
952
953 case 2:
954 // main loop
955 if (use_openmp) {
956#ifdef _OPENMP
957 stride_i = ROW / nthreads;
958#pragma omp parallel private(myid, mybegin, myend, i, k, m, j) num_threads(nthreads)
959 {
960 myid = omp_get_thread_num();
961 mybegin = myid * stride_i;
962 if (myid < nthreads - 1)
963 myend = mybegin + stride_i;
964 else
965 myend = ROW;
966 for (i = mybegin; i < myend; ++i) {
967 // get the diagonal sub-blocks
968
969 k = IA[i];
970 m = k * 4;
971 memcpy(diaginv + i * 4, val + m, 4 * sizeof(REAL));
972 fasp_smat_identity_nc2(valb + m);
973
974 // compute the inverses of the diagonal sub-blocks
975 fasp_smat_inv_nc2(diaginv + i * 4);
976 // compute D^{-1}*A
977 for (k = IA[i] + 1; k < IA[i + 1]; ++k) {
978 m = k * 4;
979 j = JA[k];
980 fasp_blas_smat_mul_nc2(diaginv + i * 4, val + m, valb + m);
981 }
982 } // end of main loop
983 }
984#endif
985 } else {
986 // main loop
987 for (i = 0; i < ROW; ++i) {
988 // get the diagonal sub-blocks
989 k = IA[i];
990 m = k * 4;
991 memcpy(diaginv + i * 4, val + m, 4 * sizeof(REAL));
992 fasp_smat_identity_nc2(valb + m);
993
994 // compute the inverses of the diagonal sub-blocks
995 fasp_smat_inv_nc2(diaginv + i * 4);
996 // compute D^{-1}*A
997 for (k = IA[i] + 1; k < IA[i + 1]; ++k) {
998 m = k * 4;
999 fasp_blas_smat_mul_nc2(diaginv + i * 4, val + m, valb + m);
1000 }
1001 } // end of main loop
1002 }
1003
1004 break;
1005
1006 case 3:
1007 // main loop
1008 if (use_openmp) {
1009#ifdef _OPENMP
1010 stride_i = ROW / nthreads;
1011#pragma omp parallel private(myid, mybegin, myend, i, k, m, j) num_threads(nthreads)
1012 {
1013 myid = omp_get_thread_num();
1014 mybegin = myid * stride_i;
1015 if (myid < nthreads - 1)
1016 myend = mybegin + stride_i;
1017 else
1018 myend = ROW;
1019 for (i = mybegin; i < myend; ++i) {
1020 // get the diagonal sub-blocks
1021 for (k = IA[i]; k < IA[i + 1]; ++k) {
1022 if (JA[k] == i) {
1023 m = k * 9;
1024 memcpy(diaginv + i * 9, val + m, 9 * sizeof(REAL));
1025 fasp_smat_identity_nc3(valb + m);
1026 }
1027 }
1028 // compute the inverses of the diagonal sub-blocks
1029 fasp_smat_inv_nc3(diaginv + i * 9);
1030 // compute D^{-1}*A
1031 for (k = IA[i]; k < IA[i + 1]; ++k) {
1032 m = k * 9;
1033 j = JA[k];
1034 if (j != i)
1035 fasp_blas_smat_mul_nc3(diaginv + i * 9, val + m,
1036 valb + m);
1037 }
1038 } // end of main loop
1039 }
1040#endif
1041 }
1042
1043 else {
1044 for (i = 0; i < ROW; ++i) {
1045 // get the diagonal sub-blocks
1046 for (k = IA[i]; k < IA[i + 1]; ++k) {
1047 if (JA[k] == i) {
1048 m = k * 9;
1049 memcpy(diaginv + i * 9, val + m, 9 * sizeof(REAL));
1050 fasp_smat_identity_nc3(valb + m);
1051 }
1052 }
1053#if DEBUG_MODE > 0
1054 printf("### DEBUG: row, col = %d\n", i);
1055#endif
1056 // compute the inverses of the diagonal sub-blocks
1057 fasp_smat_inv_nc3(diaginv + i * 9);
1058
1059 // compute D^{-1}*A
1060 for (k = IA[i]; k < IA[i + 1]; ++k) {
1061 m = k * 9;
1062 j = JA[k];
1063 if (j != i)
1064 fasp_blas_smat_mul_nc3(diaginv + i * 9, val + m, valb + m);
1065 }
1066 } // end of main loop
1067 }
1068
1069 break;
1070
1071 case -5:
1072 // main loop
1073 if (use_openmp) {
1074#ifdef _OPENMP
1075 stride_i = ROW / nthreads;
1076#pragma omp parallel private(myid, mybegin, myend, i, k, m, j) num_threads(nthreads)
1077 {
1078 myid = omp_get_thread_num();
1079 mybegin = myid * stride_i;
1080 if (myid < nthreads - 1)
1081 myend = mybegin + stride_i;
1082 else
1083 myend = ROW;
1084 for (i = mybegin; i < myend; ++i) {
1085 // get the diagonal sub-blocks
1086 for (k = IA[i]; k < IA[i + 1]; ++k) {
1087 if (JA[k] == i) {
1088 m = k * 25;
1089 memcpy(diaginv + i * 25, val + m, 25 * sizeof(REAL));
1090 fasp_smat_identity_nc5(valb + m);
1091 }
1092 }
1093
1094 // compute the inverses of the diagonal sub-blocks
1095 fasp_smat_inv_nc5(diaginv + i * 25);
1096
1097 // compute D^{-1}*A
1098 for (k = IA[i]; k < IA[i + 1]; ++k) {
1099 m = k * 25;
1100 j = JA[k];
1101 if (j != i)
1102 fasp_blas_smat_mul_nc5(diaginv + i * 25, val + m,
1103 valb + m);
1104 }
1105 } // end of main loop
1106 }
1107#endif
1108 }
1109
1110 else {
1111
1112 for (i = 0; i < ROW; ++i) {
1113 // get the diagonal sub-blocks
1114 for (k = IA[i]; k < IA[i + 1]; ++k) {
1115 if (JA[k] == i) {
1116 m = k * 25;
1117 memcpy(diaginv + i * 25, val + m, 25 * sizeof(REAL));
1118 fasp_smat_identity_nc5(valb + m);
1119 }
1120 }
1121
1122 // compute the inverses of the diagonal sub-blocks
1123 // fasp_smat_inv_nc5(diaginv+i*25); // Not numerically stable!!!
1124 // --zcs 04/26/2021
1125 fasp_smat_invp_nc(diaginv + i * 25, 5);
1126
1127#if 0
1128 REAL aa[25], bb[25]; // for debug inverse of diag
1129 for (k = 0; k < 25; k++) bb[k] = diaginv[i * 25 + k]; // before inversion
1130 for (k = 0; k < 25; k++) aa[k] = diaginv[i * 25 + k]; // aftger inversion
1131
1132 printf("### DEBUG: Check inverse matrix...\n");
1133 printf("##----------------------------------------------\n");
1134 printf("## %12.5e %12.5e %12.5e %12.5e %12.5e\n",
1135 bb[0]* aa[0] + bb[1] * aa[5] + bb[2] * aa[10] + bb[3] * aa[15] + bb[4] * aa[20],
1136 bb[0]* aa[1] + bb[1] * aa[6] + bb[2] * aa[11] + bb[3] * aa[16] + bb[4] * aa[21],
1137 bb[0]* aa[2] + bb[1] * aa[7] + bb[2] * aa[12] + bb[3] * aa[17] + bb[4] * aa[22],
1138 bb[0]* aa[3] + bb[1] * aa[8] + bb[2] * aa[13] + bb[3] * aa[18] + bb[4] * aa[23],
1139 bb[0]* aa[4] + bb[1] * aa[9] + bb[3] * aa[14] + bb[3] * aa[19] + bb[4] * aa[24]);
1140 printf("## %12.5e %12.5e %12.5e %12.5e %12.5e\n",
1141 bb[5]* aa[0] + bb[6] * aa[5] + bb[7] * aa[10] + bb[8] * aa[15] + bb[9] * aa[20],
1142 bb[5]* aa[1] + bb[6] * aa[6] + bb[7] * aa[11] + bb[8] * aa[16] + bb[9] * aa[21],
1143 bb[5]* aa[2] + bb[6] * aa[7] + bb[7] * aa[12] + bb[8] * aa[17] + bb[9] * aa[22],
1144 bb[5]* aa[3] + bb[6] * aa[8] + bb[7] * aa[13] + bb[8] * aa[18] + bb[9] * aa[23],
1145 bb[5]* aa[4] + bb[6] * aa[9] + bb[7] * aa[14] + bb[8] * aa[19] + bb[9] * aa[24]);
1146 printf("## %12.5e %12.5e %12.5e %12.5e %12.5e\n",
1147 bb[10]* aa[0] + bb[11] * aa[5] + bb[12] * aa[10] + bb[13] * aa[15] + bb[14] * aa[20],
1148 bb[10]* aa[1] + bb[11] * aa[6] + bb[12] * aa[11] + bb[13] * aa[16] + bb[14] * aa[21],
1149 bb[10]* aa[2] + bb[11] * aa[7] + bb[12] * aa[12] + bb[13] * aa[17] + bb[14] * aa[22],
1150 bb[10]* aa[3] + bb[11] * aa[8] + bb[12] * aa[13] + bb[13] * aa[18] + bb[14] * aa[23],
1151 bb[10]* aa[4] + bb[11] * aa[9] + bb[12] * aa[14] + bb[13] * aa[19] + bb[14] * aa[24]);
1152 printf("## %12.5e %12.5e %12.5e %12.5e %12.5e\n",
1153 bb[15]* aa[0] + bb[16] * aa[5] + bb[17] * aa[10] + bb[18] * aa[15] + bb[19] * aa[20],
1154 bb[15]* aa[1] + bb[16] * aa[6] + bb[17] * aa[11] + bb[18] * aa[16] + bb[19] * aa[21],
1155 bb[15]* aa[2] + bb[16] * aa[7] + bb[17] * aa[12] + bb[18] * aa[17] + bb[19] * aa[22],
1156 bb[15]* aa[3] + bb[16] * aa[8] + bb[17] * aa[13] + bb[18] * aa[18] + bb[19] * aa[23],
1157 bb[15]* aa[4] + bb[16] * aa[9] + bb[17] * aa[14] + bb[18] * aa[19] + bb[19] * aa[24]);
1158 printf("## %12.5e %12.5e %12.5e %12.5e %12.5e\n",
1159 bb[20]* aa[0] + bb[21] * aa[5] + bb[22] * aa[10] + bb[23] * aa[15] + bb[24] * aa[20],
1160 bb[20]* aa[1] + bb[21] * aa[6] + bb[22] * aa[11] + bb[23] * aa[16] + bb[24] * aa[21],
1161 bb[20]* aa[2] + bb[21] * aa[7] + bb[22] * aa[12] + bb[23] * aa[17] + bb[24] * aa[22],
1162 bb[20]* aa[3] + bb[21] * aa[8] + bb[22] * aa[13] + bb[23] * aa[18] + bb[24] * aa[23],
1163 bb[20]* aa[4] + bb[21] * aa[9] + bb[22] * aa[14] + bb[23] * aa[19] + bb[24] * aa[24]);
1164 printf("##----------------------------------------------\n");
1165#endif
1166
1167 // compute D^{-1}*A
1168 for (k = IA[i]; k < IA[i + 1]; ++k) {
1169 m = k * 25;
1170 j = JA[k];
1171 if (j != i)
1172 fasp_blas_smat_mul_nc5(diaginv + i * 25, val + m, valb + m);
1173 }
1174 } // end of main loop
1175 }
1176
1177 break;
1178
1179 case -7:
1180 // main loop
1181 if (use_openmp) {
1182#ifdef _OPENMP
1183 stride_i = ROW / nthreads;
1184#pragma omp parallel private(myid, mybegin, myend, i, k, m, j) num_threads(nthreads)
1185 {
1186 myid = omp_get_thread_num();
1187 mybegin = myid * stride_i;
1188 if (myid < nthreads - 1)
1189 myend = mybegin + stride_i;
1190 else
1191 myend = ROW;
1192 for (i = mybegin; i < myend; ++i) {
1193 // get the diagonal sub-blocks
1194 for (k = IA[i]; k < IA[i + 1]; ++k) {
1195 if (JA[k] == i) {
1196 m = k * 49;
1197 memcpy(diaginv + i * 49, val + m, 49 * sizeof(REAL));
1198 fasp_smat_identity_nc7(valb + m);
1199 }
1200 }
1201
1202 // compute the inverses of the diagonal sub-blocks
1203 fasp_smat_inv_nc7(diaginv + i * 49);
1204
1205 // compute D^{-1}*A
1206 for (k = IA[i]; k < IA[i + 1]; ++k) {
1207 m = k * 49;
1208 j = JA[k];
1209 if (j != i)
1210 fasp_blas_smat_mul_nc7(diaginv + i * 49, val + m,
1211 valb + m);
1212 }
1213 } // end of main loop
1214 }
1215#endif
1216 }
1217
1218 else {
1219 for (i = 0; i < ROW; ++i) {
1220 // get the diagonal sub-blocks
1221 for (k = IA[i]; k < IA[i + 1]; ++k) {
1222 if (JA[k] == i) {
1223 m = k * 49;
1224 memcpy(diaginv + i * 49, val + m, 49 * sizeof(REAL));
1225 fasp_smat_identity_nc7(valb + m);
1226 }
1227 }
1228
1229 // compute the inverses of the diagonal sub-blocks
1230 // fasp_smat_inv_nc7(diaginv+i*49); // Not numerically stable!!!
1231 // --zcs 04/26/2021
1232 fasp_smat_invp_nc(diaginv + i * 49, 7);
1233
1234 // compute D^{-1}*A
1235 for (k = IA[i]; k < IA[i + 1]; ++k) {
1236 m = k * 49;
1237 j = JA[k];
1238 if (j != i)
1239 fasp_blas_smat_mul_nc7(diaginv + i * 49, val + m, valb + m);
1240 }
1241 } // end of main loop
1242 }
1243
1244 break;
1245
1246 default:
1247 // main loop
1248 if (use_openmp) {
1249#ifdef _OPENMP
1250 stride_i = ROW / nthreads;
1251#pragma omp parallel private(myid, mybegin, myend, i, k, m, j) num_threads(nthreads)
1252 {
1253 myid = omp_get_thread_num();
1254 mybegin = myid * stride_i;
1255 if (myid < nthreads - 1)
1256 myend = mybegin + stride_i;
1257 else
1258 myend = ROW;
1259 for (i = mybegin; i < myend; ++i) {
1260 // get the diagonal sub-blocks
1261 for (k = IA[i]; k < IA[i + 1]; ++k) {
1262 if (JA[k] == i) {
1263 m = k * nb2;
1264 memcpy(diaginv + i * nb2, val + m, nb2 * sizeof(REAL));
1265 fasp_smat_identity(valb + m, nb, nb2);
1266 }
1267 }
1268
1269 // compute the inverses of the diagonal sub-blocks
1270 fasp_smat_inv(diaginv + i * nb2, nb);
1271
1272 // compute D^{-1}*A
1273 for (k = IA[i]; k < IA[i + 1]; ++k) {
1274 m = k * nb2;
1275 j = JA[k];
1276 if (j != i)
1277 fasp_blas_smat_mul(diaginv + i * nb2, val + m, valb + m,
1278 nb);
1279 }
1280 } // end of main loop
1281 }
1282#endif
1283 } else {
1284 for (i = 0; i < ROW; ++i) {
1285 // get the diagonal sub-blocks
1286 for (k = IA[i]; k < IA[i + 1]; ++k) {
1287 if (JA[k] == i) {
1288 m = k * nb2;
1289 memcpy(diaginv + i * nb2, val + m, nb2 * sizeof(REAL));
1290 fasp_smat_identity(valb + m, nb, nb2);
1291 }
1292 }
1293
1294 // compute the inverses of the diagonal sub-blocks
1295 // fasp_smat_inv(diaginv+i*nb2, nb); // Not numerically stable!!!
1296 // --zcs 04/26/2021
1297 fasp_smat_invp_nc(diaginv + i * nb2, nb);
1298
1299 // compute D^{-1}*A
1300 for (k = IA[i]; k < IA[i + 1]; ++k) {
1301 m = k * nb2;
1302 j = JA[k];
1303 if (j != i)
1304 fasp_blas_smat_mul(diaginv + i * nb2, val + m, valb + m,
1305 nb);
1306 }
1307 } // end of main loop
1308 }
1309
1310 break;
1311 }
1312
1313 return (B);
1314}
1315
1336{
1337 // members of A
1338 const INT ROW = A->ROW;
1339 const INT COL = A->COL;
1340 const INT NNZ = A->NNZ;
1341 const INT nb = A->nb;
1342 const INT nb2 = nb * nb;
1343 const INT* IA = A->IA;
1344 const INT* JA = A->JA;
1345 REAL* val = A->val;
1346
1347 dBSRmat B;
1348 INT* IAb = NULL;
1349 INT* JAb = NULL;
1350 REAL* valb = NULL;
1351
1352 INT i, k, m;
1353 INT ibegin, iend;
1354
1355 // Variables for OpenMP
1356 SHORT nthreads = 1, use_openmp = FALSE;
1357 INT myid, mybegin, myend;
1358
1359#ifdef _OPENMP
1360 if (ROW > OPENMP_HOLDS) {
1361 use_openmp = TRUE;
1362 nthreads = fasp_get_num_threads();
1363 }
1364#endif
1365
1366 // Create a dBSRmat 'B'
1367 B = fasp_dbsr_create(ROW, COL, NNZ, nb, 0);
1368
1369 IAb = B.IA;
1370 JAb = B.JA;
1371 valb = B.val;
1372
1373 if (IAb)
1374 memcpy(IAb, IA, (ROW + 1) * sizeof(INT));
1375 else
1376 goto FINISHED;
1377
1378 if (JAb)
1379 memcpy(JAb, JA, NNZ * sizeof(INT));
1380 else
1381 goto FINISHED;
1382
1383 switch (nb) {
1384
1385 case 2:
1386 if (use_openmp) {
1387#ifdef _openmp
1388#pragma omp parallel for private(myid, mybegin, myend, i, ibegin, iend, m, k)
1389#endif
1390 for (myid = 0; myid < nthreads; myid++) {
1391 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
1392 for (i = mybegin; i < myend; ++i) {
1393 ibegin = IA[i];
1394 iend = IA[i + 1];
1395 // get the diagonal sub-blocks (It is the first block of each
1396 // row)
1397 m = ibegin * 4;
1398 memcpy(diaginv + i * 4, val + m, 4 * sizeof(REAL));
1399 fasp_smat_identity_nc2(valb + m);
1400
1401 // compute the inverses of the diagonal sub-blocks
1402 fasp_smat_inv_nc2(diaginv + i * 4); // fixed by Zheng Li
1403
1404 // compute D^{-1}*A
1405 for (k = ibegin + 1; k < iend; ++k) {
1406 m = k * 4;
1407 fasp_blas_smat_mul_nc2(diaginv + i * 4, val + m, valb + m);
1408 }
1409 }
1410 } // end of main loop
1411 } else {
1412 for (i = 0; i < ROW; ++i) {
1413 ibegin = IA[i];
1414 iend = IA[i + 1];
1415 // get the diagonal sub-blocks (It is the first block of each row)
1416 m = ibegin * 4;
1417 memcpy(diaginv + i * 4, val + m, 4 * sizeof(REAL));
1418 fasp_smat_identity_nc2(valb + m);
1419
1420 // compute the inverses of the diagonal sub-blocks
1421 fasp_smat_inv_nc2(diaginv + i * 4); // fixed by Zheng Li
1422
1423 // compute D^{-1}*A
1424 for (k = ibegin + 1; k < iend; ++k) {
1425 m = k * 4;
1426 fasp_blas_smat_mul_nc2(diaginv + i * 4, val + m, valb + m);
1427 }
1428 } // end of main loop
1429 }
1430
1431 break;
1432
1433 case 3:
1434 if (use_openmp) {
1435#ifdef _openmp
1436#pragma omp parallel for private(myid, mybegin, myend, i, ibegin, iend, m, k)
1437#endif
1438 for (myid = 0; myid < nthreads; myid++) {
1439 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
1440 for (i = mybegin; i < myend; ++i) {
1441 ibegin = IA[i];
1442 iend = IA[i + 1];
1443 // get the diagonal sub-blocks (It is the first block of each
1444 // row)
1445 m = ibegin * 9;
1446 memcpy(diaginv + i * 9, val + m, 9 * sizeof(REAL));
1447 fasp_smat_identity_nc3(valb + m);
1448 // compute the inverses of the diagonal sub-blocks
1449 fasp_smat_inv_nc3(diaginv + i * 9);
1450 // compute D^{-1}*A
1451 for (k = ibegin + 1; k < iend; ++k) {
1452 m = k * 9;
1453 fasp_blas_smat_mul_nc3(diaginv + i * 9, val + m, valb + m);
1454 }
1455 }
1456 } // end of main loop
1457 } else {
1458 for (i = 0; i < ROW; ++i) {
1459 ibegin = IA[i];
1460 iend = IA[i + 1];
1461 // get the diagonal sub-blocks (It is the first block of each row)
1462 m = ibegin * 9;
1463 memcpy(diaginv + i * 9, val + m, 9 * sizeof(REAL));
1464 fasp_smat_identity_nc3(valb + m);
1465
1466 // compute the inverses of the diagonal sub-blocks
1467 fasp_smat_inv_nc3(diaginv + i * 9);
1468
1469 // compute D^{-1}*A
1470 for (k = ibegin + 1; k < iend; ++k) {
1471 m = k * 9;
1472 fasp_blas_smat_mul_nc3(diaginv + i * 9, val + m, valb + m);
1473 }
1474 } // end of main loop
1475 }
1476
1477 break;
1478
1479 case 5:
1480 if (use_openmp) {
1481#ifdef _OPENMP
1482#pragma omp parallel for private(myid, mybegin, myend, i, ibegin, iend, m, k)
1483#endif
1484 for (myid = 0; myid < nthreads; myid++) {
1485 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
1486 for (i = mybegin; i < myend; ++i) {
1487 // get the diagonal sub-blocks
1488 ibegin = IA[i];
1489 iend = IA[i + 1];
1490 m = ibegin * 25;
1491 memcpy(diaginv + i * 25, val + m, 25 * sizeof(REAL));
1492 fasp_smat_identity_nc5(valb + m);
1493
1494 // compute the inverses of the diagonal sub-blocks
1495 fasp_smat_inv_nc5(diaginv + i * 25);
1496
1497 // compute D^{-1}*A
1498 for (k = ibegin + 1; k < iend; ++k) {
1499 m = k * 25;
1500 fasp_blas_smat_mul_nc5(diaginv + i * 25, val + m, valb + m);
1501 }
1502 }
1503 }
1504 } else {
1505 for (i = 0; i < ROW; ++i) {
1506 // get the diagonal sub-blocks
1507 ibegin = IA[i];
1508 iend = IA[i + 1];
1509 m = ibegin * 25;
1510 memcpy(diaginv + i * 25, val + m, 25 * sizeof(REAL));
1511 fasp_smat_identity_nc5(valb + m);
1512
1513 // compute the inverses of the diagonal sub-blocks
1514 fasp_smat_inv_nc5(diaginv + i * 25);
1515
1516 // compute D^{-1}*A
1517 for (k = ibegin + 1; k < iend; ++k) {
1518 m = k * 25;
1519 fasp_blas_smat_mul_nc5(diaginv + i * 25, val + m, valb + m);
1520 }
1521 } // end of main loop
1522 }
1523 break;
1524
1525 case 7:
1526 if (use_openmp) {
1527#ifdef _OPENMP
1528#pragma omp parallel for private(myid, i, mybegin, myend, ibegin, iend, m, k)
1529#endif
1530 for (myid = 0; myid < nthreads; myid++) {
1531 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
1532 for (i = mybegin; i < myend; ++i) {
1533 // get the diagonal sub-blocks
1534 ibegin = IA[i];
1535 iend = IA[i + 1];
1536 m = ibegin * 49;
1537 memcpy(diaginv + i * 49, val + m, 49 * sizeof(REAL));
1538 fasp_smat_identity_nc7(valb + m);
1539
1540 // compute the inverses of the diagonal sub-blocks
1541 fasp_smat_inv_nc7(diaginv + i * 49);
1542
1543 // compute D^{-1}*A
1544 for (k = ibegin + 1; k < iend; ++k) {
1545 m = k * 49;
1546 fasp_blas_smat_mul_nc7(diaginv + i * 49, val + m, valb + m);
1547 }
1548 }
1549 } // end of main loop
1550 } else {
1551 for (i = 0; i < ROW; ++i) {
1552 // get the diagonal sub-blocks
1553 ibegin = IA[i];
1554 iend = IA[i + 1];
1555 m = ibegin * 49;
1556 memcpy(diaginv + i * 49, val + m, 49 * sizeof(REAL));
1557 fasp_smat_identity_nc7(valb + m);
1558
1559 // compute the inverses of the diagonal sub-blocks
1560 fasp_smat_inv_nc7(diaginv + i * 49);
1561
1562 // compute D^{-1}*A
1563 for (k = ibegin + 1; k < iend; ++k) {
1564 m = k * 49;
1565 fasp_blas_smat_mul_nc7(diaginv + i * 49, val + m, valb + m);
1566 }
1567 } // end of main loop
1568 }
1569
1570 break;
1571
1572 default:
1573 if (use_openmp) {
1574#ifdef _OPENMP
1575#pragma omp parallel for private(myid, mybegin, myend, i, ibegin, iend, m, k)
1576#endif
1577 for (myid = 0; myid < nthreads; myid++) {
1578 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
1579 for (i = mybegin; i < myend; ++i) {
1580 // get the diagonal sub-blocks
1581 ibegin = IA[i];
1582 iend = IA[i + 1];
1583 m = ibegin * nb2;
1584 memcpy(diaginv + i * nb2, val + m, nb2 * sizeof(REAL));
1585 fasp_smat_identity(valb + m, nb, nb2);
1586
1587 // compute the inverses of the diagonal sub-blocks
1588 fasp_smat_inv(diaginv + i * nb2, nb);
1589
1590 // compute D^{-1}*A
1591 for (k = ibegin + 1; k < iend; ++k) {
1592 m = k * nb2;
1593 fasp_blas_smat_mul(diaginv + i * nb2, val + m, valb + m,
1594 nb);
1595 }
1596 }
1597 } // end of main loop
1598 } else {
1599 for (i = 0; i < ROW; ++i) {
1600 // get the diagonal sub-blocks
1601 ibegin = IA[i];
1602 iend = IA[i + 1];
1603 m = ibegin * nb2;
1604 memcpy(diaginv + i * nb2, val + m, nb2 * sizeof(REAL));
1605 fasp_smat_identity(valb + m, nb, nb2);
1606
1607 // compute the inverses of the diagonal sub-blocks
1608 fasp_smat_inv(diaginv + i * nb2, nb);
1609
1610 // compute D^{-1}*A
1611 for (k = ibegin + 1; k < iend; ++k) {
1612 m = k * nb2;
1613 fasp_blas_smat_mul(diaginv + i * nb2, val + m, valb + m, nb);
1614 }
1615 } // end of main loop
1616 }
1617
1618 break;
1619 }
1620
1621FINISHED:
1622 return (B);
1623}
1624
1641void fasp_dbsr_getdiag(INT n, const dBSRmat* A, REAL* diag)
1642{
1643 const INT nb2 = A->nb * A->nb;
1644
1645 INT i, k;
1646
1647 if (n == 0 || n > A->ROW || n > A->COL) n = MIN(A->ROW, A->COL);
1648
1649#ifdef _OPENMP
1650#pragma omp parallel for private(i, k) if (n > OPENMP_HOLDS)
1651#endif
1652 for (i = 0; i < n; ++i) {
1653 for (k = A->IA[i]; k < A->IA[i + 1]; ++k) {
1654 if (A->JA[k] == i) {
1655 memcpy(diag + i * nb2, A->val + k * nb2, nb2 * sizeof(REAL));
1656 break;
1657 }
1658 }
1659 }
1660}
1661
1678{
1679
1680 // members of A
1681 const INT ROW = A->ROW;
1682 const INT ROW_plus_one = ROW + 1;
1683 const INT COL = A->COL;
1684 const INT NNZ = A->NNZ;
1685 const INT nb = A->nb;
1686
1687 const INT* IA = A->IA;
1688 const INT* JA = A->JA;
1689 const REAL* val = A->val;
1690
1691 INT* IAb = NULL;
1692 INT* JAb = NULL;
1693 REAL* valb = NULL;
1694
1695 INT nb2 = nb * nb;
1696 INT i, j, k;
1697
1698 // Create a dBSRmat 'B'
1699 dBSRmat B = fasp_dbsr_create(ROW, COL, NNZ, nb, 0);
1700
1701 IAb = B.IA;
1702 JAb = B.JA;
1703 valb = B.val;
1704
1705 fasp_iarray_cp(ROW_plus_one, IA, IAb);
1706 fasp_iarray_cp(NNZ, JA, JAb);
1707
1708 // work array
1709 REAL* temp = (REAL*)fasp_mem_calloc(nb2, sizeof(REAL));
1710
1711 // get DL and DU
1712 switch (nb) {
1713
1714 case 2:
1715
1716 for (i = 0; i < ROW; i++) {
1717
1718 for (j = IA[i]; j < IA[i + 1]; j++) {
1719
1720 if (JA[j] == i) {
1721
1722 temp[0] = val[j * nb2];
1723 temp[1] = val[j * nb2 + 1];
1724 temp[2] = val[j * nb2 + 2];
1725 temp[3] = val[j * nb2 + 3];
1726
1727 // form DL
1728 DL[i * nb2] = 1.0;
1729 DL[i * nb2 + 1] = 0.0;
1730 DL[i * nb2 + 2] = -temp[2] / temp[0];
1731 DL[i * nb2 + 3] = 1.0;
1732 // DL[i*nb2+2] = -temp[2]/(temp[0]*s);
1733 // DL[i*nb2+3] = 1.0/s;
1734
1735 // form DU
1736 DU[i * nb2] = 1.0;
1737 DU[i * nb2 + 1] = -temp[1] / temp[0];
1738 DU[i * nb2 + 2] = 0.0;
1739 DU[i * nb2 + 3] = 1.0;
1740
1741 break;
1742
1743 } // end of if (JA[j] == i)
1744
1745 } // end of for (j=IA[i]; j<IA[i+1]; j++)
1746
1747 } // end of for (i=0; i<ROW; i++)
1748
1749 break;
1750
1751 case 3:
1752
1753 for (i = 0; i < ROW; i++) {
1754
1755 for (j = IA[i]; j < IA[i + 1]; j++) {
1756
1757 if (JA[j] == i) {
1758
1759 temp[0] = val[j * nb2];
1760 temp[1] = val[j * nb2 + 1];
1761 temp[2] = val[j * nb2 + 2];
1762 temp[3] = val[j * nb2 + 3];
1763 temp[4] = val[j * nb2 + 4];
1764 temp[5] = val[j * nb2 + 5];
1765 temp[6] = val[j * nb2 + 6];
1766 temp[7] = val[j * nb2 + 7];
1767 temp[8] = val[j * nb2 + 8];
1768
1769 // some auxiliry variables
1770 REAL s22 = temp[4] - ((temp[1] * temp[3]) / temp[0]);
1771 REAL s23 = temp[5] - ((temp[2] * temp[3]) / temp[0]);
1772 REAL s32 = temp[7] - ((temp[1] * temp[6]) / temp[0]);
1773
1774 // form DL
1775 DL[i * nb2] = 1.0;
1776 DL[i * nb2 + 1] = 0.0;
1777 DL[i * nb2 + 2] = 0.0;
1778 DL[i * nb2 + 3] = -temp[3] / temp[0];
1779 DL[i * nb2 + 4] = 1.0;
1780 DL[i * nb2 + 5] = 0.0;
1781 DL[i * nb2 + 6] =
1782 -temp[6] / temp[0] + (temp[3] / temp[0]) * (s32 / s22);
1783 DL[i * nb2 + 7] = -s32 / s22;
1784 DL[i * nb2 + 8] = 1.0;
1785
1786 // form DU
1787 DU[i * nb2] = 1.0;
1788 DU[i * nb2 + 1] = -temp[1] / temp[0];
1789 DU[i * nb2 + 2] =
1790 -temp[2] / temp[0] + (temp[1] / temp[0]) * (s23 / s22);
1791 DU[i * nb2 + 3] = 0.0;
1792 DU[i * nb2 + 4] = 1.0;
1793 DU[i * nb2 + 5] = -s23 / s22;
1794 DU[i * nb2 + 6] = 0.0;
1795 DU[i * nb2 + 7] = 0.0;
1796 DU[i * nb2 + 8] = 1.0;
1797
1798 break;
1799
1800 } // end of if (JA[j] == i)
1801
1802 } // end of for (j=IA[i]; j<IA[i+1]; j++)
1803
1804 } // end of for (i=0; i<ROW; i++)
1805
1806 break;
1807
1808 default:
1809 printf("### ERROR: Only works for nb = 2, 3! [%s]\n", __FUNCTION__);
1810 break;
1811
1812 } // end of switch
1813
1814 // compute B = DL*A*DU
1815 switch (nb) {
1816
1817 case 2:
1818
1819 for (i = 0; i < ROW; i++) {
1820
1821 for (j = IA[i]; j < IA[i + 1]; j++) {
1822
1823 k = JA[j];
1824
1825 // left multiply DL
1826 fasp_blas_smat_mul_nc2(DL + i * nb2, val + j * nb2, temp);
1827
1828 // right multiply DU
1829 fasp_blas_smat_mul_nc2(temp, DU + k * nb2, valb + j * nb2);
1830
1831 // for diagonal block, set it to be diagonal matrix
1832 if (JA[j] == i) {
1833
1834 valb[j * nb2 + 1] = 0.0;
1835 valb[j * nb2 + 2] = 0.0;
1836
1837 } // end if (JA[j] == i)
1838
1839 } // end for (j=IA[i]; j<IA[i+1]; j++)
1840
1841 } // end of for (i=0; i<ROW; i++)
1842
1843 break;
1844
1845 case 3:
1846
1847 for (i = 0; i < ROW; i++) {
1848
1849 for (j = IA[i]; j < IA[i + 1]; j++) {
1850
1851 k = JA[j];
1852
1853 // left multiply DL
1854 fasp_blas_smat_mul_nc3(DL + i * nb2, val + j * nb2, temp);
1855
1856 // right multiply DU
1857 fasp_blas_smat_mul_nc3(temp, DU + k * nb2, valb + j * nb2);
1858
1859 // for diagonal block, set it to be diagonal matrix
1860 if (JA[j] == i) {
1861
1862 valb[j * nb2 + 1] = 0.0;
1863 valb[j * nb2 + 2] = 0.0;
1864 valb[j * nb2 + 3] = 0.0;
1865 valb[j * nb2 + 5] = 0.0;
1866 valb[j * nb2 + 6] = 0.0;
1867 valb[j * nb2 + 7] = 0.0;
1868 if (ABS(valb[j * nb2 + 4]) < SMALLREAL)
1869 valb[j * nb2 + 4] = SMALLREAL;
1870 if (ABS(valb[j * nb2 + 8]) < SMALLREAL)
1871 valb[j * nb2 + 8] = SMALLREAL;
1872
1873 } // end if (JA[j] == i)
1874
1875 } // end for (j=IA[i]; j<IA[i+1]; j++)
1876
1877 } // end of for (i=0; i<ROW; i++)
1878
1879 break;
1880
1881 default:
1882 printf("### ERROR: Only works for nb = 2, 3! [%s]\n", __FUNCTION__);
1883 break;
1884 }
1885
1886 // return
1887 return B;
1888}
1889
1906{
1907
1908 // members of A
1909 INT ROW = A->ROW;
1910 INT ROW_plus_one = ROW + 1;
1911 INT COL = A->COL;
1912 INT NNZ = A->NNZ;
1913 INT nb = A->nb;
1914 INT* IA = A->IA;
1915 INT* JA = A->JA;
1916 REAL* val = A->val;
1917
1918 INT* IAb = NULL;
1919 INT* JAb = NULL;
1920 REAL* valb = NULL;
1921
1922 INT nb2 = nb * nb;
1923 INT i, j, k;
1924
1925 REAL sqt3, sqt4, sqt8;
1926
1927 // Create a dBSRmat 'B'
1928 dBSRmat B = fasp_dbsr_create(ROW, COL, NNZ, nb, 0);
1929
1930 REAL* temp = (REAL*)fasp_mem_calloc(nb * nb, sizeof(REAL));
1931
1932 IAb = B.IA;
1933 JAb = B.JA;
1934 valb = B.val;
1935
1936 fasp_iarray_cp(ROW_plus_one, IA, IAb);
1937 fasp_iarray_cp(NNZ, JA, JAb);
1938
1939 // get DL and DU
1940 switch (nb) {
1941 case 2:
1942 for (i = 0; i < ROW; i++) {
1943 for (j = IA[i]; j < IA[i + 1]; j++) {
1944 if (JA[j] == i) {
1945 REAL temp0 = val[j * nb2];
1946 REAL temp1 = val[j * nb2 + 1];
1947 REAL temp2 = val[j * nb2 + 2];
1948 REAL temp3 = val[j * nb2 + 3];
1949
1950 if (ABS(temp3) < SMALLREAL) temp3 = SMALLREAL;
1951
1952 sqt3 = sqrt(ABS(temp3));
1953
1954 // form DL
1955 DL[i * nb2] = 1.0;
1956 DL[i * nb2 + 1] = 0.0;
1957 DL[i * nb2 + 2] = -temp2 / temp0 / sqt3;
1958 DL[i * nb2 + 3] = 1.0 / sqt3;
1959
1960 // form DU
1961 DU[i * nb2] = 1.0;
1962 DU[i * nb2 + 1] = -temp1 / temp0 / sqt3;
1963 DU[i * nb2 + 2] = 0.0;
1964 DU[i * nb2 + 3] = 1.0 / sqt3;
1965 break;
1966 }
1967 }
1968 }
1969
1970 break;
1971
1972 case 3:
1973 for (i = 0; i < ROW; i++) {
1974 for (j = IA[i]; j < IA[i + 1]; j++) {
1975 if (JA[j] == i) {
1976 REAL temp0 = val[j * nb2];
1977 REAL temp1 = val[j * nb2 + 1];
1978 REAL temp2 = val[j * nb2 + 2];
1979 REAL temp3 = val[j * nb2 + 3];
1980 REAL temp4 = val[j * nb2 + 4];
1981 REAL temp5 = val[j * nb2 + 5];
1982 REAL temp6 = val[j * nb2 + 6];
1983 REAL temp7 = val[j * nb2 + 7];
1984 REAL temp8 = val[j * nb2 + 8];
1985
1986 if (ABS(temp4) < SMALLREAL) temp4 = SMALLREAL;
1987 if (ABS(temp8) < SMALLREAL) temp8 = SMALLREAL;
1988
1989 sqt4 = sqrt(ABS(temp4));
1990 sqt8 = sqrt(ABS(temp8));
1991
1992 // some auxiliary variables
1993 REAL s22 = temp4 - ((temp1 * temp3) / temp0);
1994 REAL s23 = temp5 - ((temp2 * temp3) / temp0);
1995 REAL s32 = temp7 - ((temp1 * temp6) / temp0);
1996
1997 // form DL
1998 DL[i * nb2] = 1.0;
1999 DL[i * nb2 + 1] = 0.0;
2000 DL[i * nb2 + 2] = 0.0;
2001 DL[i * nb2 + 3] = -temp3 / temp0 / sqt4;
2002 DL[i * nb2 + 4] = 1.0 / sqt4;
2003 DL[i * nb2 + 5] = 0.0;
2004 DL[i * nb2 + 6] =
2005 (-temp6 / temp0 + (temp3 / temp0) * (s32 / s22)) / sqt8;
2006 DL[i * nb2 + 7] = -s32 / s22 / sqt8;
2007 DL[i * nb2 + 8] = 1.0 / sqt8;
2008
2009 // form DU
2010 DU[i * nb2] = 1.0;
2011 DU[i * nb2 + 1] = -temp1 / temp0 / sqt4;
2012 DU[i * nb2 + 2] =
2013 (-temp2 / temp0 + (temp1 / temp0) * (s23 / s22)) / sqt8;
2014 DU[i * nb2 + 3] = 0.0;
2015 DU[i * nb2 + 4] = 1.0 / sqt4;
2016 DU[i * nb2 + 5] = -s23 / s22 / sqt8;
2017 DU[i * nb2 + 6] = 0.0;
2018 DU[i * nb2 + 7] = 0.0;
2019 DU[i * nb2 + 8] = 1.0 / sqt8;
2020
2021 break;
2022 }
2023 }
2024 }
2025
2026 break;
2027
2028 default:
2029 printf("### ERROR: Only works for nb = 2, 3! [%s]\n", __FUNCTION__);
2030 break;
2031
2032 } // end of switch
2033
2034 // compute B = DL*A*DU
2035 switch (nb) {
2036
2037 case 2:
2038 for (i = 0; i < ROW; i++) {
2039 for (j = IA[i]; j < IA[i + 1]; j++) {
2040 k = JA[j];
2041 // left multiply DL
2042 fasp_blas_smat_mul_nc2(DL + i * nb2, val + j * nb2, temp);
2043 // right multiply DU
2044 fasp_blas_smat_mul_nc2(temp, DU + k * nb2, valb + j * nb2);
2045 // for diagonal block, set it to be diagonal matrix
2046 if (JA[j] == i) {
2047 valb[j * nb2 + 1] = 0.0;
2048 valb[j * nb2 + 2] = 0.0;
2049 if (ABS(valb[j * nb2 + 3]) < SMALLREAL)
2050 valb[j * nb2 + 3] = SMALLREAL;
2051 }
2052 }
2053 }
2054
2055 break;
2056
2057 case 3:
2058 for (i = 0; i < ROW; i++) {
2059 for (j = IA[i]; j < IA[i + 1]; j++) {
2060 k = JA[j];
2061 // left multiply DL
2062 fasp_blas_smat_mul_nc3(DL + i * nb2, val + j * nb2, temp);
2063 // right multiply DU
2064 fasp_blas_smat_mul_nc3(temp, DU + k * nb2, valb + j * nb2);
2065 // for diagonal block, set it to be diagonal matrix
2066 if (JA[j] == i) {
2067 valb[j * nb2 + 1] = 0.0;
2068 valb[j * nb2 + 2] = 0.0;
2069 valb[j * nb2 + 3] = 0.0;
2070 valb[j * nb2 + 5] = 0.0;
2071 valb[j * nb2 + 6] = 0.0;
2072 valb[j * nb2 + 7] = 0.0;
2073 if (ABS(valb[j * nb2 + 4]) < SMALLREAL)
2074 valb[j * nb2 + 4] = SMALLREAL;
2075 if (ABS(valb[j * nb2 + 8]) < SMALLREAL)
2076 valb[j * nb2 + 8] = SMALLREAL;
2077 }
2078 }
2079 }
2080 break;
2081
2082 default:
2083 printf("### ERROR: Only works for nb = 2, 3! [%s]\n", __FUNCTION__);
2084 break;
2085 }
2086
2087 // return
2088 return B;
2089}
2090
2107{
2108 const INT n = A->ROW, nnz = A->NNZ;
2109 const INT * ia = A->IA, *ja = A->JA;
2110 const REAL* Aval = A->val;
2111 const INT nb = A->nb, nb2 = nb * nb;
2112 const INT manner = A->storage_manner;
2113 SHORT nthreads = 1, use_openmp = FALSE;
2114
2115 INT i, j, k, jaj, i1, i2, start, jj;
2116
2117#ifdef _OPENMP
2118 if (MIN(n, nnz) > OPENMP_HOLDS) {
2119 use_openmp = 0; // TRUE;
2120 nthreads = fasp_get_num_threads();
2121 }
2122#endif
2123
2124 dBSRmat Aperm = fasp_dbsr_create(n, n, nnz, nb, manner);
2125
2126 // form the transpose of P
2127 INT* Pt = (INT*)fasp_mem_calloc(n, sizeof(INT));
2128
2129 if (use_openmp) {
2130 INT myid, mybegin, myend;
2131#ifdef _OPENMP
2132#pragma omp parallel for private(myid, mybegin, myend, i)
2133#endif
2134 for (myid = 0; myid < nthreads; ++myid) {
2135 fasp_get_start_end(myid, nthreads, n, &mybegin, &myend);
2136 for (i = mybegin; i < myend; ++i) Pt[P[i]] = i;
2137 }
2138 } else {
2139 for (i = 0; i < n; ++i) Pt[P[i]] = i;
2140 }
2141
2142 // compute IA of P*A (row permutation)
2143 Aperm.IA[0] = 0;
2144 for (i = 0; i < n; ++i) {
2145 k = P[i];
2146 Aperm.IA[i + 1] = Aperm.IA[i] + (ia[k + 1] - ia[k]);
2147 }
2148
2149 // perform actual P*A
2150 if (use_openmp) {
2151 INT myid, mybegin, myend;
2152#ifdef _OPENMP
2153#pragma omp parallel for private(myid, mybegin, myend, i, i1, i2, k, start, j, jaj, jj)
2154#endif
2155 for (myid = 0; myid < nthreads; ++myid) {
2156 fasp_get_start_end(myid, nthreads, n, &mybegin, &myend);
2157 for (i = mybegin; i < myend; ++i) {
2158 i1 = Aperm.IA[i];
2159 i2 = Aperm.IA[i + 1] - 1;
2160 k = P[i];
2161 start = ia[k];
2162 for (j = i1; j <= i2; ++j) {
2163 jaj = start + j - i1;
2164 Aperm.JA[j] = ja[jaj];
2165 for (jj = 0; jj < nb2; ++jj)
2166 Aperm.val[j * nb2 + jj] = Aval[jaj * nb2 + jj];
2167 }
2168 }
2169 }
2170 } else {
2171 for (i = 0; i < n; ++i) {
2172 i1 = Aperm.IA[i];
2173 i2 = Aperm.IA[i + 1] - 1;
2174 k = P[i];
2175 start = ia[k];
2176 for (j = i1; j <= i2; ++j) {
2177 jaj = start + j - i1;
2178 Aperm.JA[j] = ja[jaj];
2179 for (jj = 0; jj < nb2; ++jj)
2180 Aperm.val[j * nb2 + jj] = Aval[jaj * nb2 + jj];
2181 }
2182 }
2183 }
2184
2185 // perform P*A*P' (column permutation)
2186 if (use_openmp) {
2187 INT myid, mybegin, myend;
2188#ifdef _OPENMP
2189#pragma omp parallel for private(myid, mybegin, myend, k, j)
2190#endif
2191 for (myid = 0; myid < nthreads; ++myid) {
2192 fasp_get_start_end(myid, nthreads, nnz, &mybegin, &myend);
2193 for (k = mybegin; k < myend; ++k) {
2194 j = Aperm.JA[k];
2195 Aperm.JA[k] = Pt[j];
2196 }
2197 }
2198 } else {
2199 for (k = 0; k < nnz; ++k) {
2200 j = Aperm.JA[k];
2201 Aperm.JA[k] = Pt[j];
2202 }
2203 }
2204
2205 fasp_mem_free(Pt);
2206 Pt = NULL;
2207
2208 return (Aperm);
2209}
2210
2224{
2225 INT count = 0;
2226 const INT num_rowsA = A->ROW;
2227 const INT nb = A->nb;
2228 const INT nb2 = nb * nb;
2229 INT* A_i = A->IA;
2230 INT* A_j = A->JA;
2231 REAL* A_data = A->val;
2232
2233 INT i, ii, j, jj, ibegin, iend, iend1;
2234
2235#ifdef _OPENMP
2236 // variables for OpenMP
2237 INT myid, mybegin, myend;
2238 INT nthreads = fasp_get_num_threads();
2239#endif
2240
2241#ifdef _OPENMP
2242 if (num_rowsA > OPENMP_HOLDS) {
2243#pragma omp parallel for private(myid, mybegin, myend, i, ii, j, jj, ibegin, iend, \
2244 iend1)
2245 for (myid = 0; myid < nthreads; myid++) {
2246 fasp_get_start_end(myid, nthreads, num_rowsA, &mybegin, &myend);
2247 for (i = mybegin; i < myend; i++) {
2248 ibegin = A_i[i];
2249 iend = A_i[i + 1];
2250 iend1 = iend - 1;
2251 for (j = ibegin; j < iend1; j++) {
2252 if (A_j[j] > -1) {
2253 for (jj = j + 1; jj < iend; jj++) {
2254 if (A_j[j] == A_j[jj]) {
2255 // add jj col to j
2256 for (ii = 0; ii < nb2; ii++)
2257 A_data[j * nb2 + ii] += A_data[jj * nb2 + ii];
2258 A_j[jj] = -1;
2259 count++;
2260 }
2261 }
2262 }
2263 }
2264 }
2265 }
2266 } else {
2267#endif
2268 for (i = 0; i < num_rowsA; i++) {
2269 ibegin = A_i[i];
2270 iend = A_i[i + 1];
2271 iend1 = iend - 1;
2272 for (j = ibegin; j < iend1; j++) {
2273 if (A_j[j] > -1) {
2274 for (jj = j + 1; jj < iend; jj++) {
2275 if (A_j[j] == A_j[jj]) {
2276 // add jj col to j
2277 for (ii = 0; ii < nb2; ii++)
2278 A_data[j * nb2 + ii] += A_data[jj * nb2 + ii];
2279 printf("### WARNING: Same col indices at %d, col %d (%d "
2280 "%d)!\n",
2281 i, A_j[j], j, jj);
2282 A_j[jj] = -1;
2283 count++;
2284 }
2285 }
2286 }
2287 }
2288 }
2289#ifdef _OPENMP
2290 }
2291#endif
2292
2293 if (count > 0) {
2294 INT* tempA_i = (INT*)fasp_mem_calloc(num_rowsA + 1, sizeof(INT));
2295 memcpy(tempA_i, A_i, (num_rowsA + 1) * sizeof(INT));
2296 jj = 0;
2297 A_i[0] = jj;
2298 for (i = 0; i < num_rowsA; i++) {
2299 ibegin = tempA_i[i];
2300 iend = tempA_i[i + 1];
2301 for (j = ibegin; j < iend; j++) {
2302 if (A_j[j] > -1) {
2303 memcpy(A_data + jj * nb2, A_data + j * nb2, (nb2) * sizeof(REAL));
2304 A_j[jj] = A_j[j];
2305 jj++;
2306 }
2307 }
2308 A_i[i + 1] = jj;
2309 }
2310 A->NNZ = jj;
2311 fasp_mem_free(tempA_i);
2312 tempA_i = NULL;
2313
2314 printf("### WARNING: %d col indices have been merged!\n", count);
2315 }
2316
2317 return count;
2318}
2319
2337void dBSRmat_Multicoloring_Theta(
2338 dBSRmat* A, REAL theta, INT* rowmax, INT* groups, INT** ic_out, INT** icmap_out)
2339{
2340 INT k, i, j, pre, group;
2341 INT igold, iend, iavg;
2342 INT icount;
2343 INT front, rear;
2344 INT n = A->ROW;
2345 INT *IC, *ICMAP, color;
2346 //---------------------------------------------------------------------------
2347 iCSRmat S;
2348 INT * IA, *JA;
2349 if (theta > 0 && theta < 1.0) {
2350 generate_S_theta_bsr(A, &S, theta);
2351 IA = S.IA;
2352 JA = S.JA;
2353 } else if (theta == 1.0) {
2354
2355 IC = (INT*)malloc(sizeof(INT) * 2);
2356 ICMAP = (INT*)malloc(sizeof(INT) * (n + 1));
2357 IC[0] = 0;
2358 IC[1] = n;
2359
2360#ifdef _OPENMP
2361#pragma omp parallel for private(k)
2362#endif
2363 for (k = 0; k < n; k++) ICMAP[k] = k;
2364
2365 color = 1;
2366 *groups = 1;
2367 *rowmax = 1;
2368 printf("%s Theta = %lf \n", __FUNCTION__, theta);
2369
2370 return;
2371
2372 } else {
2373 IA = A->IA;
2374 JA = A->JA;
2375 }
2376 //---------------------------------------------------------------------------
2377 INT* cq = (INT*)malloc(sizeof(INT) * (n + 1));
2378 INT* newr = (INT*)malloc(sizeof(INT) * (n + 1));
2379
2380#ifdef _OPENMP
2381#pragma omp parallel for private(k)
2382#endif
2383 for (k = 0; k < n; k++) {
2384 cq[k] = k;
2385 }
2386 group = 0;
2387 for (k = 0; k < n; k++) {
2388 if ((A->IA[k + 1] - A->IA[k]) > group) group = A->IA[k + 1] - A->IA[k];
2389 }
2390 *rowmax = group;
2391
2392#if 0
2393 iavg = IA[n]/n ;
2394 igold = (INT)MAX(iavg,group*0.618) +1;
2395 igold = group ;
2396#endif
2397
2398 IC = (INT*)malloc(sizeof(INT) * (group + 2));
2399 ICMAP = (INT*)malloc(sizeof(INT) * (n + 1));
2400
2401 front = n - 1;
2402 rear = n - 1;
2403
2404 memset(newr, -1, sizeof(INT) * (n + 1));
2405 memset(ICMAP, 0, sizeof(INT) * n);
2406
2407 group = 0;
2408 icount = 0;
2409 IC[0] = 0;
2410 pre = 0;
2411
2412 do {
2413 // front = (front+1)%n;
2414 front++;
2415 if (front == n) front = 0; // front = front < n ? front : 0 ;
2416 i = cq[front];
2417
2418 if (i <= pre) {
2419 IC[group] = icount;
2420 ICMAP[icount] = i;
2421 group++;
2422 icount++;
2423#if 0
2424 if ((IA[i+1]-IA[i]) > igold)
2425 iend = MIN(IA[i+1], (IA[i] + igold));
2426 else
2427#endif
2428 iend = IA[i + 1];
2429 for (j = IA[i]; j < iend; j++) newr[JA[j]] = group;
2430 } else if (newr[i] == group) {
2431 // rear = (rear +1)%n;
2432 rear++;
2433 if (rear == n) rear = 0;
2434 cq[rear] = i;
2435 } else {
2436 ICMAP[icount] = i;
2437 icount++;
2438#if 0
2439 if ((IA[i+1] - IA[i]) > igold) iend =MIN(IA[i+1], (IA[i] + igold));
2440 else
2441#endif
2442 iend = IA[i + 1];
2443 for (j = IA[i]; j < iend; j++) newr[JA[j]] = group;
2444 }
2445 pre = i;
2446
2447 // printf("pre = %d\n",pre);
2448 } while (rear != front);
2449
2450 // printf("group\n");
2451 IC[group] = icount;
2452 color = group;
2453
2454#if 0
2455 for (i = 0; i < color; i++) {
2456 for (j = IC[i]; j < IC[i + 1]; j++)
2457 printf("color %d ICMAP[%d] = %d \n", i, j, ICMAP[j]);
2458 printf("color = %d row= %d %d\n", color, n, IC[i + 1] - IC[i]);
2459 getchar();
2460 }
2461 // printf(" Max Row Numbers %d avg %d igold %d max %d %d\n", group, iavg, igold,
2462 // (INT)MAX(iavg,group*0.618),A->IA[n]/n );
2463#endif
2464
2465 free(cq);
2466 free(newr);
2467 if (theta > 0) {
2468 fasp_mem_free(S.IA);
2469 fasp_mem_free(S.JA);
2470 }
2471
2472 // output
2473 *groups = group;
2474 *ic_out = IC;
2475 *icmap_out = ICMAP;
2476
2477 return;
2478}
2479
2480/*---------------------------------*/
2481/*-- Private Functions --*/
2482/*---------------------------------*/
2483
2484static double dense_matrix_max_internal(double* A, int m, int n)
2485{
2486 double norm = 0.0, sum;
2487 int i, j;
2488 for (i = 0; i < m; i++) {
2489 for (j = 0; j < n; j++) {
2490 norm = MAX(norm, fabs(A[i * m + j]));
2491 }
2492 }
2493 return (norm);
2494}
2495
2496static void generate_S_theta_bsr(dBSRmat* A, iCSRmat* S, REAL theta)
2497{
2498 const INT row = A->ROW, col = A->COL;
2499 const INT row_plus_one = row + 1;
2500 const INT nnz = A->IA[row] - A->IA[0];
2501 const INT nb = A->nb, nb2 = nb * nb;
2502
2503 INT index, i, j, begin_row, end_row;
2504 INT * ia = A->IA, *ja = A->JA;
2505 REAL* aj = A->val;
2506
2507 // get the diagnal entry of A
2508 // dvector diag; fasp_dcsr_getdiag(0, A, &diag);
2509
2510 /* generate S */
2511 REAL row_abs_sum;
2512
2513 // copy the structure of A to S
2514 S->row = row;
2515 S->col = col;
2516 S->nnz = nnz;
2517 S->val = NULL;
2518
2519 S->IA = (INT*)fasp_mem_calloc(row_plus_one, sizeof(INT));
2520
2521 S->JA = (INT*)fasp_mem_calloc(nnz, sizeof(INT));
2522
2523 fasp_iarray_cp(row_plus_one, ia, S->IA);
2524 fasp_iarray_cp(nnz, ja, S->JA);
2525
2526#ifdef _OPENMP
2527#pragma omp parallel for private(i, j, begin_row, end_row, row_abs_sum)
2528#endif
2529 for (i = 0; i < row; ++i) {
2530 /* compute scaling factor and row sum */
2531 row_abs_sum = 0;
2532 begin_row = ia[i];
2533 end_row = ia[i + 1];
2534 for (j = begin_row; j < end_row; j++) {
2535 // row_abs_sum += ABS(aj[j]);
2536 row_abs_sum += dense_matrix_max_internal(aj + j * nb2, nb, nb);
2537 }
2538 row_abs_sum = row_abs_sum * theta;
2539
2540 /* deal with the element of S */
2541 for (j = begin_row; j < end_row; j++) {
2542 // if ((row_abs_sum >= ABS(aj[j])) && (ja[j] != i)) {
2543 if ((row_abs_sum >= dense_matrix_max_internal(aj + j * nb2, nb, nb)) &&
2544 (ja[j] != i)) {
2545 S->JA[j] = -1;
2546 }
2547 }
2548 } // end for i
2549
2550 /* Compress the strength matrix */
2551 index = 0;
2552 for (i = 0; i < row; ++i) {
2553 S->IA[i] = index;
2554 begin_row = ia[i];
2555 end_row = ia[i + 1] - 1;
2556 for (j = begin_row; j <= end_row; j++) {
2557 if (S->JA[j] > -1) {
2558 S->JA[index] = S->JA[j];
2559 index++;
2560 }
2561 }
2562 }
2563
2564 if (index > 0) {
2565 S->IA[row] = index;
2566 S->nnz = index;
2567 S->JA = (INT*)fasp_mem_realloc(S->JA, index * sizeof(INT));
2568 } else {
2569 S->nnz = 0;
2570 S->JA = NULL;
2571 }
2572}
2573
2574/*---------------------------------*/
2575/*-- End of File --*/
2576/*---------------------------------*/
void fasp_iarray_set(const INT n, INT *x, const INT val)
Set initial value for an array to be x=val.
Definition: AuxArray.c:98
void fasp_iarray_cp(const INT n, const INT *x, INT *y)
Copy an array to the other y=x.
Definition: AuxArray.c:227
void fasp_mem_free(void *mem)
Free up previous allocated memory body and set pointer to NULL.
Definition: AuxMemory.c:152
void * fasp_mem_realloc(void *oldmem, const LONGLONG tsize)
Reallocate, initiate, and check memory.
Definition: AuxMemory.c:113
void * fasp_mem_calloc(const unsigned int size, const unsigned int type)
Allocate, initiate, and check memory.
Definition: AuxMemory.c:65
void fasp_get_start_end(const INT procid, const INT nprocs, const INT n, INT *start, INT *end)
Assign Load to each thread.
Definition: AuxThreads.c:93
void fasp_smat_inv_nc3(REAL *a)
Compute the inverse matrix of a 3*3 full matrix A (in place)
void fasp_smat_identity_nc7(REAL *a)
Set a 7*7 full matrix to be a identity.
void fasp_smat_identity_nc3(REAL *a)
Set a 3*3 full matrix to be a identity.
SHORT fasp_smat_invp_nc(REAL *a, const INT n)
Compute the inverse of a matrix using Gauss Elimination with Pivoting.
void fasp_smat_inv_nc7(REAL *a)
Compute the inverse matrix of a 7*7 matrix a.
void fasp_smat_inv_nc2(REAL *a)
Compute the inverse matrix of a 2*2 full matrix A (in place)
SHORT fasp_smat_inv(REAL *a, const INT n)
Compute the inverse matrix of a small full matrix a.
void fasp_smat_identity_nc2(REAL *a)
Set a 2*2 full matrix to be a identity.
void fasp_smat_inv_nc5(REAL *a)
Compute the inverse matrix of a 5*5 full matrix A (in place)
void fasp_smat_identity_nc5(REAL *a)
Set a 5*5 full matrix to be a identity.
void fasp_smat_identity(REAL *a, const INT n, const INT n2)
Set a n*n full matrix to be a identity.
void fasp_blas_smat_mul(const REAL *a, const REAL *b, REAL *c, const INT n)
Compute the matrix product of two small full matrices a and b, stored in c.
Definition: BlaSmallMat.c:596
void fasp_blas_smat_mul_nc2(const REAL *a, const REAL *b, REAL *c)
Compute the matrix product of two 2* matrices a and b, stored in c.
Definition: BlaSmallMat.c:289
void fasp_blas_smat_mul_nc3(const REAL *a, const REAL *b, REAL *c)
Compute the matrix product of two 3*3 matrices a and b, stored in c.
Definition: BlaSmallMat.c:315
void fasp_blas_smat_mul_nc7(const REAL *a, const REAL *b, REAL *c)
Compute the matrix product of two 7*7 matrices a and b, stored in c.
Definition: BlaSmallMat.c:452
void fasp_blas_smat_mul_nc5(const REAL *a, const REAL *b, REAL *c)
Compute the matrix product of two 5*5 matrices a and b, stored in c.
Definition: BlaSmallMat.c:395
dBSRmat fasp_dbsr_create(const INT ROW, const INT COL, const INT NNZ, const INT nb, const INT storage_manner)
Create BSR sparse matrix data memory space.
Definition: BlaSparseBSR.c:48
dBSRmat fasp_dbsr_diaginv(const dBSRmat *A)
Compute B := D^{-1}*A, where 'D' is the block diagonal part of A.
Definition: BlaSparseBSR.c:645
INT fasp_dbsr_trans(const dBSRmat *A, dBSRmat *AT)
Find A^T from given dBSRmat matrix A.
Definition: BlaSparseBSR.c:246
SHORT fasp_dbsr_getblk(const dBSRmat *A, const INT *Is, const INT *Js, const INT m, const INT n, dBSRmat *B)
Get a sub BSR matrix of A with specified rows and columns.
Definition: BlaSparseBSR.c:332
void fasp_dbsr_getdiag(INT n, const dBSRmat *A, REAL *diag)
Abstract the diagonal blocks of a BSR matrix.
dBSRmat fasp_dbsr_diagLU(const dBSRmat *A, REAL *DL, REAL *DU)
Compute B := DL*A*DU. We decompose each diagonal block of A into LDU form and DL = diag(L^{-1}) and D...
void fasp_dbsr_cp(const dBSRmat *A, dBSRmat *B)
copy a dCSRmat to a new one B=A
Definition: BlaSparseBSR.c:169
dBSRmat fasp_dbsr_diagLU2(dBSRmat *A, REAL *DL, REAL *DU)
Compute B := DL*A*DU. We decompose each diagonal block of A into LDU form and DL = diag(L^{-1}) and D...
dBSRmat fasp_dbsr_perm(const dBSRmat *A, const INT *P)
Apply permutation of A, i.e. Aperm=PAP' by the orders given in P.
void fasp_dbsr_alloc(const INT ROW, const INT COL, const INT NNZ, const INT nb, const INT storage_manner, dBSRmat *A)
Allocate memory space for BSR format sparse matrix.
Definition: BlaSparseBSR.c:96
void fasp_dbsr_free(dBSRmat *A)
Free memory space for BSR format sparse matrix.
Definition: BlaSparseBSR.c:140
SHORT fasp_dbsr_compress_inplace(dBSRmat *A, const REAL dtol)
Compress a BSR matrix A IN PLACE by dropping small entries max(|Aij|) <= dtol.
Definition: BlaSparseBSR.c:197
dBSRmat fasp_dbsr_diaginv2(const dBSRmat *A, REAL *diaginv)
Compute B := D^{-1}*A, where 'D' is the block diagonal part of A.
Definition: BlaSparseBSR.c:803
SHORT fasp_dbsr_diagpref(dBSRmat *A)
Reorder the column and data arrays of a square BSR matrix, so that the first entry in each row is the...
Definition: BlaSparseBSR.c:435
dBSRmat fasp_dbsr_diaginv4(const dBSRmat *A, REAL *diaginv)
Compute B := D^{-1}*A, where 'D' is the block diagonal part of A.
INT fasp_dbsr_merge_col(dBSRmat *A)
Check and merge some same col index in one row.
dBSRmat fasp_dbsr_diaginv3(const dBSRmat *A, REAL *diaginv)
Compute B := D^{-1}*A, where 'D' is the block diagonal part of A.
Definition: BlaSparseBSR.c:911
dvector fasp_dbsr_getdiaginv(const dBSRmat *A)
Get D^{-1} of matrix A.
Definition: BlaSparseBSR.c:543
Main header file for the FASP project.
#define MIN(a, b)
Definition: fasp.h:83
#define REAL
Definition: fasp.h:75
#define SHORT
FASP integer and floating point numbers.
Definition: fasp.h:71
#define ABS(a)
Definition: fasp.h:84
#define MAX(a, b)
Definition of max, min, abs.
Definition: fasp.h:82
#define INT
Definition: fasp.h:72
#define FASP_SUCCESS
Definition of return status and error messages.
Definition: fasp_const.h:19
#define OPENMP_HOLDS
Definition: fasp_const.h:269
#define TRUE
Definition of logic type.
Definition: fasp_const.h:61
#define FALSE
Definition: fasp_const.h:62
#define ERROR_INPUT_PAR
Definition: fasp_const.h:24
#define ERROR_UNKNOWN
Definition: fasp_const.h:56
#define SMALLREAL
Definition: fasp_const.h:256
Block sparse row storage matrix of REAL type.
Definition: fasp_block.h:34
INT COL
number of cols of sub-blocks in matrix A, N
Definition: fasp_block.h:40
INT NNZ
number of nonzero sub-blocks in matrix A, NNZ
Definition: fasp_block.h:43
REAL * val
Definition: fasp_block.h:57
INT nb
dimension of each sub-block
Definition: fasp_block.h:46
INT * IA
integer array of row pointers, the size is ROW+1
Definition: fasp_block.h:60
INT ROW
number of rows of sub-blocks in matrix A, M
Definition: fasp_block.h:37
INT * JA
Definition: fasp_block.h:64
INT storage_manner
storage manner for each sub-block
Definition: fasp_block.h:49
Vector with n entries of REAL type.
Definition: fasp.h:354
REAL * val
actual vector entries
Definition: fasp.h:360
INT row
number of rows
Definition: fasp.h:357
Sparse matrix of INT type in CSR format.
Definition: fasp.h:190
INT col
column of matrix A, n
Definition: fasp.h:196
INT row
row number of matrix A, m
Definition: fasp.h:193
INT * IA
integer array of row pointers, the size is m+1
Definition: fasp.h:202
INT nnz
number of nonzero entries
Definition: fasp.h:199
INT * JA
integer array of column indexes, the size is nnz
Definition: fasp.h:205
INT * val
nonzero entries of A
Definition: fasp.h:208