22#include "fasp_functs.h"
24static double dense_matrix_max_internal(
double* A,
int m,
int n);
49 const INT ROW,
const INT COL,
const INT NNZ,
const INT nb,
const INT storage_manner)
65 if (nb > 0 && NNZ > 0) {
100 const INT storage_manner,
142 if (A == NULL)
return;
177 memcpy(B->
IA, A->
IA, (A->
ROW + 1) *
sizeof(
INT));
201 const INT nb = A->
nb, nb2 = nb * nb;
204 INT ibegin, iend = A->
IA[0];
207 for (i = 0; i < row; ++i) {
210 for (j = ibegin; j < iend; ++j)
211 if (dense_matrix_max_internal(&A->
val[j * nb2], nb, nb) > dtol ||
215 memcpy(A->
val + k * nb2, A->
val + j * nb2, (nb2) *
sizeof(
REAL));
226 printf(
"### WARNING: Size of compressed matrix is bigger than original!\n");
251 INT i, j, k, p, inb, jnb, nb2;
273 for (j = 0; j < nnz; ++j) {
275 if (i < m - 1) AT->
IA[i + 2]++;
278 for (i = 2; i <= m; ++i) AT->
IA[i] += AT->
IA[i - 1];
282 for (i = 0; i < n; ++i) {
283 INT ibegin = A->
IA[i], iend1 = A->
IA[i + 1];
284 for (p = ibegin; p < iend1; p++) {
288 for (inb = 0; inb < nb; inb++)
289 for (jnb = 0; jnb < nb; jnb++)
290 AT->
val[nb2 * k + inb * nb + jnb] =
291 A->
val[nb2 * p + jnb * nb + inb];
297 for (i = 0; i < n; ++i) {
298 INT ibegin = A->
IA[i], iend1 = A->
IA[i + 1];
299 for (p = ibegin; p < iend1; p++) {
340 INT i, j, k, nnz = 0;
344 const INT nb = A->
nb;
345 const INT nb2 = nb * nb;
348 INT myid, mybegin, stride_i, myend, nthreads;
351 nthreads = fasp_get_num_threads();
367 stride_i = n / nthreads;
368#pragma omp parallel private(myid, mybegin, myend, i) num_threads(nthreads)
370 myid = omp_get_thread_num();
371 mybegin = myid * stride_i;
372 if (myid < nthreads - 1)
373 myend = mybegin + stride_i;
376 for (i = mybegin; i < myend; ++i) {
377 col_flag[Js[i]] = i + 1;
382 for (i = 0; i < n; ++i) col_flag[Js[i]] = i + 1;
387 for (i = 0; i < m; ++i) {
388 for (k = A->
IA[Is[i]]; k < A->IA[Is[i] + 1]; ++k) {
390 if (col_flag[j] > 0) nnz++;
402 for (i = 0; i < m; ++i) {
403 for (k = A->
IA[Is[i]]; k < A->IA[Is[i] + 1]; ++k) {
405 if (col_flag[j] > 0) {
406 B->
JA[nnz] = col_flag[j] - 1;
407 memcpy(B->
val + nnz * nb2, A->
val + k * nb2, nb2 *
sizeof(
REAL));
438 const INT num_rowsA = A->
ROW;
439 const INT num_colsA = A->
COL;
440 const INT nb = A->
nb;
441 const INT nb2 = nb * nb;
443 const INT* A_i = A->
IA;
447 INT i, j, tempi, row_size;
451 INT myid, mybegin, myend, ibegin, iend;
452 INT nthreads = fasp_get_num_threads();
461#pragma omp parallel for private(myid, mybegin, myend, i, j, tempi, ibegin, iend)
462 for (myid = 0; myid < nthreads; myid++) {
464 for (i = mybegin; i < myend; i++) {
467 for (j = ibegin; j < iend; j++) {
472 A_j[ibegin] = A_j[j];
475 memcpy(tempd + myid * nb2, A_data + ibegin * nb2,
476 (nb2) *
sizeof(
REAL));
477 memcpy(A_data + ibegin * nb2, A_data + j * nb2,
478 (nb2) *
sizeof(
REAL));
479 memcpy(A_data + j * nb2, tempd + myid * nb2,
480 (nb2) *
sizeof(
REAL));
497 for (i = 0; i < num_rowsA; i++) {
498 row_size = A_i[i + 1] - A_i[i];
499 for (j = 0; j < row_size; j++) {
507 memcpy(tempd, A_data, (nb2) *
sizeof(
REAL));
508 memcpy(A_data, A_data + j * nb2, (nb2) *
sizeof(
REAL));
509 memcpy(A_data + j * nb2, tempd, (nb2) *
sizeof(
REAL));
514 if (j == row_size - 1)
return -2;
517 A_data += row_size * nb2;
547 const INT nb = A->
nb;
548 const INT nb2 = nb * nb;
549 const INT size = ROW * nb2;
550 const INT* IA = A->
IA;
551 const INT* JA = A->
JA;
560 INT myid, mybegin, myend;
565 nthreads = fasp_get_num_threads();
576#pragma omp parallel for private(myid, i, mybegin, myend, k)
578 for (myid = 0; myid < nthreads; myid++) {
580 for (i = mybegin; i < myend; ++i) {
581 for (k = IA[i]; k < IA[i + 1]; ++k) {
583 memcpy(diaginv.
val + i * nb2, val + k * nb2,
589 for (i = 0; i < ROW; ++i) {
590 for (k = IA[i]; k < IA[i + 1]; ++k) {
592 memcpy(diaginv.
val + i * nb2, val + k * nb2, nb2 *
sizeof(
REAL));
599#pragma omp parallel for private(myid, i, mybegin, myend)
601 for (myid = 0; myid < nthreads; myid++) {
604 for (i = mybegin; i < myend; ++i) {
608 for (i = mybegin; i < myend; ++i) {
610 diaginv.
val[i] = 1.0 / diaginv.
val[i];
616 for (i = 0; i < ROW; ++i) {
620 for (i = 0; i < ROW; ++i) {
622 diaginv.
val[i] = 1.0 / diaginv.
val[i];
651 const INT nb = A->
nb;
652 const INT nb2 = nb * nb;
653 const INT size = ROW * nb2;
654 const INT* IA = A->
IA;
655 const INT* JA = A->
JA;
668 INT myid, mybegin, myend;
674 memcpy(IAb, IA, (ROW + 1) *
sizeof(
INT));
679 memcpy(JAb, JA, NNZ *
sizeof(
INT));
686 nthreads = fasp_get_num_threads();
693#pragma omp parallel for private(myid, i, mybegin, myend, k)
695 for (myid = 0; myid < nthreads; myid++) {
697 for (i = mybegin; i < myend; ++i) {
698 for (k = IA[i]; k < IA[i + 1]; ++k) {
700 memcpy(diaginv + i * nb2, val + k * nb2, nb2 *
sizeof(
REAL));
705 for (i = 0; i < ROW; ++i) {
706 for (k = IA[i]; k < IA[i + 1]; ++k) {
708 memcpy(diaginv + i * nb2, val + k * nb2, nb2 *
sizeof(
REAL));
716#pragma omp parallel for private(myid, i, mybegin, myend)
718 for (myid = 0; myid < nthreads; myid++) {
721 for (i = mybegin; i < myend; ++i) {
725 for (i = mybegin; i < myend; ++i) {
727 diaginv[i] = 1.0 / diaginv[i];
733 for (i = 0; i < ROW; ++i) {
737 for (i = 0; i < ROW; ++i) {
739 diaginv[i] = 1.0 / diaginv[i];
747#pragma omp parallel for private(myid, mybegin, myend, i, k, m, j, l)
749 for (myid = 0; myid < nthreads; myid++) {
751 for (i = mybegin; i < myend; ++i) {
752 for (k = IA[i]; k < IA[i + 1]; ++k) {
757 memset(valb + m, 0X0, nb2 *
sizeof(
REAL));
758 for (l = 0; l < nb; l++) valb[m + l * nb + l] = 1.0;
766 for (i = 0; i < ROW; ++i) {
767 for (k = IA[i]; k < IA[i + 1]; ++k) {
772 memset(valb + m, 0X0, nb2 *
sizeof(
REAL));
773 for (l = 0; l < nb; l++) valb[m + l * nb + l] = 1.0;
809 const INT nb = A->
nb, nbp1 = nb + 1;
810 const INT nb2 = nb * nb;
821 INT i, k, m, l, ibegin, iend;
825 INT myid, mybegin, myend;
830 nthreads = fasp_get_num_threads();
841 memcpy(IAb, IA, (ROW + 1) *
sizeof(
INT));
846 memcpy(JAb, JA, NNZ *
sizeof(
INT));
853#pragma omp parallel for private(myid, i, mybegin, myend, ibegin, iend, k, m, l)
855 for (myid = 0; myid < nthreads; myid++) {
857 for (i = mybegin; i < myend; ++i) {
860 for (k = ibegin; k < iend; ++k) {
866 memset(valb + m, 0X0, nb2 *
sizeof(
REAL));
867 for (l = 0; l < nb; l++) valb[m + l * nbp1] = 1.0;
874 for (i = 0; i < ROW; ++i) {
877 for (k = ibegin; k < iend; ++k) {
883 memset(valb + m, 0X0, nb2 *
sizeof(
REAL));
884 for (l = 0; l < nb; l++) valb[m + l * nbp1] = 1.0;
916 INT ROW_plus_one = ROW + 1;
934 INT myid, mybegin, myend, stride_i, nthreads = 1;
937 nthreads = fasp_get_num_threads();
957 stride_i = ROW / nthreads;
958#pragma omp parallel private(myid, mybegin, myend, i, k, m, j) num_threads(nthreads)
960 myid = omp_get_thread_num();
961 mybegin = myid * stride_i;
962 if (myid < nthreads - 1)
963 myend = mybegin + stride_i;
966 for (i = mybegin; i < myend; ++i) {
971 memcpy(diaginv + i * 4, val + m, 4 *
sizeof(
REAL));
977 for (k = IA[i] + 1; k < IA[i + 1]; ++k) {
987 for (i = 0; i < ROW; ++i) {
991 memcpy(diaginv + i * 4, val + m, 4 *
sizeof(
REAL));
997 for (k = IA[i] + 1; k < IA[i + 1]; ++k) {
1010 stride_i = ROW / nthreads;
1011#pragma omp parallel private(myid, mybegin, myend, i, k, m, j) num_threads(nthreads)
1013 myid = omp_get_thread_num();
1014 mybegin = myid * stride_i;
1015 if (myid < nthreads - 1)
1016 myend = mybegin + stride_i;
1019 for (i = mybegin; i < myend; ++i) {
1021 for (k = IA[i]; k < IA[i + 1]; ++k) {
1024 memcpy(diaginv + i * 9, val + m, 9 *
sizeof(
REAL));
1031 for (k = IA[i]; k < IA[i + 1]; ++k) {
1044 for (i = 0; i < ROW; ++i) {
1046 for (k = IA[i]; k < IA[i + 1]; ++k) {
1049 memcpy(diaginv + i * 9, val + m, 9 *
sizeof(
REAL));
1054 printf(
"### DEBUG: row, col = %d\n", i);
1060 for (k = IA[i]; k < IA[i + 1]; ++k) {
1075 stride_i = ROW / nthreads;
1076#pragma omp parallel private(myid, mybegin, myend, i, k, m, j) num_threads(nthreads)
1078 myid = omp_get_thread_num();
1079 mybegin = myid * stride_i;
1080 if (myid < nthreads - 1)
1081 myend = mybegin + stride_i;
1084 for (i = mybegin; i < myend; ++i) {
1086 for (k = IA[i]; k < IA[i + 1]; ++k) {
1089 memcpy(diaginv + i * 25, val + m, 25 *
sizeof(
REAL));
1098 for (k = IA[i]; k < IA[i + 1]; ++k) {
1112 for (i = 0; i < ROW; ++i) {
1114 for (k = IA[i]; k < IA[i + 1]; ++k) {
1117 memcpy(diaginv + i * 25, val + m, 25 *
sizeof(
REAL));
1128 REAL aa[25], bb[25];
1129 for (k = 0; k < 25; k++) bb[k] = diaginv[i * 25 + k];
1130 for (k = 0; k < 25; k++) aa[k] = diaginv[i * 25 + k];
1132 printf(
"### DEBUG: Check inverse matrix...\n");
1133 printf(
"##----------------------------------------------\n");
1134 printf(
"## %12.5e %12.5e %12.5e %12.5e %12.5e\n",
1135 bb[0]* aa[0] + bb[1] * aa[5] + bb[2] * aa[10] + bb[3] * aa[15] + bb[4] * aa[20],
1136 bb[0]* aa[1] + bb[1] * aa[6] + bb[2] * aa[11] + bb[3] * aa[16] + bb[4] * aa[21],
1137 bb[0]* aa[2] + bb[1] * aa[7] + bb[2] * aa[12] + bb[3] * aa[17] + bb[4] * aa[22],
1138 bb[0]* aa[3] + bb[1] * aa[8] + bb[2] * aa[13] + bb[3] * aa[18] + bb[4] * aa[23],
1139 bb[0]* aa[4] + bb[1] * aa[9] + bb[3] * aa[14] + bb[3] * aa[19] + bb[4] * aa[24]);
1140 printf(
"## %12.5e %12.5e %12.5e %12.5e %12.5e\n",
1141 bb[5]* aa[0] + bb[6] * aa[5] + bb[7] * aa[10] + bb[8] * aa[15] + bb[9] * aa[20],
1142 bb[5]* aa[1] + bb[6] * aa[6] + bb[7] * aa[11] + bb[8] * aa[16] + bb[9] * aa[21],
1143 bb[5]* aa[2] + bb[6] * aa[7] + bb[7] * aa[12] + bb[8] * aa[17] + bb[9] * aa[22],
1144 bb[5]* aa[3] + bb[6] * aa[8] + bb[7] * aa[13] + bb[8] * aa[18] + bb[9] * aa[23],
1145 bb[5]* aa[4] + bb[6] * aa[9] + bb[7] * aa[14] + bb[8] * aa[19] + bb[9] * aa[24]);
1146 printf(
"## %12.5e %12.5e %12.5e %12.5e %12.5e\n",
1147 bb[10]* aa[0] + bb[11] * aa[5] + bb[12] * aa[10] + bb[13] * aa[15] + bb[14] * aa[20],
1148 bb[10]* aa[1] + bb[11] * aa[6] + bb[12] * aa[11] + bb[13] * aa[16] + bb[14] * aa[21],
1149 bb[10]* aa[2] + bb[11] * aa[7] + bb[12] * aa[12] + bb[13] * aa[17] + bb[14] * aa[22],
1150 bb[10]* aa[3] + bb[11] * aa[8] + bb[12] * aa[13] + bb[13] * aa[18] + bb[14] * aa[23],
1151 bb[10]* aa[4] + bb[11] * aa[9] + bb[12] * aa[14] + bb[13] * aa[19] + bb[14] * aa[24]);
1152 printf(
"## %12.5e %12.5e %12.5e %12.5e %12.5e\n",
1153 bb[15]* aa[0] + bb[16] * aa[5] + bb[17] * aa[10] + bb[18] * aa[15] + bb[19] * aa[20],
1154 bb[15]* aa[1] + bb[16] * aa[6] + bb[17] * aa[11] + bb[18] * aa[16] + bb[19] * aa[21],
1155 bb[15]* aa[2] + bb[16] * aa[7] + bb[17] * aa[12] + bb[18] * aa[17] + bb[19] * aa[22],
1156 bb[15]* aa[3] + bb[16] * aa[8] + bb[17] * aa[13] + bb[18] * aa[18] + bb[19] * aa[23],
1157 bb[15]* aa[4] + bb[16] * aa[9] + bb[17] * aa[14] + bb[18] * aa[19] + bb[19] * aa[24]);
1158 printf(
"## %12.5e %12.5e %12.5e %12.5e %12.5e\n",
1159 bb[20]* aa[0] + bb[21] * aa[5] + bb[22] * aa[10] + bb[23] * aa[15] + bb[24] * aa[20],
1160 bb[20]* aa[1] + bb[21] * aa[6] + bb[22] * aa[11] + bb[23] * aa[16] + bb[24] * aa[21],
1161 bb[20]* aa[2] + bb[21] * aa[7] + bb[22] * aa[12] + bb[23] * aa[17] + bb[24] * aa[22],
1162 bb[20]* aa[3] + bb[21] * aa[8] + bb[22] * aa[13] + bb[23] * aa[18] + bb[24] * aa[23],
1163 bb[20]* aa[4] + bb[21] * aa[9] + bb[22] * aa[14] + bb[23] * aa[19] + bb[24] * aa[24]);
1164 printf(
"##----------------------------------------------\n");
1168 for (k = IA[i]; k < IA[i + 1]; ++k) {
1183 stride_i = ROW / nthreads;
1184#pragma omp parallel private(myid, mybegin, myend, i, k, m, j) num_threads(nthreads)
1186 myid = omp_get_thread_num();
1187 mybegin = myid * stride_i;
1188 if (myid < nthreads - 1)
1189 myend = mybegin + stride_i;
1192 for (i = mybegin; i < myend; ++i) {
1194 for (k = IA[i]; k < IA[i + 1]; ++k) {
1197 memcpy(diaginv + i * 49, val + m, 49 *
sizeof(
REAL));
1206 for (k = IA[i]; k < IA[i + 1]; ++k) {
1219 for (i = 0; i < ROW; ++i) {
1221 for (k = IA[i]; k < IA[i + 1]; ++k) {
1224 memcpy(diaginv + i * 49, val + m, 49 *
sizeof(
REAL));
1235 for (k = IA[i]; k < IA[i + 1]; ++k) {
1250 stride_i = ROW / nthreads;
1251#pragma omp parallel private(myid, mybegin, myend, i, k, m, j) num_threads(nthreads)
1253 myid = omp_get_thread_num();
1254 mybegin = myid * stride_i;
1255 if (myid < nthreads - 1)
1256 myend = mybegin + stride_i;
1259 for (i = mybegin; i < myend; ++i) {
1261 for (k = IA[i]; k < IA[i + 1]; ++k) {
1264 memcpy(diaginv + i * nb2, val + m, nb2 *
sizeof(
REAL));
1273 for (k = IA[i]; k < IA[i + 1]; ++k) {
1284 for (i = 0; i < ROW; ++i) {
1286 for (k = IA[i]; k < IA[i + 1]; ++k) {
1289 memcpy(diaginv + i * nb2, val + m, nb2 *
sizeof(
REAL));
1300 for (k = IA[i]; k < IA[i + 1]; ++k) {
1341 const INT nb = A->
nb;
1342 const INT nb2 = nb * nb;
1343 const INT* IA = A->
IA;
1344 const INT* JA = A->
JA;
1357 INT myid, mybegin, myend;
1362 nthreads = fasp_get_num_threads();
1374 memcpy(IAb, IA, (ROW + 1) *
sizeof(
INT));
1379 memcpy(JAb, JA, NNZ *
sizeof(
INT));
1388#pragma omp parallel for private(myid, mybegin, myend, i, ibegin, iend, m, k)
1390 for (myid = 0; myid < nthreads; myid++) {
1392 for (i = mybegin; i < myend; ++i) {
1398 memcpy(diaginv + i * 4, val + m, 4 *
sizeof(
REAL));
1405 for (k = ibegin + 1; k < iend; ++k) {
1412 for (i = 0; i < ROW; ++i) {
1417 memcpy(diaginv + i * 4, val + m, 4 *
sizeof(
REAL));
1424 for (k = ibegin + 1; k < iend; ++k) {
1436#pragma omp parallel for private(myid, mybegin, myend, i, ibegin, iend, m, k)
1438 for (myid = 0; myid < nthreads; myid++) {
1440 for (i = mybegin; i < myend; ++i) {
1446 memcpy(diaginv + i * 9, val + m, 9 *
sizeof(
REAL));
1451 for (k = ibegin + 1; k < iend; ++k) {
1458 for (i = 0; i < ROW; ++i) {
1463 memcpy(diaginv + i * 9, val + m, 9 *
sizeof(
REAL));
1470 for (k = ibegin + 1; k < iend; ++k) {
1482#pragma omp parallel for private(myid, mybegin, myend, i, ibegin, iend, m, k)
1484 for (myid = 0; myid < nthreads; myid++) {
1486 for (i = mybegin; i < myend; ++i) {
1491 memcpy(diaginv + i * 25, val + m, 25 *
sizeof(
REAL));
1498 for (k = ibegin + 1; k < iend; ++k) {
1505 for (i = 0; i < ROW; ++i) {
1510 memcpy(diaginv + i * 25, val + m, 25 *
sizeof(
REAL));
1517 for (k = ibegin + 1; k < iend; ++k) {
1528#pragma omp parallel for private(myid, i, mybegin, myend, ibegin, iend, m, k)
1530 for (myid = 0; myid < nthreads; myid++) {
1532 for (i = mybegin; i < myend; ++i) {
1537 memcpy(diaginv + i * 49, val + m, 49 *
sizeof(
REAL));
1544 for (k = ibegin + 1; k < iend; ++k) {
1551 for (i = 0; i < ROW; ++i) {
1556 memcpy(diaginv + i * 49, val + m, 49 *
sizeof(
REAL));
1563 for (k = ibegin + 1; k < iend; ++k) {
1575#pragma omp parallel for private(myid, mybegin, myend, i, ibegin, iend, m, k)
1577 for (myid = 0; myid < nthreads; myid++) {
1579 for (i = mybegin; i < myend; ++i) {
1584 memcpy(diaginv + i * nb2, val + m, nb2 *
sizeof(
REAL));
1591 for (k = ibegin + 1; k < iend; ++k) {
1599 for (i = 0; i < ROW; ++i) {
1604 memcpy(diaginv + i * nb2, val + m, nb2 *
sizeof(
REAL));
1611 for (k = ibegin + 1; k < iend; ++k) {
1643 const INT nb2 = A->
nb * A->
nb;
1650#pragma omp parallel for private(i, k) if (n > OPENMP_HOLDS)
1652 for (i = 0; i < n; ++i) {
1653 for (k = A->
IA[i]; k < A->IA[i + 1]; ++k) {
1654 if (A->
JA[k] == i) {
1655 memcpy(diag + i * nb2, A->
val + k * nb2, nb2 *
sizeof(
REAL));
1682 const INT ROW_plus_one = ROW + 1;
1685 const INT nb = A->
nb;
1687 const INT* IA = A->
IA;
1688 const INT* JA = A->
JA;
1716 for (i = 0; i < ROW; i++) {
1718 for (j = IA[i]; j < IA[i + 1]; j++) {
1722 temp[0] = val[j * nb2];
1723 temp[1] = val[j * nb2 + 1];
1724 temp[2] = val[j * nb2 + 2];
1725 temp[3] = val[j * nb2 + 3];
1729 DL[i * nb2 + 1] = 0.0;
1730 DL[i * nb2 + 2] = -temp[2] / temp[0];
1731 DL[i * nb2 + 3] = 1.0;
1737 DU[i * nb2 + 1] = -temp[1] / temp[0];
1738 DU[i * nb2 + 2] = 0.0;
1739 DU[i * nb2 + 3] = 1.0;
1753 for (i = 0; i < ROW; i++) {
1755 for (j = IA[i]; j < IA[i + 1]; j++) {
1759 temp[0] = val[j * nb2];
1760 temp[1] = val[j * nb2 + 1];
1761 temp[2] = val[j * nb2 + 2];
1762 temp[3] = val[j * nb2 + 3];
1763 temp[4] = val[j * nb2 + 4];
1764 temp[5] = val[j * nb2 + 5];
1765 temp[6] = val[j * nb2 + 6];
1766 temp[7] = val[j * nb2 + 7];
1767 temp[8] = val[j * nb2 + 8];
1770 REAL s22 = temp[4] - ((temp[1] * temp[3]) / temp[0]);
1771 REAL s23 = temp[5] - ((temp[2] * temp[3]) / temp[0]);
1772 REAL s32 = temp[7] - ((temp[1] * temp[6]) / temp[0]);
1776 DL[i * nb2 + 1] = 0.0;
1777 DL[i * nb2 + 2] = 0.0;
1778 DL[i * nb2 + 3] = -temp[3] / temp[0];
1779 DL[i * nb2 + 4] = 1.0;
1780 DL[i * nb2 + 5] = 0.0;
1782 -temp[6] / temp[0] + (temp[3] / temp[0]) * (s32 / s22);
1783 DL[i * nb2 + 7] = -s32 / s22;
1784 DL[i * nb2 + 8] = 1.0;
1788 DU[i * nb2 + 1] = -temp[1] / temp[0];
1790 -temp[2] / temp[0] + (temp[1] / temp[0]) * (s23 / s22);
1791 DU[i * nb2 + 3] = 0.0;
1792 DU[i * nb2 + 4] = 1.0;
1793 DU[i * nb2 + 5] = -s23 / s22;
1794 DU[i * nb2 + 6] = 0.0;
1795 DU[i * nb2 + 7] = 0.0;
1796 DU[i * nb2 + 8] = 1.0;
1809 printf(
"### ERROR: Only works for nb = 2, 3! [%s]\n", __FUNCTION__);
1819 for (i = 0; i < ROW; i++) {
1821 for (j = IA[i]; j < IA[i + 1]; j++) {
1834 valb[j * nb2 + 1] = 0.0;
1835 valb[j * nb2 + 2] = 0.0;
1847 for (i = 0; i < ROW; i++) {
1849 for (j = IA[i]; j < IA[i + 1]; j++) {
1862 valb[j * nb2 + 1] = 0.0;
1863 valb[j * nb2 + 2] = 0.0;
1864 valb[j * nb2 + 3] = 0.0;
1865 valb[j * nb2 + 5] = 0.0;
1866 valb[j * nb2 + 6] = 0.0;
1867 valb[j * nb2 + 7] = 0.0;
1882 printf(
"### ERROR: Only works for nb = 2, 3! [%s]\n", __FUNCTION__);
1910 INT ROW_plus_one = ROW + 1;
1925 REAL sqt3, sqt4, sqt8;
1942 for (i = 0; i < ROW; i++) {
1943 for (j = IA[i]; j < IA[i + 1]; j++) {
1945 REAL temp0 = val[j * nb2];
1946 REAL temp1 = val[j * nb2 + 1];
1947 REAL temp2 = val[j * nb2 + 2];
1948 REAL temp3 = val[j * nb2 + 3];
1952 sqt3 = sqrt(
ABS(temp3));
1956 DL[i * nb2 + 1] = 0.0;
1957 DL[i * nb2 + 2] = -temp2 / temp0 / sqt3;
1958 DL[i * nb2 + 3] = 1.0 / sqt3;
1962 DU[i * nb2 + 1] = -temp1 / temp0 / sqt3;
1963 DU[i * nb2 + 2] = 0.0;
1964 DU[i * nb2 + 3] = 1.0 / sqt3;
1973 for (i = 0; i < ROW; i++) {
1974 for (j = IA[i]; j < IA[i + 1]; j++) {
1976 REAL temp0 = val[j * nb2];
1977 REAL temp1 = val[j * nb2 + 1];
1978 REAL temp2 = val[j * nb2 + 2];
1979 REAL temp3 = val[j * nb2 + 3];
1980 REAL temp4 = val[j * nb2 + 4];
1981 REAL temp5 = val[j * nb2 + 5];
1982 REAL temp6 = val[j * nb2 + 6];
1983 REAL temp7 = val[j * nb2 + 7];
1984 REAL temp8 = val[j * nb2 + 8];
1989 sqt4 = sqrt(
ABS(temp4));
1990 sqt8 = sqrt(
ABS(temp8));
1993 REAL s22 = temp4 - ((temp1 * temp3) / temp0);
1994 REAL s23 = temp5 - ((temp2 * temp3) / temp0);
1995 REAL s32 = temp7 - ((temp1 * temp6) / temp0);
1999 DL[i * nb2 + 1] = 0.0;
2000 DL[i * nb2 + 2] = 0.0;
2001 DL[i * nb2 + 3] = -temp3 / temp0 / sqt4;
2002 DL[i * nb2 + 4] = 1.0 / sqt4;
2003 DL[i * nb2 + 5] = 0.0;
2005 (-temp6 / temp0 + (temp3 / temp0) * (s32 / s22)) / sqt8;
2006 DL[i * nb2 + 7] = -s32 / s22 / sqt8;
2007 DL[i * nb2 + 8] = 1.0 / sqt8;
2011 DU[i * nb2 + 1] = -temp1 / temp0 / sqt4;
2013 (-temp2 / temp0 + (temp1 / temp0) * (s23 / s22)) / sqt8;
2014 DU[i * nb2 + 3] = 0.0;
2015 DU[i * nb2 + 4] = 1.0 / sqt4;
2016 DU[i * nb2 + 5] = -s23 / s22 / sqt8;
2017 DU[i * nb2 + 6] = 0.0;
2018 DU[i * nb2 + 7] = 0.0;
2019 DU[i * nb2 + 8] = 1.0 / sqt8;
2029 printf(
"### ERROR: Only works for nb = 2, 3! [%s]\n", __FUNCTION__);
2038 for (i = 0; i < ROW; i++) {
2039 for (j = IA[i]; j < IA[i + 1]; j++) {
2047 valb[j * nb2 + 1] = 0.0;
2048 valb[j * nb2 + 2] = 0.0;
2058 for (i = 0; i < ROW; i++) {
2059 for (j = IA[i]; j < IA[i + 1]; j++) {
2067 valb[j * nb2 + 1] = 0.0;
2068 valb[j * nb2 + 2] = 0.0;
2069 valb[j * nb2 + 3] = 0.0;
2070 valb[j * nb2 + 5] = 0.0;
2071 valb[j * nb2 + 6] = 0.0;
2072 valb[j * nb2 + 7] = 0.0;
2083 printf(
"### ERROR: Only works for nb = 2, 3! [%s]\n", __FUNCTION__);
2109 const INT * ia = A->
IA, *ja = A->
JA;
2111 const INT nb = A->
nb, nb2 = nb * nb;
2115 INT i, j, k, jaj, i1, i2, start, jj;
2120 nthreads = fasp_get_num_threads();
2130 INT myid, mybegin, myend;
2132#pragma omp parallel for private(myid, mybegin, myend, i)
2134 for (myid = 0; myid < nthreads; ++myid) {
2136 for (i = mybegin; i < myend; ++i) Pt[P[i]] = i;
2139 for (i = 0; i < n; ++i) Pt[P[i]] = i;
2144 for (i = 0; i < n; ++i) {
2146 Aperm.
IA[i + 1] = Aperm.
IA[i] + (ia[k + 1] - ia[k]);
2151 INT myid, mybegin, myend;
2153#pragma omp parallel for private(myid, mybegin, myend, i, i1, i2, k, start, j, jaj, jj)
2155 for (myid = 0; myid < nthreads; ++myid) {
2157 for (i = mybegin; i < myend; ++i) {
2159 i2 = Aperm.
IA[i + 1] - 1;
2162 for (j = i1; j <= i2; ++j) {
2163 jaj = start + j - i1;
2164 Aperm.
JA[j] = ja[jaj];
2165 for (jj = 0; jj < nb2; ++jj)
2166 Aperm.
val[j * nb2 + jj] = Aval[jaj * nb2 + jj];
2171 for (i = 0; i < n; ++i) {
2173 i2 = Aperm.
IA[i + 1] - 1;
2176 for (j = i1; j <= i2; ++j) {
2177 jaj = start + j - i1;
2178 Aperm.
JA[j] = ja[jaj];
2179 for (jj = 0; jj < nb2; ++jj)
2180 Aperm.
val[j * nb2 + jj] = Aval[jaj * nb2 + jj];
2187 INT myid, mybegin, myend;
2189#pragma omp parallel for private(myid, mybegin, myend, k, j)
2191 for (myid = 0; myid < nthreads; ++myid) {
2193 for (k = mybegin; k < myend; ++k) {
2195 Aperm.
JA[k] = Pt[j];
2199 for (k = 0; k < nnz; ++k) {
2201 Aperm.
JA[k] = Pt[j];
2226 const INT num_rowsA = A->
ROW;
2227 const INT nb = A->
nb;
2228 const INT nb2 = nb * nb;
2233 INT i, ii, j, jj, ibegin, iend, iend1;
2237 INT myid, mybegin, myend;
2238 INT nthreads = fasp_get_num_threads();
2243#pragma omp parallel for private(myid, mybegin, myend, i, ii, j, jj, ibegin, iend, \
2245 for (myid = 0; myid < nthreads; myid++) {
2247 for (i = mybegin; i < myend; i++) {
2251 for (j = ibegin; j < iend1; j++) {
2253 for (jj = j + 1; jj < iend; jj++) {
2254 if (A_j[j] == A_j[jj]) {
2256 for (ii = 0; ii < nb2; ii++)
2257 A_data[j * nb2 + ii] += A_data[jj * nb2 + ii];
2268 for (i = 0; i < num_rowsA; i++) {
2272 for (j = ibegin; j < iend1; j++) {
2274 for (jj = j + 1; jj < iend; jj++) {
2275 if (A_j[j] == A_j[jj]) {
2277 for (ii = 0; ii < nb2; ii++)
2278 A_data[j * nb2 + ii] += A_data[jj * nb2 + ii];
2279 printf(
"### WARNING: Same col indices at %d, col %d (%d "
2295 memcpy(tempA_i, A_i, (num_rowsA + 1) *
sizeof(
INT));
2298 for (i = 0; i < num_rowsA; i++) {
2299 ibegin = tempA_i[i];
2300 iend = tempA_i[i + 1];
2301 for (j = ibegin; j < iend; j++) {
2303 memcpy(A_data + jj * nb2, A_data + j * nb2, (nb2) *
sizeof(
REAL));
2314 printf(
"### WARNING: %d col indices have been merged!\n", count);
2337void dBSRmat_Multicoloring_Theta(
2340 INT k, i, j, pre, group;
2341 INT igold, iend, iavg;
2345 INT *IC, *ICMAP, color;
2349 if (theta > 0 && theta < 1.0) {
2350 generate_S_theta_bsr(A, &S, theta);
2353 }
else if (theta == 1.0) {
2355 IC = (
INT*)malloc(
sizeof(
INT) * 2);
2356 ICMAP = (
INT*)malloc(
sizeof(
INT) * (n + 1));
2361#pragma omp parallel for private(k)
2363 for (k = 0; k < n; k++) ICMAP[k] = k;
2368 printf(
"%s Theta = %lf \n", __FUNCTION__, theta);
2377 INT* cq = (
INT*)malloc(
sizeof(
INT) * (n + 1));
2378 INT* newr = (
INT*)malloc(
sizeof(
INT) * (n + 1));
2381#pragma omp parallel for private(k)
2383 for (k = 0; k < n; k++) {
2387 for (k = 0; k < n; k++) {
2388 if ((A->
IA[k + 1] - A->
IA[k]) > group) group = A->
IA[k + 1] - A->
IA[k];
2394 igold = (
INT)
MAX(iavg,group*0.618) +1;
2398 IC = (
INT*)malloc(
sizeof(
INT) * (group + 2));
2399 ICMAP = (
INT*)malloc(
sizeof(
INT) * (n + 1));
2404 memset(newr, -1,
sizeof(
INT) * (n + 1));
2405 memset(ICMAP, 0,
sizeof(
INT) * n);
2415 if (front == n) front = 0;
2424 if ((IA[i+1]-IA[i]) > igold)
2425 iend =
MIN(IA[i+1], (IA[i] + igold));
2429 for (j = IA[i]; j < iend; j++) newr[JA[j]] = group;
2430 }
else if (newr[i] == group) {
2433 if (rear == n) rear = 0;
2439 if ((IA[i+1] - IA[i]) > igold) iend =
MIN(IA[i+1], (IA[i] + igold));
2443 for (j = IA[i]; j < iend; j++) newr[JA[j]] = group;
2448 }
while (rear != front);
2455 for (i = 0; i < color; i++) {
2456 for (j = IC[i]; j < IC[i + 1]; j++)
2457 printf(
"color %d ICMAP[%d] = %d \n", i, j, ICMAP[j]);
2458 printf(
"color = %d row= %d %d\n", color, n, IC[i + 1] - IC[i]);
2484static double dense_matrix_max_internal(
double* A,
int m,
int n)
2486 double norm = 0.0, sum;
2488 for (i = 0; i < m; i++) {
2489 for (j = 0; j < n; j++) {
2490 norm =
MAX(norm, fabs(A[i * m + j]));
2499 const INT row_plus_one = row + 1;
2500 const INT nnz = A->
IA[row] - A->
IA[0];
2501 const INT nb = A->
nb, nb2 = nb * nb;
2503 INT index, i, j, begin_row, end_row;
2504 INT * ia = A->
IA, *ja = A->
JA;
2527#pragma omp parallel for private(i, j, begin_row, end_row, row_abs_sum)
2529 for (i = 0; i < row; ++i) {
2533 end_row = ia[i + 1];
2534 for (j = begin_row; j < end_row; j++) {
2536 row_abs_sum += dense_matrix_max_internal(aj + j * nb2, nb, nb);
2538 row_abs_sum = row_abs_sum * theta;
2541 for (j = begin_row; j < end_row; j++) {
2543 if ((row_abs_sum >= dense_matrix_max_internal(aj + j * nb2, nb, nb)) &&
2552 for (i = 0; i < row; ++i) {
2555 end_row = ia[i + 1] - 1;
2556 for (j = begin_row; j <= end_row; j++) {
2557 if (S->
JA[j] > -1) {
2558 S->
JA[index] = S->
JA[j];
void fasp_iarray_set(const INT n, INT *x, const INT val)
Set initial value for an array to be x=val.
void fasp_iarray_cp(const INT n, const INT *x, INT *y)
Copy an array to the other y=x.
void fasp_mem_free(void *mem)
Free up previous allocated memory body and set pointer to NULL.
void * fasp_mem_realloc(void *oldmem, const LONGLONG tsize)
Reallocate, initiate, and check memory.
void * fasp_mem_calloc(const unsigned int size, const unsigned int type)
Allocate, initiate, and check memory.
void fasp_get_start_end(const INT procid, const INT nprocs, const INT n, INT *start, INT *end)
Assign Load to each thread.
void fasp_smat_inv_nc3(REAL *a)
Compute the inverse matrix of a 3*3 full matrix A (in place)
void fasp_smat_identity_nc7(REAL *a)
Set a 7*7 full matrix to be a identity.
void fasp_smat_identity_nc3(REAL *a)
Set a 3*3 full matrix to be a identity.
SHORT fasp_smat_invp_nc(REAL *a, const INT n)
Compute the inverse of a matrix using Gauss Elimination with Pivoting.
void fasp_smat_inv_nc7(REAL *a)
Compute the inverse matrix of a 7*7 matrix a.
void fasp_smat_inv_nc2(REAL *a)
Compute the inverse matrix of a 2*2 full matrix A (in place)
SHORT fasp_smat_inv(REAL *a, const INT n)
Compute the inverse matrix of a small full matrix a.
void fasp_smat_identity_nc2(REAL *a)
Set a 2*2 full matrix to be a identity.
void fasp_smat_inv_nc5(REAL *a)
Compute the inverse matrix of a 5*5 full matrix A (in place)
void fasp_smat_identity_nc5(REAL *a)
Set a 5*5 full matrix to be a identity.
void fasp_smat_identity(REAL *a, const INT n, const INT n2)
Set a n*n full matrix to be a identity.
void fasp_blas_smat_mul(const REAL *a, const REAL *b, REAL *c, const INT n)
Compute the matrix product of two small full matrices a and b, stored in c.
void fasp_blas_smat_mul_nc2(const REAL *a, const REAL *b, REAL *c)
Compute the matrix product of two 2* matrices a and b, stored in c.
void fasp_blas_smat_mul_nc3(const REAL *a, const REAL *b, REAL *c)
Compute the matrix product of two 3*3 matrices a and b, stored in c.
void fasp_blas_smat_mul_nc7(const REAL *a, const REAL *b, REAL *c)
Compute the matrix product of two 7*7 matrices a and b, stored in c.
void fasp_blas_smat_mul_nc5(const REAL *a, const REAL *b, REAL *c)
Compute the matrix product of two 5*5 matrices a and b, stored in c.
dBSRmat fasp_dbsr_create(const INT ROW, const INT COL, const INT NNZ, const INT nb, const INT storage_manner)
Create BSR sparse matrix data memory space.
dBSRmat fasp_dbsr_diaginv(const dBSRmat *A)
Compute B := D^{-1}*A, where 'D' is the block diagonal part of A.
INT fasp_dbsr_trans(const dBSRmat *A, dBSRmat *AT)
Find A^T from given dBSRmat matrix A.
SHORT fasp_dbsr_getblk(const dBSRmat *A, const INT *Is, const INT *Js, const INT m, const INT n, dBSRmat *B)
Get a sub BSR matrix of A with specified rows and columns.
void fasp_dbsr_getdiag(INT n, const dBSRmat *A, REAL *diag)
Abstract the diagonal blocks of a BSR matrix.
dBSRmat fasp_dbsr_diagLU(const dBSRmat *A, REAL *DL, REAL *DU)
Compute B := DL*A*DU. We decompose each diagonal block of A into LDU form and DL = diag(L^{-1}) and D...
void fasp_dbsr_cp(const dBSRmat *A, dBSRmat *B)
copy a dCSRmat to a new one B=A
dBSRmat fasp_dbsr_diagLU2(dBSRmat *A, REAL *DL, REAL *DU)
Compute B := DL*A*DU. We decompose each diagonal block of A into LDU form and DL = diag(L^{-1}) and D...
dBSRmat fasp_dbsr_perm(const dBSRmat *A, const INT *P)
Apply permutation of A, i.e. Aperm=PAP' by the orders given in P.
void fasp_dbsr_alloc(const INT ROW, const INT COL, const INT NNZ, const INT nb, const INT storage_manner, dBSRmat *A)
Allocate memory space for BSR format sparse matrix.
void fasp_dbsr_free(dBSRmat *A)
Free memory space for BSR format sparse matrix.
SHORT fasp_dbsr_compress_inplace(dBSRmat *A, const REAL dtol)
Compress a BSR matrix A IN PLACE by dropping small entries max(|Aij|) <= dtol.
dBSRmat fasp_dbsr_diaginv2(const dBSRmat *A, REAL *diaginv)
Compute B := D^{-1}*A, where 'D' is the block diagonal part of A.
SHORT fasp_dbsr_diagpref(dBSRmat *A)
Reorder the column and data arrays of a square BSR matrix, so that the first entry in each row is the...
dBSRmat fasp_dbsr_diaginv4(const dBSRmat *A, REAL *diaginv)
Compute B := D^{-1}*A, where 'D' is the block diagonal part of A.
INT fasp_dbsr_merge_col(dBSRmat *A)
Check and merge some same col index in one row.
dBSRmat fasp_dbsr_diaginv3(const dBSRmat *A, REAL *diaginv)
Compute B := D^{-1}*A, where 'D' is the block diagonal part of A.
dvector fasp_dbsr_getdiaginv(const dBSRmat *A)
Get D^{-1} of matrix A.
Main header file for the FASP project.
#define SHORT
FASP integer and floating point numbers.
#define MAX(a, b)
Definition of max, min, abs.
#define FASP_SUCCESS
Definition of return status and error messages.
#define TRUE
Definition of logic type.
Block sparse row storage matrix of REAL type.
INT COL
number of cols of sub-blocks in matrix A, N
INT NNZ
number of nonzero sub-blocks in matrix A, NNZ
INT nb
dimension of each sub-block
INT * IA
integer array of row pointers, the size is ROW+1
INT ROW
number of rows of sub-blocks in matrix A, M
INT storage_manner
storage manner for each sub-block
Vector with n entries of REAL type.
REAL * val
actual vector entries
Sparse matrix of INT type in CSR format.
INT col
column of matrix A, n
INT row
row number of matrix A, m
INT * IA
integer array of row pointers, the size is m+1
INT nnz
number of nonzero entries
INT * JA
integer array of column indexes, the size is nnz
INT * val
nonzero entries of A