This submission is migrated from previous version of oj.uz, which used different machine for grading. This submission may have different result if resubmitted.
#include <bits/stdc++.h>
using namespace std;
using ll = long long;
//O(min(H, W) * H * W)
//I also came up with O((H * W)^2) solution, but didn't know how to optimize it to O(min(H, W) * H * W)
//so now this is Radewoosh's solution
int main() {
    ios::sync_with_stdio(false);
    cin.tie(nullptr);
    int n, m;
    cin >> n >> m;
    bool swapped = false;
    if (n > m) {
        swapped = true;
        swap(n, m);
    }
    vector<vector<int>> a(n, vector<int>(m));
    vector<int> yy;
    for (int i = 0; i < (swapped ? m : n); ++i) {
        for (int j = 0; j < (swapped ? n : m); ++j) {
            if (!swapped) {
                cin >> a[i][j];
                yy.push_back(a[i][j]);
            } else {
                cin >> a[j][i];
                yy.push_back(a[j][i]);
            }
        }
    }
    sort(yy.begin(), yy.end());
    yy.resize(unique(yy.begin(), yy.end()) - yy.begin());
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < m; ++j) {
            a[i][j] = lower_bound(yy.begin(), yy.end(), a[i][j]) - yy.begin() + 1;
        }
    }
    const int dx[4] = {-1, 0, 1, 0};
    const int dy[4] = {0, 1, 0, -1};
    //  0
    // 3 1
    //  2
    auto solveM = [&](int k) {
        int ans = 0;
        for (int i = 0; i < m;) {
            int j = i + 1;
            while (j < m && a[k][j] < a[k][j - 1]) {
                j += 1;
            }
            int len = j - i;
            ans += len * ((ll) len - 1) / 2;
            i = j;
        }
        for (int i = m - 1; i > -1;) {
            int j = i - 1;
            while (j > -1 && a[k][j] < a[k][j + 1]) {
                j -= 1;
            }
            int len = i - j;
            ans += len * ((ll) len - 1) / 2;
            i = j;
        }
        return ans;
    };
    auto solveN = [&](int k) {
        int ans = 0;
        for (int i = 0; i < n;) {
            int j = i + 1;
            while (j < n && a[j][k] < a[j - 1][k]) {
                j += 1;
            }
            int len = j - i;
            ans += len * ((ll) len - 1) / 2;
            i = j;
        }
        for (int i = n - 1; i > -1;) {
            int j = i - 1;
            while (j > -1 && a[j][k] < a[j + 1][k]) {
                j -= 1;
            }
            int len = i - j;
            ans += len * ((ll) len - 1) / 2;
            i = j;
        }
        return ans;
    };
    int ans = 0;
    if (n == 1) {
        ans += solveM(0) + m;
    } else {
        int diff[16][n][m];
        for (int i = 0; i < n; ++i) {
            for (int j = 0; j < m; ++j) {
                for (int mask = 0; mask < 16; ++mask) {
                    int lower = 0;
                    bool big = false;
                    for (int k = 0; k < 4; ++k) {
                        if (mask >> k & 1) {
                            int nx = i + dx[k];
                            int ny = j + dy[k];
                            if (nx >= 0 && nx < n && ny >= 0 && ny < m) {
                                if (a[nx][ny] < a[i][j]) {
                                    lower = max(lower, a[nx][ny]);
                                } else {
                                    big = true;
                                }
                            }
                        }
                    }
                    diff[mask][i][j] = a[i][j] - lower + (big ? 0 : n * m + 1 - a[i][j]);
                }
            }
        }
        //now handle rectangles with width or length = 1
        for (int i = 0; i < n; ++i) {
            ans += solveM(i);
        }
        for (int i = 0; i < m; ++i) {
            ans += solveN(i);
        }
        ans += n * m;
        vector<vector<int>> mid(n, vector<int>(m)), L(n, vector<int>(m)), R(n, vector<int>(m));
        for (int y = 0; y < m; ++y) {
            for (int x = 0; x < n; ++x) {
                mid[x][y] = (x ? mid[x - 1][y] : 0) + diff[15][x][y];
                L[x][y] = (x ? L[x - 1][y] : 0) + diff[7][x][y];
                R[x][y] = (x ? R[x - 1][y] : 0) + diff[13][x][y];
            }
        }
        //main part!
        for (int x1 = 0; x1 < n; ++x1) {
            for (int x2 = x1 + 1; x2 < n; ++x2) {
                vector<int> valMid(m), valR(m), valL(m);
                for (int y = 0; y < m; ++y) {
                    valMid[y] = mid[x2 - 1][y] - mid[x1][y] + diff[14][x1][y] + diff[11][x2][y];
                    valR[y] = R[x2 - 1][y] - R[x1][y] + diff[12][x1][y] + diff[9][x2][y];
                    valL[y] = L[x2 - 1][y] - L[x1][y] + diff[6][x1][y] + diff[3][x2][y];
                }
                unordered_map<int, int> cnt;
                int prefix = 0;
                for (int y1 = 0; y1 < m; ++y1) {
                    if (y1 > 0) {
                        prefix += valMid[y1 - 1];
                        ++cnt[prefix - valL[y1 - 1]];
                    }
                    ans += cnt[prefix + valR[y1] - (n * m + 1)];
                }
            }
        }
    }
    cout << ans;
    return 0;
}
| # | Verdict | Execution time | Memory | Grader output | 
|---|
| Fetching results... | 
| # | Verdict | Execution time | Memory | Grader output | 
|---|
| Fetching results... | 
| # | Verdict | Execution time | Memory | Grader output | 
|---|
| Fetching results... | 
| # | Verdict | Execution time | Memory | Grader output | 
|---|
| Fetching results... | 
| # | Verdict | Execution time | Memory | Grader output | 
|---|
| Fetching results... |