This submission is migrated from previous version of oj.uz, which used different machine for grading. This submission may have different result if resubmitted.
#include<bits/stdc++.h>
using namespace std;
#pragma GCC target ("avx2")
#pragma GCC optimization ("O3")
#pragma GCC optimization ("unroll-loops")
int h,w;
vector< vector<int> > v;
vector< vector<int> > psum[16];
vector< pair< int, pair<int,int> > > lst; 
int mxv;
int middle[500005], first[500005], last[500005];
map<int, int> mp;
int dx[4] = {0, 0, 1, -1};
int dy[4] = {1, -1, 0, 0};
bool ok(int x, int y) {
	return (x>=1&&x<=h&&y>=1&&y<=w);
}
int find(int id, int X1, int X2, int Y1, int Y2) {
	if(X1>X2 || Y1>Y2) return 0;
	return psum[id][X2][Y2] - psum[id][X1-1][Y2] - psum[id][X2][Y1-1] + psum[id][X1-1][Y1-1];
}
int main() {
	cin >> h >> w;
	if(h>w) {
		swap(h,w);
		v.resize(h+1);
		for(int i=0; i<=h; i++) v[i].resize(w+1);
		for(int i=0; i<16; i++) {
			psum[i].resize(h+1);
			for(int j=0; j<=h; j++) psum[i][j].resize(w+1);
		}
		for(int i=1; i<=w; i++) for(int j=1; j<=h; j++) {
			cin >> v[j][i];
			lst.push_back({v[j][i], {j, i} });
		}
	} else {
		v.resize(h+1);
		for(int i=0; i<=h; i++) v[i].resize(w+1);
		for(int i=0; i<16; i++) {
			psum[i].resize(h+1);
			for(int j=0; j<=h; j++) psum[i][j].resize(w+1);
		}
		for(int i=1; i<=h; i++) for(int j=1; j<=w; j++) {
			cin >> v[i][j];
			lst.push_back({v[i][j], {i, j} });
		}
	}
	sort(lst.begin(), lst.end());
	for(int i=0; i<lst.size(); i++) {
		v[lst[i].second.first][lst[i].second.second] = i+1;
	}
	for(int i=1; i<=h; i++) {
		for(int j=1; j<=w; j++) {
			for(int k=0; k<16; k++) {
				int lower = 0, mx = 1;
				for(int l=0; l<4; l++) {
					if((k>>l)%2) {
						if(ok(i+dx[l], j+dy[l]) && v[i+dx[l]][j+dy[l]]<v[i][j]) {
							lower = max(lower, v[i+dx[l]][j+dy[l]]);
						} else mx=0;
					}
				}
				if(mx == 1) psum[k][i][j] = h*w+1;
				else psum[k][i][j] = v[i][j];
				psum[k][i][j] -= lower;
			}
		}
	}
	for(int k=0; k<16; k++) {
		for(int i=1; i<=h; i++) {
			for(int j=1; j<=w; j++) {
				psum[k][i][j] += psum[k][i][j-1] + psum[k][i-1][j] - psum[k][i-1][j-1];
			}
		}
	}
	long long ans = 0;
	for(int X1=1; X1<=h; X1++) {
		for(int X2=X1+1; X2<=h; X2++) {
			for(int Y1=1; Y1<=w; Y1++) {
				first[Y1] = find(5, X1, X1, Y1, Y1) + find(13, X1+1, X2-1, Y1, Y1) + find(9, X2, X2, Y1, Y1);
				middle[Y1] = find(7, X1, X1, Y1, Y1) + find(15, X1+1, X2-1, Y1, Y1) + find(11, X2, X2, Y1, Y1);
				middle[Y1] += middle[Y1-1];
				last[Y1] = find(6, X1, X1, Y1, Y1) + find(14, X1+1, X2-1, Y1, Y1) + find(10, X2, X2, Y1, Y1);
			}
			mp.clear();
			for(int Y1=w; Y1>=1; Y1--) {
				ans += mp[h*w+1-first[Y1]+middle[Y1]];
				mp[middle[Y1-1] + last[Y1]]++;
			}
		}
	}
	for(int j=1; j<=h; j++) {
		bool rel; int prev = 1;
		for(int i=2; i<=w; i++) {
			bool curr;
			if(v[j][i] > v[j][i-1]) {
				curr = false;
			} else {
				curr = true;
			}
			if(i==2) rel = curr;
			if(curr != rel) {
				long long size = i-1-max(1, prev-1) + 1;
				ans += size*(size+1)/2 - 1;
				rel = curr;
				prev = i;
			}
		}
		long long size = w-max(1, prev-1)+1;
		ans += size*(size+1)/2;
	}
	for(int j=1; j<=w; j++) {
		bool rel; int prev = 1;
		for(int i=2; i<=h; i++) {
			bool curr;
			if(v[i][j] > v[i-1][j]) {
				curr = false;
			} else {
				curr = true;
			}
			if(i==2) rel = curr;
			if(curr != rel) {
				long long size = i-1-max(1, prev-1) + 1;
				ans += size*(size+1)/2 - 1;
				rel = curr;
				prev = i;
			}
		}
		long long size = h-max(1, prev-1)+1;
		ans += size*(size+1)/2;
	}
	cout << ans-h*w << "\n"; 
}
Compilation message (stderr)
Main.cpp:5: warning: ignoring '#pragma GCC optimization' [-Wunknown-pragmas]
    5 | #pragma GCC optimization ("O3")
      | 
Main.cpp:6: warning: ignoring '#pragma GCC optimization' [-Wunknown-pragmas]
    6 | #pragma GCC optimization ("unroll-loops")
      | 
Main.cpp: In function 'int main()':
Main.cpp:55:16: warning: comparison of integer expressions of different signedness: 'int' and 'std::vector<std::pair<int, std::pair<int, int> > >::size_type' {aka 'long unsigned int'} [-Wsign-compare]
   55 |  for(int i=0; i<lst.size(); i++) {
      |               ~^~~~~~~~~~~| # | Verdict | Execution time | Memory | Grader output | 
|---|
| Fetching results... | 
| # | Verdict | Execution time | Memory | Grader output | 
|---|
| Fetching results... | 
| # | Verdict | Execution time | Memory | Grader output | 
|---|
| Fetching results... | 
| # | Verdict | Execution time | Memory | Grader output | 
|---|
| Fetching results... | 
| # | Verdict | Execution time | Memory | Grader output | 
|---|
| Fetching results... |