This submission is migrated from previous version of oj.uz, which used different machine for grading. This submission may have different result if resubmitted.
#include <iostream>
#include <algorithm>
#include <utility>
#include <vector>
#include <stack>
#include <map>
#include <queue>
#include <set>
#include <unordered_set>
#include <unordered_map>
#include <cstring>
#include <cmath>
#include <functional>
#include <cassert>
#include <iomanip>
#include <numeric>
#include <bitset>
#include <sstream>
#include <chrono>
#include <random>
#define ff first
#define ss second
#define PB push_back
#define sz(x) int(x.size())
#define rsz resize
#define fch(xxx, yyy) for (auto xxx : yyy) // abusive notation
#define all(x) (x).begin(),(x).end()
#define eps 1e-9
// more abusive notation (use at your own risk):
// #define int ll
using namespace std;
typedef long long ll;
typedef long double ld;
typedef unsigned long long ull;
using pii = pair<int, int>;
using pll = pair<ll, ll>;
using vi = vector<int>;
using vll = vector<ll>;
// debugging
void __print(int x) {std::cerr << x;}
void __print(ll x) {std::cerr << x;} /* remember to uncomment this when not using THE MACRO */
void __print(unsigned x) {std::cerr << x;}
void __print(ull x) {std::cerr << x;}
void __print(float x) {std::cerr << x;}
void __print(double x) {std::cerr << x;}
void __print(ld x) {std::cerr << x;}
void __print(char x) {std::cerr << '\'' << x << '\'';}
void __print(const char *x) {std::cerr << '\"' << x << '\"';}
void __print(const string& x) {std::cerr << '\"' << x << '\"';}
void __print(bool x) {cerr << (x ? "true" : "false");}
template<typename T, typename V> void __print(const pair<T, V> &x) {std::cerr << '{'; __print(x.ff); std::cerr << ", "; __print(x.ss); std::cerr << '}';}
template<typename T> void __print(const T& x) {int f = 0; std::cerr << '{'; for (auto &i: x) std::cerr << (f++ ? ", " : ""), __print(i); std::cerr << "}";}
void _print() {std::cerr << "]\n";}
template <typename T, typename... V> void _print(T t, V... v) {__print(t); if (sizeof...(v)) std::cerr << ", "; _print(v...);}
void println() {std::cerr << ">--------------------<" << endl;}
#ifndef ONLINE_JUDGE
#define debug(x...) cerr << "[" << #x << "] = ["; _print(x)
#else
#define debug(x...)
#endif
// templates
template <class T> bool ckmin(T &a, const T &b) {return b<a ? a = b, 1 : 0;}
template <class T> bool ckmax(T &a, const T &b) {return b>a ? a = b, 1 : 0;}
template <class T> using gr = greater<T>;
template <class T> using vc = vector<T>;
template <class T> using p_q = priority_queue<T>;
template <class T> using pqg = priority_queue<T, vc<T>, gr<T>>;
template <class T1, class T2> using pr = pair<T1, T2>;
mt19937_64 rng_ll(chrono::steady_clock::now().time_since_epoch().count());
int rng(int M) {return (int)(rng_ll()%M);} /*returns any random number in [0, M) */
// const variables
constexpr int INF = (int)2e9;
constexpr int MOD = 998244353;
constexpr long double EPS = (ld)1e-10, PI = 3.1415926535;
constexpr ll LL_INF = (ll)3e18;
constexpr int mod = (int)1e9 + 7;
constexpr ll inverse = 500000004LL; // inverse of 2 modulo 1e9 + 7
void setIO(const string& str) {// fast input/output
    ios_base::sync_with_stdio(false);
    cin.tie(nullptr);
    if (str.empty()) return;
    freopen((str + ".in").c_str(), "r", stdin);
    freopen((str + ".out").c_str(), "w", stdout);
}
namespace bitm {
    // bit manipulations
    ull val_MSB(ull i) {
        i |= (i >> 1);
        i |= (i >> 2);
        i |= (i >> 4);
        i |= (i >> 8);
        i |= (i >> 16);
        return i - ((ull) i >> 1);
    }
    int get_MSB(int x) { // assuming x != 0
        if (x == 0) return -1;
        return 31 - __builtin_clz(x);
    }
    int get_MSB(ull x) { // assuming x != 0
        if (x == 0) return -1;
        return 63 - __builtin_clz(x);
    }
    string t_str(ull x) {
        string ret;
        int msb;
        if (x > (ull)2e9) msb = get_MSB(x);
        else msb = get_MSB((int)x);
        bitset<63> b(x);
        for (int i = 0; i <= msb; ++i) {
            if (b[i]) ret.push_back('1');
            else ret.push_back('0');
        }
        reverse(ret.begin(), ret.end());
        return ret;
    }
    int countbits(int x) {
        return __builtin_popcount(x);
    }
    int countbits(ull x) {
        return __builtin_popcount(x);
    }
    // Things to Remember:
    // MSB(10^9) = 30
    // MSB(10^18) = 63
    // (1 << 30) is the largest power of 2 that you can compute using <<'s
}
struct d { // d --> a struct that defines a domino
    // (x, y) is the origin point
    // dir is the direction the domino is in, relative to the center (x, y)
    int x, y, dir;
    int sum; // used to sort the dominoes by sum
    bool operator < (const d& other) const {
        return sum < other.sum;
    }
    bool operator > (const d& other) const {
        return sum > other.sum;
    }
};
const int MAXB = 25; // maximum value of X/3
const int MAXX = 55; // maximum value of X
const int MAXN = int(2000) + 5; // max value of N
int N, K, X; // X is the smallest number to define a solution
// [0, floor(X/3)) --> left half (DP)
// [floor(X/3), X] --> right half
int grid[MAXN][MAXN]; // the grid values
int dp[(1 << MAXB)]; // the DP on bits we are constructing
ll adj[MAXB]; // if they are adjacent, they can be placed together without interference (must belong to the same half being considered)
ll cross[MAXX]; // the pairs that can be placed together between the left and right halves
ll ANS = 0, SUM = 0;
vc<d> s;
int check(d a, d b) { // whether or not dominoes a and b can coexist without overlapping with each other
    vi aa = {a.x * N + a.y, (a.dir == 1 ? a.x + 1 : a.x) * N + (a.dir == 0 ? a.y + 1 : a.y)};
    vi bb = {b.x * N + b.y, (b.dir == 1 ? b.x + 1 : b.x) * N + (b.dir == 0 ? b.y + 1 : b.y)};
    for (int i = 0; i < 2; ++i) {
        for (int j = 0; j < 2; ++j) {
            if (aa[i] == bb[j]) {
                return 0;
            }
        }
    }
    return 1;
}
signed main() { // TIME YOURSELF !!!
    setIO("");
    cin >> N >> K;
    X = 7 * (K - 1) + 1; // the maximum number of dominoes needed for us to find the solution
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            cin >> grid[i][j]; // getting in the input
            SUM += grid[i][j];
        }
    }
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            if (i < N - 1) {
                s.PB({i, j, 1, grid[i][j] + grid[i+1][j]});
            }
            if (j < N - 1) {
                s.PB({i, j, 0, grid[i][j] + grid[i][j+1]});
            }
        }
    }
    X = min(X, sz(s));
    sort(all(s), gr<d>());
//    for (int i = 0; i < X; i++) {
//        debug(s[i].sum, s[i].x, s[i].y, s[i].dir);
//    }
    // initializing the graph
    int B = X/3; // this is at maximum 20
    for (int i = 0; i < B; ++i) {
        for (int j = 0; j < B; ++j) {
            if (j == i) continue;
            if (check(s[i], s[j])) {
                adj[i] |= (1LL << j);
            }
        }
    }
    for (int i = B; i < X; ++i) {
        for (int j = 0; j < B; ++j) {
            if (check(s[i], s[j])) {
                cross[i] |= (1LL << j);
            }
        }
        for (int j = B; j < X; ++j) {
            if (check(s[i], s[j])) {
                adj[i] |= (1LL << j); // this would result in long longs?
            }
        }
    }
    // solve the problem
    for (int i = 0; i <= K; ++i) { // looping through the possible size of the subset chosen for the right half
        // finding the DP values
        int D = K - i;
//        debug(i, D);
        for (int j = 0; j < (1 << B); ++j) { // 2^B * B
            int cb = bitm::countbits(j);
            dp[j] = 0;
            if (cb < D) {
                continue;
            } else if (cb == D) {
                ll mask = -1;
                int sum = 0;
                for (int k = 0; k < B; ++k) {
                    if (j & (1 << k)) {
                        ll m = (1 << k) | adj[k];
                        if (mask == -1) mask = m;
                        else mask &= m;
                        sum += s[k].sum;
                    }
                }
                if (j == -1 || ((mask & j) == j)) { // checks if any pairs are compatible in O(1)
                    dp[j] = sum;
                }
            } else {
                for (int k = 0; k < B; ++k) { // updates further DP values
                    if (j & (1 << k)) {
                        ckmax(dp[j], dp[j ^ (1 << k)]);
                    }
                }
            }
        }
        // looping through all possible combinations
        string bitmask(i, 1);
        bitmask.rsz(X - B, 0);
        do {
            ll sum = 0, mask = ((1 << B) - 1), conv = 0;
            ll check = -1;
//            debug(bitmask);
            for (int j = 0; j < X - B; ++j) {
                if (bitmask[j] == 1) {
                    conv |= (1LL << (j + B));
                    if (check == -1) check = (1LL << (j + B)) | adj[j + B];
                    else check &= (1LL << (j + B)) | adj[j + B];
                    sum += s[j + B].sum;
                    mask &= cross[j + B];
                }
            }
            if (check != -1 && ((check & conv) != conv)) continue;
            int cb = bitm::countbits((ull)mask);
            //debug(mask);
            if (cb < D) continue;
            ll tot = dp[mask] + sum;
//            debug(tot, dp[mask]);
//            debug(conv);
            ckmax(ANS, tot);
        } while (prev_permutation(all(bitmask)));
    }
    cout << SUM - ANS;
    return 0;
}
// TLE -> TRY NOT USING DEFINE INT LONG LONG
// CE -> CHECK LINE 45
// 5000 * 5000 size matrices are kinda big (potential mle)
// Do something, start simpler
Compilation message (stderr)
domino.cpp: In function 'void setIO(const string&)':
domino.cpp:89:12: warning: ignoring return value of 'FILE* freopen(const char*, const char*, FILE*)' declared with attribute 'warn_unused_result' [-Wunused-result]
   89 |     freopen((str + ".in").c_str(), "r", stdin);
      |     ~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
domino.cpp:90:12: warning: ignoring return value of 'FILE* freopen(const char*, const char*, FILE*)' declared with attribute 'warn_unused_result' [-Wunused-result]
   90 |     freopen((str + ".out").c_str(), "w", stdout);
      |     ~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~| # | Verdict | Execution time | Memory | Grader output | 
|---|
| Fetching results... | 
| # | Verdict | Execution time | Memory | Grader output | 
|---|
| Fetching results... | 
| # | Verdict | Execution time | Memory | Grader output | 
|---|
| Fetching results... | 
| # | Verdict | Execution time | Memory | Grader output | 
|---|
| Fetching results... | 
| # | Verdict | Execution time | Memory | Grader output | 
|---|
| Fetching results... | 
| # | Verdict | Execution time | Memory | Grader output | 
|---|
| Fetching results... | 
| # | Verdict | Execution time | Memory | Grader output | 
|---|
| Fetching results... | 
| # | Verdict | Execution time | Memory | Grader output | 
|---|
| Fetching results... | 
| # | Verdict | Execution time | Memory | Grader output | 
|---|
| Fetching results... | 
| # | Verdict | Execution time | Memory | Grader output | 
|---|
| Fetching results... | 
| # | Verdict | Execution time | Memory | Grader output | 
|---|
| Fetching results... | 
| # | Verdict | Execution time | Memory | Grader output | 
|---|
| Fetching results... | 
| # | Verdict | Execution time | Memory | Grader output | 
|---|
| Fetching results... | 
| # | Verdict | Execution time | Memory | Grader output | 
|---|
| Fetching results... | 
| # | Verdict | Execution time | Memory | Grader output | 
|---|
| Fetching results... | 
| # | Verdict | Execution time | Memory | Grader output | 
|---|
| Fetching results... |