#include <bits/allocator.h>
#pragma GCC optimize("Ofast,unroll-loops")
#pragma GCC target("avx2")
#include "peru.h"
#include <bits/stdc++.h>
using namespace std;
const int N=2.5e6+10;
long long f[N], a[N];
const int mod=1e9+7;
int32_t solve(int32_t n, int32_t k, int32_t *aa){
multiset<long long> pq;
deque<int> st;
deque<int> val;
for (int i=1; i<=n; ++i) a[i]=aa[i-1];
long long ans=0;
int qid=0;
for (int i=1; i<=n; ++i){
int id=st.size();
while (id && a[st[id-1]]<=a[i]) --id;
int cnt=0;
long long mn=1e18;
for (int j=id; j<(int)st.size(); ++j){
++cnt;
pq.erase(pq.find(val[j]));
mn=min(mn, val[j]+a[i]-a[st[j]]);
}
while (cnt--) st.pop_back(), val.pop_back();
mn=min(mn, a[i]+f[i-1]);
st.push_back(i);
val.push_back(mn);
pq.insert(mn);
while (st.size() && st.front()<i-k+1){
pq.erase(pq.find(val[0]));
st.pop_front(), val.pop_front();
}
pq.erase(pq.find(val[0]));
val[0]=f[max((int)0, i-k)]+a[st[0]];
pq.insert(val[0]);
f[i]=*pq.begin();
ans=(ans*23+f[i])%mod;
}
return ans;
}