Commit 4f36d0c6 authored by Nathan VanBenschoten's avatar Nathan VanBenschoten Committed by GitHub

Merge pull request #46154 from nvanbenschoten/backport19.2-46085

release-19.2: kv/kvserver: use LAI from before split when setting initialMaxClosed
parents c31e1185 fc34bc61
......@@ -227,7 +227,7 @@ func TestClosedTimestampCanServeAfterSplitAndMerges(t *testing.T) {
lRepls := replsForRange(ctx, t, tc, lr)
rRepls := replsForRange(ctx, t, tc, rr)
// Now immediately query both the ranges and there's 1 value per range.
// We need to tollerate RangeNotFound as the split range may not have been
// We need to tolerate RangeNotFound as the split range may not have been
// created yet.
baReadL := makeReadBatchRequestForDesc(lr, ts)
require.Nil(t, verifyCanReadFromAllRepls(ctx, t, baReadL, lRepls,
......@@ -235,6 +235,7 @@ func TestClosedTimestampCanServeAfterSplitAndMerges(t *testing.T) {
baReadR := makeReadBatchRequestForDesc(rr, ts)
require.Nil(t, verifyCanReadFromAllRepls(ctx, t, baReadR, rRepls,
respFuncs(retryOnRangeNotFound, expectRows(1))))
// Now merge the ranges back together and ensure that there's two values in
// the merged range.
merged, err := tc.Server(0).MergeRanges(lr.StartKey.AsRawKey())
......
......@@ -102,6 +102,44 @@ func splitPreApply(
if err := rsl.SynthesizeRaftState(ctx, eng); err != nil {
log.Fatal(ctx, err)
}
// The initialMaxClosed is assigned to the RHS replica to ensure that
// follower reads do not regress following the split. After the split occurs
// there will be no information in the closedts subsystem about the newly
// minted RHS range from its leaseholder's store. Furthermore, the RHS will
// have a lease start time equal to that of the LHS which might be quite
// old. This means that timestamps which follow the least StartTime for the
// LHS part are below the current closed timestamp for the LHS would no
// longer be readable on the RHS after the split.
//
// It is necessary for correctness that the call to maxClosed used to
// determine the current closed timestamp happens during the splitPreApply
// so that it uses a LAI that is _before_ the index at which this split is
// applied. If it were to refer to a LAI equal to or after the split then
// the value of initialMaxClosed might be unsafe.
//
// Concretely, any closed timestamp based on an LAI that is equal to or
// above the split index might be larger than the initial closed timestamp
// assigned to the RHS range's initial leaseholder. This is because the LHS
// range's leaseholder could continue closing out timestamps at the split's
// LAI after applying the split. Slow followers in that range could hear
// about these closed timestamp notifications before applying the split
// themselves. If these slow followers were allowed to pass these closed
// timestamps created after the split to the RHS replicas they create during
// the application of the split then these RHS replicas might end up with
// initialMaxClosed values above their current range's official closed
// timestamp. The leaseholder of the RHS range could then propose a write at
// a timestamp below this initialMaxClosed, violating the closed timestamp
// systems most important property.
//
// Using an LAI from before the index at which this split is applied avoids
// the hazard and ensures that no replica on the RHS is created with an
// initialMaxClosed that could be violated by a proposal on the RHS's
// initial leaseholder. See #44878.
initialMaxClosed := r.maxClosed(ctx)
rightRepl.mu.Lock()
rightRepl.mu.initialMaxClosed = initialMaxClosed
rightRepl.mu.Unlock()
}
// splitPostApply is the part of the split trigger which coordinates the actual
......@@ -136,26 +174,13 @@ func splitPostApply(
log.Fatal(ctx, err)
}
// This initialMaxClosedValue is created here to ensure that follower reads
// do not regress following the split. After the split occurs there will be no
// information in the closedts subsystem about the newly minted RHS range from
// its leaseholder's store. Furthermore, the RHS will have a lease start time
// equal to that of the LHS which might be quite old. This means that
// timestamps which follow the least StartTime for the LHS part are below the
// current closed timestamp for the LHS would no longer be readable on the RHS
// after the split. It is critical that this call to maxClosed happen during
// the splitPostApply so that it refers to a LAI that is equal to the index at
// which this lease was applied. If it were to refer to a LAI after the split
// then the value of initialMaxClosed might be unsafe.
initialMaxClosed := r.maxClosed(ctx)
r.mu.Lock()
r.mu.RLock()
rightRng.mu.Lock()
// Copy the minLeaseProposedTS from the LHS.
rightRng.mu.minLeaseProposedTS = r.mu.minLeaseProposedTS
rightRng.mu.initialMaxClosed = initialMaxClosed
rightLease := *rightRng.mu.state.Lease
rightRng.mu.Unlock()
r.mu.Unlock()
r.mu.RUnlock()
// We need to explicitly wake up the Raft group on the right-hand range or
// else the range could be underreplicated for an indefinite period of time.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment