Commit a9dfbe2c authored by Darin's avatar Darin Committed by Andrew Kimball

allocator: fix incorrect rebalance signal and target

This change fixes two bugs:
- The first bug is related to the comparison of floating point
numbers when considering to rebalance a range. The diversity score
of two identical ranges (even the same range) may differ by a small
amount. The fix is to consider candidates equal if their diversity score
have very close values (within 1e-10).
- The second bug has been introduced with
https://github.com/cockroachdb/cockroach/pull/39157 and resulted in
ignoring the newly added replica when determining what existing replica
it should replace in Allocator.simulateRemoveTarget.
As a result - when the first bug was triggering and unnecessary
rebalancing was happening - the replica to be dropped was chosen
at random (in the case where all existing replica are necessary).
For example - let's say a range is constrained to have a replica in each
of the regions A,B,C. We are trying to add a new replica in C.
Evaluating what should be removed from the A,B,C may choose any one of them.
Regardless of the choise, the constraint will be violated.
If we however ask the same question for A,B,C(old),C(new) - the result
will be C(old) as this will preserve the constraint (to have one
replica in each region).

Fixes #42715
Fixes https://github.com/cockroachlabs/support/issues/348

Release note (bug fix): the allocator now considers stores with very close
diversity scores equal (all other things being the same) and doesn't attempt
to rebalance.
Release note (bug fix): the allocator considers the new store being added
when looking for target in case of rebalance.
parent 36db1f34
......@@ -699,9 +699,9 @@ func (a Allocator) RebalanceTarget(
ReplicaID: maxReplicaID(existingReplicas) + 1,
}
// Deep-copy the Replicas slice since we'll mutate it below.
replicaCandidates := append([]roachpb.ReplicaDescriptor(nil), existingReplicas...)
replicaCandidates = append(replicaCandidates, newReplica)
existingPlusOneNew := append([]roachpb.ReplicaDescriptor(nil), existingReplicas...)
existingPlusOneNew = append(existingPlusOneNew, newReplica)
replicaCandidates := existingPlusOneNew
// If we can, filter replicas as we would if we were actually removing one.
// If we can't (e.g. because we're the leaseholder but not the raft leader),
// it's better to simulate the removal with the info that we do have than to
......@@ -724,7 +724,7 @@ func (a Allocator) RebalanceTarget(
target.store.StoreID,
zone,
replicaCandidates,
replicaCandidates,
existingPlusOneNew,
rangeUsageInfo,
)
if err != nil {
......
......@@ -26,6 +26,17 @@ import (
)
const (
// This is a somehow arbitrary chosen upper bound on the relative error to be
// used when comparing doubles for equality. The assumption that comes with it
// is that any sequence of operations on doubles won't produce a result with
// accuracy that is worse than this relative error. There is no guarantee
// however that this will be the case. A programmer writing code using
// floating point numbers will still need to be aware of the effect of the
// operations on the results and the possible loss of accuracy.
// More info https://en.wikipedia.org/wiki/Machine_epsilon
// https://en.wikipedia.org/wiki/Floating-point_arithmetic
epsilon = 1e-10
// The number of random candidates to select from a larger list of possible
// candidates. Because the allocator heuristics are being run on every node it
// is actually not desirable to set this value higher. Doing so can lead to
......@@ -158,7 +169,7 @@ func (c candidate) compare(o candidate) float64 {
}
return -4
}
if c.diversityScore != o.diversityScore {
if !scoresAlmostEqual(c.diversityScore, o.diversityScore) {
if c.diversityScore > o.diversityScore {
return 3
}
......@@ -170,7 +181,7 @@ func (c candidate) compare(o candidate) float64 {
}
return -(2 + float64(o.convergesScore-c.convergesScore)/10.0)
}
if c.balanceScore.totalScore() != o.balanceScore.totalScore() {
if !scoresAlmostEqual(c.balanceScore.totalScore(), o.balanceScore.totalScore()) {
if c.balanceScore.totalScore() > o.balanceScore.totalScore() {
return 1 + (c.balanceScore.totalScore()-o.balanceScore.totalScore())/10.0
}
......@@ -234,9 +245,9 @@ var _ sort.Interface = byScoreAndID(nil)
func (c byScoreAndID) Len() int { return len(c) }
func (c byScoreAndID) Less(i, j int) bool {
if c[i].diversityScore == c[j].diversityScore &&
if scoresAlmostEqual(c[i].diversityScore, c[j].diversityScore) &&
c[i].convergesScore == c[j].convergesScore &&
c[i].balanceScore.totalScore() == c[j].balanceScore.totalScore() &&
scoresAlmostEqual(c[i].balanceScore.totalScore(), c[j].balanceScore.totalScore()) &&
c[i].rangeCount == c[j].rangeCount &&
c[i].necessary == c[j].necessary &&
c[i].fullDisk == c[j].fullDisk &&
......@@ -267,7 +278,7 @@ func (cl candidateList) best() candidateList {
}
for i := 1; i < len(cl); i++ {
if cl[i].necessary == cl[0].necessary &&
cl[i].diversityScore == cl[0].diversityScore &&
scoresAlmostEqual(cl[i].diversityScore, cl[0].diversityScore) &&
cl[i].convergesScore == cl[0].convergesScore {
continue
}
......@@ -301,7 +312,7 @@ func (cl candidateList) worst() candidateList {
// Find the worst constraint/locality/converges values.
for i := len(cl) - 2; i >= 0; i-- {
if cl[i].necessary == cl[len(cl)-1].necessary &&
cl[i].diversityScore == cl[len(cl)-1].diversityScore &&
scoresAlmostEqual(cl[i].diversityScore, cl[len(cl)-1].diversityScore) &&
cl[i].convergesScore == cl[len(cl)-1].convergesScore {
continue
}
......@@ -801,11 +812,13 @@ func betterRebalanceTarget(target1, existing1, target2, existing2 *candidate) *c
// they'll replace.
comp1 := target1.compare(*existing1)
comp2 := target2.compare(*existing2)
if comp1 > comp2 {
return target1
}
if comp1 < comp2 {
return target2
if !scoresAlmostEqual(comp1, comp2) {
if comp1 > comp2 {
return target1
}
if comp1 < comp2 {
return target2
}
}
// If the two targets are equally better than their corresponding existing
// replicas, just return whichever target is better.
......@@ -1225,3 +1238,7 @@ func maxCapacityCheck(store roachpb.StoreDescriptor) bool {
func rebalanceToMaxCapacityCheck(store roachpb.StoreDescriptor) bool {
return store.Capacity.FractionUsed() < rebalanceToMaxFractionUsedThreshold
}
func scoresAlmostEqual(score1, score2 float64) bool {
return math.Abs(score1-score2) < epsilon
}
......@@ -14,6 +14,7 @@ import (
"bytes"
"context"
"fmt"
"math"
"math/rand"
"reflect"
"sort"
......@@ -115,7 +116,11 @@ func TestCandidateSelection(t *testing.T) {
store: roachpb.StoreDescriptor{
StoreID: roachpb.StoreID(i + idShift),
},
diversityScore: float64(score.diversity),
// We want to test here that everything works when the deversity score
// is not the exact value but very close to it. Nextafter will give us
// the closest number to the diversity score in the test case,
// that isn't equal to it and is either above or below (at random).
diversityScore: math.Nextafter(float64(score.diversity), rand.Float64()),
rangeCount: score.rangeCount,
valid: true,
})
......@@ -215,7 +220,7 @@ func TestCandidateSelection(t *testing.T) {
if good == nil {
t.Fatalf("no good candidate found")
}
actual := scoreTuple{int(good.diversityScore), good.rangeCount}
actual := scoreTuple{int(good.diversityScore + 0.5), good.rangeCount}
if actual != tc.good {
t.Errorf("expected:%v actual:%v", tc.good, actual)
}
......@@ -225,7 +230,7 @@ func TestCandidateSelection(t *testing.T) {
if bad == nil {
t.Fatalf("no bad candidate found")
}
actual := scoreTuple{int(bad.diversityScore), bad.rangeCount}
actual := scoreTuple{int(bad.diversityScore + 0.5), bad.rangeCount}
if actual != tc.bad {
t.Errorf("expected:%v actual:%v", tc.bad, actual)
}
......@@ -244,7 +249,7 @@ func TestBetterThan(t *testing.T) {
},
{
valid: true,
diversityScore: 1,
diversityScore: 0.9999999999999999,
rangeCount: 0,
},
{
......
......@@ -881,8 +881,8 @@ func TestAllocatorRebalanceTarget(t *testing.T) {
storeFilterThrottled,
)
expTo := stores[1].StoreID
expFrom := map[roachpb.StoreID]bool{stores[3].StoreID: true, stores[4].StoreID: true}
if !ok || target.StoreID != expTo || !expFrom[origin.StoreID] {
expFrom := stores[0].StoreID
if !ok || target.StoreID != expTo || origin.StoreID != expFrom {
t.Fatalf("%d: expected rebalance from either of %v to s%d, but got %v->%v; details: %s",
i, expFrom, expTo, origin, target, details)
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment