@@ -326,7 +326,6 @@
         that before). Because of this, it keeps rebalancing until the device
         skew (number of partitions a device wants compared to what it has) gets
         below 1% or doesn't change by more than 1% (only happens with a ring that
-        can't be balanced no matter what -- like with 3 zones of differing
-        weights with replicas set to 3).
+        can't be balanced no matter what).

        :returns: (number_of_partitions_altered, resulting_balance)
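
The resulting_balance in that return value is the worst-case device skew described above. As a rough standalone sketch of how such a metric falls out of the weights (a hypothetical balance_of helper, not the builder's actual method):

    def balance_of(devs, parts, replicas):
        """Worst deviation (in percent) of any device from its fair share."""
        total_weight = sum(d['weight'] for d in devs)
        weight_of_one_part = parts * replicas / total_weight
        worst = 0.0
        for d in devs:
            desired = d['weight'] * weight_of_one_part
            worst = max(worst, abs(100.0 * d['parts'] / desired - 100.0))
        return worst

    devs = [{'weight': 100.0, 'parts': 1024},
            {'weight': 100.0, 'parts': 1034},
            {'weight': 100.0, 'parts': 1014}]
    print(balance_of(devs, parts=1024, replicas=3))  # ~0.98, under the 1% cutoff
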
@@ -346,4 +345,6 @@
         new_parts, removed_part_count = self._adjust_replica2part2dev_size()
         retval += removed_part_count
+        if new_parts or removed_part_count:
+            self._set_parts_wanted()
         self._reassign_parts(new_parts)
         retval += len(new_parts)
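
Calling self._set_parts_wanted() only when the partition rows actually changed makes sense because each device's fair share depends on the total number of assignments. A toy illustration of that dependency (standalone arithmetic with made-up numbers, mirroring the weight_of_one_part idea):

    import math

    parts, total_weight, dev_weight = 1024, 256.0, 100.0
    for replicas in (3, 4):
        weight_of_one_part = parts * replicas / total_weight
        # parts_wanted for this device if it currently holds 1200 partitions:
        # adding a replica row changes its wants even though nothing else did.
        print(int(math.ceil(weight_of_one_part * dev_weight)) - 1200)
    # prints 0, then 400
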
@@ -515,7 +516,17 @@
                 # indicate its strong desire to give up everything it has.
                 dev['parts_wanted'] = -self.parts * self.replicas
             else:
-                dev['parts_wanted'] = \
-                    int(weight_of_one_part * dev['weight']) - dev['parts']
+                dev['parts_wanted'] = (
+                    # Round up here so that every partition ultimately ends up
+                    # assigned to a device.
+                    #
+                    # Imagine 5 partitions to be placed on 4 devices. If we
+                    # didn't use math.ceil() here, each device would have a
+                    # parts_wanted of 1, so 4 partitions would be placed but
+                    # the last would not, probably resulting in a crash. This
+                    # way, some devices end up with leftover parts_wanted, but
+                    # at least every partition ends up somewhere.
+                    int(math.ceil(weight_of_one_part * dev['weight'])) -
+                    dev['parts'])

    def _adjust_replica2part2dev_size(self):
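
The failure mode the new comment describes is easy to reproduce in miniature: 5 partitions, 4 equal devices, 1 replica. A quick standalone check (not builder code):

    import math

    parts, replicas = 5, 1
    weights = [1.0, 1.0, 1.0, 1.0]
    weight_of_one_part = parts * replicas / sum(weights)  # 1.25

    truncated = [int(weight_of_one_part * w) for w in weights]
    rounded_up = [int(math.ceil(weight_of_one_part * w)) for w in weights]
    print(sum(truncated))   # 4 -- the fifth partition has nowhere to go
    print(sum(rounded_up))  # 8 -- leftover wants, but every partition fits
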
@@ -584,6 +595,6 @@
             self._replica2part2dev.append(
                 array('H', (0 for _junk in xrange(desired_length))))

-        return (list(to_assign.iteritems()), removed_replicas)
+        return (to_assign.items(), removed_replicas)

    def _initial_balance(self):
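
A side note on the array('H', ...) rows kept by _adjust_replica2part2dev_size: each replica's partition-to-device mapping is a flat array of unsigned 16-bit integers, so the table costs roughly two bytes per partition per replica, and device ids must fit in 65535. For instance:

    from array import array

    desired_length = 2 ** 16          # one row: partition -> device id
    row = array('H', (0 for _junk in range(desired_length)))
    print(row.itemsize)               # 2 bytes per entry
    print(row.itemsize * len(row))    # 131072 bytes for 65536 partitions
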
@@ -752,9 +763,24 @@
         replicas_to_replace may be shared for multiple
         partitions, so be sure you do not modify it.
         """
+        parts_available_in_tier = defaultdict(int)
         for dev in self._iter_devs():
             dev['sort_key'] = self._sort_key_for(dev)
-            dev['tiers'] = tiers_for_dev(dev)
+            tiers = tiers_for_dev(dev)
+            dev['tiers'] = tiers
+            for tier in tiers:
+                # Note: this represents how many partitions may be assigned to
+                # a given tier (region/zone/server/disk). It does not take
+                # into account how many partitions a given tier wants to shed.
+                #
+                # If we did not do this, we could have a zone where, at some
+                # point during assignment, number-of-parts-to-gain equals
+                # number-of-parts-to-shed. At that point, no further placement
+                # into that zone would occur since its parts_available_in_tier
+                # would be 0. This would happen any time a zone had any device
+                # with partitions to shed, which is any time a device is being
+                # removed, which is a pretty frequent operation.
+                parts_available_in_tier[tier] += max(dev['parts_wanted'], 0)

        available_devs = \
            sorted((d for d in self._iter_devs() if d['weight']),
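
The accumulation above credits a device's spare capacity to every tier prefix it belongs to, while clamping shedding devices to zero. A toy version with tier tuples written out the way tiers_for_dev produces them (all values made up):

    from collections import defaultdict

    def tiers_of(dev):
        # (region, zone, ip, device_id) prefixes, shortest to longest.
        t = (dev['region'], dev['zone'], dev['ip'], dev['id'])
        return [t[:i] for i in range(1, 5)]

    devs = [
        {'region': 1, 'zone': 1, 'ip': '10.0.0.1', 'id': 0, 'parts_wanted': 7},
        {'region': 1, 'zone': 1, 'ip': '10.0.0.1', 'id': 1, 'parts_wanted': -3},
        {'region': 1, 'zone': 2, 'ip': '10.0.0.2', 'id': 2, 'parts_wanted': 5},
    ]

    parts_available_in_tier = defaultdict(int)
    for dev in devs:
        for tier in tiers_of(dev):
            # max(..., 0): a shedding device contributes no capacity, but it
            # does not cancel out its neighbors' capacity either.
            parts_available_in_tier[tier] += max(dev['parts_wanted'], 0)

    print(parts_available_in_tier[(1,)])    # 12: the whole region
    print(parts_available_in_tier[(1, 1)])  # 7: zone 1 (the -3 is clamped)
    print(parts_available_in_tier[(1, 2)])  # 5: zone 2
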
@@ -793,10 +819,10 @@
             # Gather up what other tiers (regions, zones, ip/ports, and
             # devices) the replicas not-to-be-moved are in for this part.
             other_replicas = defaultdict(int)
-            unique_tiers_by_tier_len = defaultdict(set)
+            occupied_tiers_by_tier_len = defaultdict(set)
             for replica in self._replicas_for_part(part):
                 if replica not in replace_replicas:
                     dev = self.devs[self._replica2part2dev[replica][part]]
                     for tier in dev['tiers']:
                         other_replicas[tier] += 1
-                        unique_tiers_by_tier_len[len(tier)].add(tier)
+                        occupied_tiers_by_tier_len[len(tier)].add(tier)
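
The rename from unique_tiers_by_tier_len to occupied_tiers_by_tier_len is more than cosmetic: keyed by tier-tuple depth, these sets answer "which tiers at this level already hold a replica of this partition". A toy run with hand-written tier tuples:

    from collections import defaultdict

    # Suppose the two replicas staying put live on devices with these tiers:
    staying = [
        [(1,), (1, 1), (1, 1, '10.0.0.1'), (1, 1, '10.0.0.1', 0)],
        [(1,), (1, 2), (1, 2, '10.0.0.2'), (1, 2, '10.0.0.2', 2)],
    ]

    other_replicas = defaultdict(int)
    occupied_tiers_by_tier_len = defaultdict(set)
    for tiers in staying:
        for tier in tiers:
            other_replicas[tier] += 1
            occupied_tiers_by_tier_len[len(tier)].add(tier)

    print(other_replicas[(1,)])           # 2: both replicas share region 1
    print(occupied_tiers_by_tier_len[2])  # {(1, 1), (1, 2)}: zones holding one
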
@@ -804,23 +830,26 @@
             for replica in replace_replicas:
+                # Find a new home for this replica
                 tier = ()
                 depth = 1
                 while depth <= max_tier_depth:
                     # Order the tiers by how many replicas of this
                     # partition they already have. Then, of the ones
-                    # with the smallest number of replicas, pick the
-                    # tier with the hungriest drive and then continue
-                    # searching in that subtree.
+                    # with the smallest number of replicas and that have
+                    # room to accept more partitions, pick the tier with
+                    # the hungriest drive and then continue searching in
+                    # that subtree.
                     #
                     # There are other strategies we could use here,
                     # such as hungriest-tier (i.e. biggest
                     # difference between parts_wanted and parts).
                     #
                     # However, hungriest-drive is what was used here
                     # before, and it worked pretty well in practice.
                     #
-                    # Note that this allocator will balance things as
-                    # evenly as possible at each level of the device
-                    # layout. If your layout is extremely unbalanced,
-                    # this may produce poor results.
+                    # Note that this allocator prioritizes even device
+                    # filling over dispersion, so if your layout is
+                    # extremely unbalanced, you may not get the replica
+                    # dispersion that you expect, and your durability
+                    # may suffer.
                     #
                     # This used to be a cute, recursive function, but it's been
                     # unrolled for performance.
@@ -832,1 +861,1 @@
                     # short-circuit the search while still ensuring we get the
@@ -834,15 +863,25 @@
                     candidates_with_replicas = \
-                        unique_tiers_by_tier_len[len(tier) + 1]
-                    # Find a tier with the minimal replica count and the
-                    # hungriest drive among all the tiers with the minimal
-                    # replica counts.
-                    if len(tier2children[tier]) > \
+                        occupied_tiers_by_tier_len[len(tier) + 1]
+
+                    # Among the tiers with room for more partitions,
+                    # find one with the smallest possible number of
+                    # replicas already in it, breaking ties by which one
+                    # has the hungriest drive.
+                    candidates_with_room = [
+                        t for t in tier2children[tier]
+                        if parts_available_in_tier[t] > 0]
+
+                    if len(candidates_with_room) > \
                             len(candidates_with_replicas):
-                        # There exists at least one tier with 0 other replicas
-                        tier = max((t for t in tier2children[tier]
+                        # There exists at least one tier with room for
+                        # another partition and 0 other replicas already
+                        # in it, so we can use a faster search. The else
+                        # branch's search would work here, but it's
+                        # significantly slower.
+                        tier = max((t for t in candidates_with_room
                                     if other_replicas[t] == 0),
                                    key=tier2sort_key.__getitem__)
                     else:
-                        tier = max(tier2children[tier],
+                        tier = max(candidates_with_room,
                                    key=lambda t: (-other_replicas[t],
                                                   tier2sort_key[t]))
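
Putting the two branches together: at each level the chooser prefers a child tier that has room and holds zero replicas of the partition, and only falls back to the full (fewest-replicas, hungriest-drive) ordering when no such child exists. A condensed sketch of one descent step (the dict arguments stand in for the builder's precomputed lookup tables):

    def choose_child(tier, tier2children, tier2sort_key, other_replicas,
                     parts_available_in_tier, occupied_tiers_by_tier_len):
        candidates_with_replicas = occupied_tiers_by_tier_len[len(tier) + 1]
        candidates_with_room = [t for t in tier2children[tier]
                                if parts_available_in_tier[t] > 0]
        if len(candidates_with_room) > len(candidates_with_replicas):
            # Fast path: some roomy child holds no replica of this partition,
            # so just take the replica-free child with the hungriest drive.
            return max((t for t in candidates_with_room
                        if other_replicas[t] == 0),
                       key=tier2sort_key.__getitem__)
        # Slow path: minimize replica count first, then break ties by
        # hungriest drive.
        return max(candidates_with_room,
                   key=lambda t: (-other_replicas[t], tier2sort_key[t]))

The length comparison is a pigeonhole argument: if there are more children with room than occupied tiers at that depth, at least one roomy child must be replica-free, so the faster zero-replica search cannot come up empty.
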
@@ -853,7 +892,8 @@
                 old_sort_key = dev['sort_key']
                 new_sort_key = dev['sort_key'] = self._sort_key_for(dev)
                 for tier in dev['tiers']:
+                    parts_available_in_tier[tier] -= 1
                     other_replicas[tier] += 1
-                    unique_tiers_by_tier_len[len(tier)].add(tier)
+                    occupied_tiers_by_tier_len[len(tier)].add(tier)

                    index = bisect.bisect_left(tier2dev_sort_key[tier],
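
The bisect call that begins here is part of keeping each tier's device list sorted by hunger as sort keys change: the device is removed at its old key's position and re-inserted at the new one. In miniature (plain lists of keys, not builder state):

    import bisect

    tier_sort_keys = [10, 20, 30, 40]    # sorted sort-keys of one tier's devices
    old_sort_key, new_sort_key = 30, 15  # device just got a part; less hungry

    # Remove the entry at its old position...
    index = bisect.bisect_left(tier_sort_keys, old_sort_key)
    tier_sort_keys.pop(index)
    # ...and re-insert it where the new key belongs, keeping the list sorted.
    bisect.insort(tier_sort_keys, new_sort_key)
    print(tier_sort_keys)  # [10, 15, 20, 40]
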