# Copyright (c) 2009 Shawn Willden
# mailto:shawn@willden.org
# I hereby license all patches I have contributed or will contribute to
# the Allmydata Tahoe-LAFS project, including the file 'statistics.py',
# under both the GNU General Public License, version 2 or later, and
# under the Transitive Grace Period Public License, version 1 or later.
from __future__ import division

import math
import sys
from functools import reduce

from mathutil import round_sigfigs
def pr_file_loss(p_list, k):
    """
    Probability of single-file loss for shares with reliabilities in
    p_list.

    Computes the probability that a single file will become
    unrecoverable, based on the individual share survival
    probabilities and k (number of shares needed for recovery).

    Example: pr_file_loss([.9] * 5 + [.99] * 5, 3) returns the
    probability that a file with k=3, N=10 and stored on five servers
    with reliability .9 and five servers with reliability .99 is lost.

    See survival_pmf docstring for important statistical assumptions.
    """
    assert 0 < k <= len(p_list)
    assert valid_probability_list(p_list)

    # Sum elements 0 through k-1 of the share set PMF to get the
    # probability that fewer than k shares survived.
    return sum(survival_pmf(p_list)[0:k])
def survival_pmf(p_list):
    """
    Return the collective PMF of share survival count for a set of
    shares with the individual survival probabilities in p_list.

    Example: survival_pmf([.99] * 10 + [.8] * 6) returns the
    probability mass function for the number of shares that will
    survive from an initial set of 16, 10 with p=0.99 and 6 with
    p=0.8.  The ith element of the resulting list is the probability
    that exactly i shares will survive.

    This calculation makes the following assumptions:

    1. p_list[i] is the probability that any individual share will
    survive during the time period in question (whatever that may
    be).

    2. The share failures are "independent", in the statistical
    sense.  Note that if a group of shares are stored on the same
    machine or even in the same data center, they are NOT independent
    and this calculation is therefore wrong.
    """
    assert valid_probability_list(p_list)

    pmf = survival_pmf_via_conv(p_list)

    assert valid_pmf(pmf)
    return pmf
def survival_pmf_via_bd(p_list):
    """
    Compute share survival PMF using the binomial distribution PMF as
    much as possible.

    This is more efficient than the convolution method below, but
    doesn't work for large numbers of shares because the
    binomial_coeff calculation blows up.  Since the efficiency gains
    only matter in the case of large numbers of shares, it's pretty
    much useless except for testing the convolution method.

    Note that this function does little to no error checking and is
    intended for internal use and testing only.
    """
    # Group shares by survival probability: each group's survival count
    # is binomial, and independent groups combine by convolution.
    pmf_list = [ binomial_distribution_pmf(p_list.count(p), p)
                 for p in set(p_list) ]
    return reduce(convolve, pmf_list)
def survival_pmf_via_conv(p_list):
    """
    Compute share survival PMF using iterated convolution of trivial
    PMFs.

    Note that this function does little to no error checking and is
    intended for internal use and testing only.
    """
    # Each share is a Bernoulli trial: [Pr(loss), Pr(survival)].
    pmf_list = [ [1 - p, p] for p in p_list ]
    return reduce(convolve, pmf_list)
def print_pmf(pmf, n=4, out=sys.stdout):
    """
    Print a PMF in a readable form, with values rounded to n
    significant digits, one "i=<k>: <p>" line per outcome.
    """
    for k, p in enumerate(pmf):
        # out.write works under both Python 2 and 3, unlike the
        # Python-2-only "print >>out" statement this replaces; the
        # output text is identical.
        out.write("i=%s: %s\n" % (k, round_sigfigs(p, n)))
def pr_backup_file_loss(p_list, backup_p, k):
    """
    Probability of single-file loss in a backup context.

    Same as pr_file_loss, except it factors in the probability of
    survival of the original source, specified as backup_p.  Because
    that's a precondition to caring about the availability of the
    backup, it's an independent event.
    """
    assert valid_probability_list(p_list)
    assert 0 < backup_p <= 1
    assert 0 < k <= len(p_list)

    # The backup matters only when the original is already gone.
    return pr_file_loss(p_list, k) * (1 - backup_p)
def find_k(p_list, target_loss_prob):
    """
    Find the highest k value that achieves the targeted loss
    probability, given the share reliabilities given in p_list.
    """
    assert valid_probability_list(p_list)
    assert 0 < target_loss_prob < 1

    pmf = survival_pmf(p_list)
    return find_k_from_pmf(pmf, target_loss_prob)
def find_k_from_pmf(pmf, target_loss_prob):
    """
    Find the highest k value that achieves the targeted loss
    probability, given the share survival PMF given in pmf.
    """
    assert valid_pmf(pmf)
    assert 0 < target_loss_prob < 1

    # Accumulate Pr[fewer than k shares survive] as k grows; the first
    # k whose cumulative loss probability exceeds the target is the
    # largest acceptable k.
    loss_prob = 0.0
    for k, p_k in enumerate(pmf):
        loss_prob += p_k
        if loss_prob > target_loss_prob:
            return k

    # we shouldn't be able to get here, since sum(pmf)==1.0
    return len(pmf) - 1
def repair_count_pmf(survival_pmf, k):
    """
    Return Pr[D=d], where D represents the number of shares that have
    to be repaired at the end of an interval, starting with a full
    set and subject to losses described in survival_pmf.
    """
    n = len(survival_pmf) - 1

    # Probability of 0 to repair is the probability of all shares
    # surviving plus the probability of less than k surviving (an
    # irreparable file gets no repairs).
    pmf = [ survival_pmf[n] + sum(survival_pmf[0:k]) ]

    # Probability of more than 0, up to N-k to repair: repairing d
    # shares means exactly n-d survived.
    for i in range(1, n-k+1):
        pmf.append(survival_pmf[n-i])

    # Probability of more than N-k to repair is 0, because that means
    # there are less than k available and the file is irreparable.
    for i in range(n-k+1, n+1):
        pmf.append(0.0)

    assert(valid_pmf(pmf))
    return pmf
def bandwidth_cost_function(file_size, shares, k, ul_dl_ratio):
    """
    Bandwidth cost of one repair run: download the whole file once,
    then upload 'shares' replacement shares of file_size/k bytes each,
    with upload bytes weighted by ul_dl_ratio relative to download.
    """
    upload_cost = float(file_size) / k * shares * ul_dl_ratio
    return file_size + upload_cost
def mean_repair_cost(cost_function, file_size, survival_pmf, k, ul_dl_ratio):
    """
    Return the expected cost for a repair run on a file with the given
    survival_pmf and requiring k shares, in which upload cost is
    'ul_dl_ratio' times download cost.
    """
    repair_pmf = repair_count_pmf(survival_pmf, k)
    # Weight each possible repair size by its probability; zero shares
    # to repair costs nothing, so the sum starts at 1.
    expected_cost = sum([cost_function(file_size, new_shares, k, ul_dl_ratio)
                         * repair_pmf[new_shares]
                         for new_shares in range(1, len(repair_pmf))])
    return expected_cost
def eternal_repair_cost(cost_function, file_size, survival_pmf, k,
                        discount_rate=0, ul_dl_ratio=1.0):
    """
    Calculate the eternal repair cost for a file that is aggressively
    repaired, i.e. the sum of repair costs until the file is dead.
    """
    c = mean_repair_cost(cost_function, file_size, survival_pmf, k, ul_dl_ratio)
    # f = probability the file survives one interval (at least k shares
    # remain, so it is still repairable).
    f = 1 - sum(survival_pmf[0:k])
    r = float(discount_rate)

    # Closed form of the geometric series of discounted per-interval
    # repair costs over the file's expected lifetime.
    return (c * (1-r)) / (1 - (1-r) * f)
def valid_pmf(pmf):
    """
    Validate that pmf looks like a proper discrete probability mass
    function in list form.

    Returns true if the elements of pmf sum to 1.
    """
    # Round to absorb floating-point error accumulated by convolution.
    return round(sum(pmf), 5) == 1.0
def valid_probability_list(p_list):
    """
    Validate that p_list is a list of probabilities: every element
    must lie in the closed interval [0, 1].
    """
    for p in p_list:
        if p < 0 or p > 1:
            return False
    return True
def convolve(list_a, list_b):
    """
    Returns the discrete convolution of two lists.

    Given two random variables X and Y, the convolution of their
    probability mass functions Pr(X) and Pr(Y) is equal to the
    Pr(X+Y).
    """
    n = len(list_a)
    m = len(list_b)

    result = []
    for i in range(n + m - 1):
        # Sum products over every index pair (i-j, j) that is inside
        # both lists.  (Renamed from 'sum' to avoid shadowing the
        # builtin.)
        acc = 0.0

        lower = max(0, i - n + 1)
        upper = min(m - 1, i)

        for j in range(lower, upper+1):
            acc += list_a[i-j] * list_b[j]

        result.append(acc)

    return result
def binomial_distribution_pmf(n, p):
    """
    Returns Pr(K), where K ~ B(n,p), as a list of values.

    Returns the full probability mass function of a B(n, p) as a list
    of values, where the kth element is Pr(K=k), or, in the Tahoe
    context, the probability that exactly k copies of a file share
    survive, when placed on n independent servers with survival
    probability p.
    """
    assert p >= 0 and p <= 1, 'p=%s must be in the range [0,1]'%p

    result = []
    for k in range(n + 1):
        # Pr(K=k) = C(n,k) * p^k * (1-p)^(n-k)
        result.append(math.pow(p , k ) *
                      math.pow(1 - p, n - k) *
                      binomial_coeff(n, k))

    assert valid_pmf(result)
    return result
def binomial_coeff(n, k):
    """
    Returns the number of ways that k items can be chosen from a set
    of n items, as an exact integer.
    """
    assert k <= n

    # Multiply before dividing so every intermediate value is an exact
    # integer: after step i, accum == C(n-k+i, i), and C(n-k+i, i-1)
    # * (n-k+i) is always divisible by i.
    accum = 1
    for i in range(1, k+1):
        accum = accum * (n - k + i) // i

    return accum