A library for working with phylogenetic and population genetic data.
v0.32.0
distribution.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2024 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
24 /*
25  Below, we include several functions with implementations derived from other GPL code:
26 
27  From the random package of Agner Fog:
28  multivariate_hypergeometric_distribution(), hypergeometric_distribution(),
29  hypergeometric_distribution_inversion_mode_(), hypergeometric_distribution_ratio_of_unifoms_()
30 
31  From the GSL (GNU Scientific Library):
32  multinomial_distribution() and hypergeometric_distribution_gsl()
33 
34  We have adapted their code for our needs. Below, we include the original copyright boilderplates
35  for all of the used functions, which all have been published under the GPL.
36  */
37 
38 /*
39  *************************** stoc1.cpp **********************************
40  * Author: Agner Fog
41  * Date created: 2002-01-04
42  * Last modified: 2008-11-30
43  * Project: stocc.zip
44  * Source URL: www.agner.org/random
45  *
46  * Description:
47  * Non-uniform random number generator functions.
48  *
49  * This file contains source code for the class StochasticLib1 defined in stocc.h.
50  *
51  * Documentation:
52  * ==============
53  * The file stocc.h contains class definitions.
54  * The file stocc.htm contains further instructions.
55  * The file distrib.pdf contains definitions of the statistic distributions.
56  * The file sampmet.pdf contains theoretical descriptions of the methods used
57  * for sampling from these distributions.
58  * The file ran-instructions.pdf contains general instructions.
59  *
60  * Copyright 2002-2008 by Agner Fog.
61  * GNU General Public License http://www.gnu.org/licenses/gpl.html
62  ****************************************************************************
63  */
64 
65 /*
66  * randist/multinomial.c
67  * Copyright (C) 2002 Gavin E. Crooks <gec@compbio.berkeley.edu>
68  *
69  * randist/hyperg.c
70  * Copyright (C) 1996, 1997, 1998, 1999, 2000, 2007 James Theiler, Brian Gough
71  *
72  * This program is free software; you can redistribute it and/or modify
73  * it under the terms of the GNU General Public License as published by
74  * the Free Software Foundation; either version 3 of the License, or (at
75  * your option) any later version.
76  *
77  * This program is distributed in the hope that it will be useful, but
78  * WITHOUT ANY WARRANTY; without even the implied warranty of
79  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
80  * General Public License for more details.
81  *
82  * You should have received a copy of the GNU General Public License
83  * along with this program; if not, write to the Free Software
84  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
85  */
86 
95 
98 
99 #include <cassert>
100 #include <cmath>
101 #include <cstdint>
102 #include <random>
103 #include <stdexcept>
104 #include <string>
105 
106 namespace genesis {
107 namespace utils {
108 
109 // =================================================================================================
110 // Multinomial Distribution
111 // =================================================================================================
112 
113 template<class T>
114 std::vector<size_t> multinomial_distribution_( std::vector<T> const& p, size_t n )
115 {
116  // The implementation follows the GSL function gsl_ran_multinomial(), under GPL 3
117  // https://www.gnu.org/software/gsl/doc/html/randist.html#the-multinomial-distribution
118  // See file gsl-2.7.1/randist/multinomial.c
119 
120  // Get the sum of all weights.
121  T norm = 0;
122  for( auto e : p ) {
123  norm += e;
124  }
125 
126  // For now, we use a global random engine (thread safe).
127  auto& engine = Options::get().random_engine();
128 
129  // Do the drawing, filling a result vector x.
130  auto x = std::vector<size_t>( p.size() );
131  T sum_p = 0;
132  size_t sum_n = 0;
133  for( size_t k = 0; k < p.size(); ++k ) {
134  if( p[k] > 0.0 ) {
135  assert( n >= sum_n );
136  assert( norm >= sum_p );
137 
138  // Need to cast to double here, so that this works with p of type size_t.
139  auto const binom_t = n - sum_n;
140  auto const binom_p = static_cast<double>( p[k] ) / static_cast<double>(norm - sum_p);
141  std::binomial_distribution<size_t> distrib( binom_t, binom_p );
142  x[k] = distrib( engine );
143  } else {
144  x[k] = 0;
145  }
146  sum_p += p[k];
147  sum_n += x[k];
148  }
149 
150  return x;
151 }
152 
153 std::vector<size_t> multinomial_distribution( std::vector<size_t> const& p, size_t n )
154 {
155  return multinomial_distribution_( p, n );
156 }
157 
158 std::vector<size_t> multinomial_distribution( std::vector<double> const& p, size_t n )
159 {
160  // Check the weights for validity.
161  for( auto e : p ) {
162  if( !std::isfinite(e) || e < 0.0 ) {
163  throw std::invalid_argument(
164  "Cannot compute multinomial distribution "
165  "if weights are not non-negative numbers: " + std::to_string( e )
166  );
167  }
168  }
169  return multinomial_distribution_( p, n );
170 }
171 
172 // =================================================================================================
173 // Hypergeometric Distribution
174 // =================================================================================================
175 
176 size_t hypergeometric_distribution_ratio_of_unifoms_( size_t n, size_t m, size_t N )
177 {
178  // Implementaiton based on Agner Fog, see https://www.agner.org/random/ published under GPL.
179  // Accessible at https://git.wur.nl/Biometris/articles/CRISPR_ABM/-/blob/master/stoc1.cpp
180 
181  // Subfunction for Hypergeometric distribution using the ratio-of-uniforms rejection method.
182  //
183  // The computation time hardly depends on the parameters, except that it matters a lot
184  // whether parameters are within the range where the log_factorial function is tabulated.
185  //
186  // Reference: E. Stadlober: "The ratio of uniforms approach for generating
187  // discrete random variates". Journal of Computational and Applied Mathematics,
188  // vol. 31, no. 1, 1990, pp. 181-189.
189 
190  // This code is valid for 0 < n <= m <= N/2. Assert this.
191  assert( 0 < n );
192  assert( n <= m );
193  assert( m <= N/2 );
194  assert( m + n <= N );
195 
196  // Helper function
197  auto fc_lnpk_ = []( size_t k, size_t L, size_t m, size_t n )
198  {
199  assert( m >= k );
200  assert( n >= k );
201  return log_factorial(k) + log_factorial(m - k) + log_factorial(n - k) + log_factorial(L + k);
202  };
203 
204  // rNN = 1/(N*(N+2)); mean = n*m/N
205  auto const L = N - m - n;
206  auto const rNN = 1.0 / (static_cast<double>(N) * static_cast<double>(N + 2));
207  auto const mean = (double)n * m * rNN * (N + 2);
208 
209  // mode = floor((n+1)*(m+1)/(N+2))
210  auto const mode = static_cast<size_t>(
211  static_cast<double>(n + 1) * static_cast<double>(m + 1) * rNN * N
212  );
213 
214  // variance
215  auto const var =
216  ( static_cast<double>(n) * m * (N - m) * (N - n) ) /
217  ( static_cast<double>(N) * N * (N - 1) )
218  ;
219 
220  // hat width h; hat center a; value at mode fm (maximum)
221  double const SHAT1 = 2.943035529371538573; // 8/e
222  double const SHAT2 = 0.8989161620588987408; // 3-sqrt(12/e)
223  auto const hyp_h = std::sqrt(SHAT1 * (var + 0.5)) + SHAT2;
224  auto const hyp_a = mean + 0.5;
225  auto const hyp_fm = fc_lnpk_(mode, L, m, n);
226 
227  // Safety upper bound
228  auto hyp_bound = static_cast<size_t>( hyp_a + 4.0 * hyp_h );
229  if( hyp_bound > n ) {
230  hyp_bound = n;
231  }
232 
233  // loop until accepted
234  auto& engine = Options::get().random_engine();
235  std::uniform_real_distribution<double> distrib( 0.0, 1.0 );
236  size_t k = 0;
237  while( true ) {
238  // uniform random number
239  double const u = distrib( engine );
240 
241  // avoid division by 0
242  if( u == 0 ) {
243  continue;
244  }
245 
246  // generate hat distribution, real sample
247  auto const x = hyp_a + hyp_h * (distrib( engine ) - 0.5) / u;
248 
249  if( x < 0. || x > 2E9 ) {
250  // reject, avoid overflow
251  continue;
252  }
253 
254  // integer sample
255  k = static_cast<size_t>(x);
256  if( k > hyp_bound ) {
257  // reject if outside range
258  continue;
259  }
260 
261  // ln(f(k))
262  auto const lf = hyp_fm - fc_lnpk_(k, L, m, n);
263  if( u * (4.0 - u) - 3.0 <= lf ) {
264  // lower squeeze accept
265  break;
266  }
267  if( u * (u - lf) > 1.0 ) {
268  // upper squeeze reject
269  continue;
270  }
271  if( 2.0 * log(u) <= lf ) {
272  // final acceptance
273  break;
274  }
275  }
276  return k;
277 }
278 
279 size_t hypergeometric_distribution_inversion_mode_( size_t n, size_t m, size_t N )
280 {
281  // Implementaiton based on Agner Fog, see https://www.agner.org/random/ published under GPL.
282  // Accessible at https://git.wur.nl/Biometris/articles/CRISPR_ABM/-/blob/master/stoc1.cpp
283 
284  // Subfunction for Hypergeometric distribution.
285  // Overflow protection is needed when N > 680 or n > 75.
286  //
287  // Hypergeometric distribution by inversion method, using down-up
288  // search starting at the mode using the chop-down technique.
289  //
290  // This method is faster than the rejection method when the variance is low.
291 
292  // Assumes 0 <= n <= m <= N/2.
293  assert( n <= m );
294  assert( m <= N/2 );
295  assert( N <= 680 );
296  assert( n <= 75 );
297 
298  // Helper constants
299  auto const Mp = static_cast<double>(m + 1);
300  auto const np = static_cast<double>(n + 1);
301  auto const p = Mp / (N + 2.0);
302  assert( N >= m + n );
303  auto const L = N - m - n;
304  double const L1 = static_cast<double>(L);
305 
306  // mode (real), mode (int), mode+1
307  auto const modef = np * p;
308  size_t hyp_mode = static_cast<size_t>( modef );
309  size_t hyp_mp = 0;
310  if( hyp_mode == modef && p == 0.5) {
311  hyp_mp = hyp_mode--;
312  } else {
313  hyp_mp = hyp_mode + 1;
314  }
315 
316  // mode probability, using log factorial function
317  assert( N >= n );
318  assert( N >= m );
319  assert( n >= hyp_mode );
320  assert( m >= hyp_mode );
321  auto const hyp_fm = std::exp(
322  log_factorial(N - m) - log_factorial(L + hyp_mode) - log_factorial(n - hyp_mode) +
323  log_factorial(m) - log_factorial(m - hyp_mode) - log_factorial(hyp_mode) -
324  log_factorial(N) + log_factorial(N - n) + log_factorial(n)
325  );
326 
327  // safety bound - guarantees at least 17 significant decimal digits
328  // bound = min(n, (int32_t)(modef + k*c'))
329  auto hyp_bound = static_cast<size_t>(
330  modef + 11.0 * std::sqrt( modef * (1.0 - p) * (1.0 - n / static_cast<double>(N)) + 1.0 )
331  );
332  if( hyp_bound > n) {
333  hyp_bound = n;
334  }
335 
336  // loop until accepted
337  auto& engine = Options::get().random_engine();
338  std::uniform_real_distribution<double> distrib( 0.0, 1.0 );
339  while( true ) {
340  // uniform random number to be converted
341  double U = distrib( engine );
342 
343  // start chop-down search at mode
344  if( (U -= hyp_fm) <= 0.0 ) {
345  return (hyp_mode);
346  }
347 
348  // factors in iteration
349  double c = hyp_fm;
350  double d = hyp_fm;
351 
352  // Loop counter, and float versions of it
353  size_t I;
354  double k1 = hyp_mp - 1;
355  double k2 = hyp_mode + 1;
356 
357  // alternating down- and upward search from the mode
358  for( I = 1; I <= hyp_mode; I++, k1--, k2++ ) {
359  // Downward search from k1 = hyp_mp - 1
360  // divisor, eliminated by scaling
361  auto divisor = (np - k1) * (Mp - k1);
362 
363  // Instead of dividing c with divisor, we multiply U and d because
364  // multiplication is faster. This will give overflow if N > 800
365  U *= divisor;
366  d *= divisor;
367  c *= k1 * (L1 + k1);
368  if( (U -= c) <= 0.0 ) {
369  return (hyp_mp - I - 1); // = k1 - 1
370  }
371 
372  // Upward search from k2 = hyp_mode + 1
373  divisor = k2 * (L1 + k2);
374 
375  // re-scale parameters to avoid time-consuming division
376  U *= divisor;
377  c *= divisor;
378  d *= (np - k2) * (Mp - k2);
379  if( (U -= d) <= 0.0 ) {
380  return (hyp_mode + I); // = k2
381  }
382  // Values of n > 75 or N > 680 may give overflow if you leave out this..
383  // overflow protection
384  // if( U > 1.E100) {U *= 1.E-100; c *= 1.E-100; d *= 1.E-100;}
385  }
386 
387  // Upward search from k2 = 2*mode + 1 to bound
388  for( k2 = I = hyp_mp + hyp_mode; I <= hyp_bound; I++, k2++ ) {
389  auto divisor = k2 * (L1 + k2);
390  U *= divisor;
391  d *= (np - k2) * (Mp - k2);
392  if( (U -= d) <= 0.0 ) {
393  return (I);
394  }
395  // more overflow protection
396  // if( U > 1.E100) {U *= 1.E-100; d *= 1.E-100;}
397  }
398  }
399 }
400 
401 size_t hypergeometric_distribution( size_t n1, size_t n2, size_t t )
402 {
403  // Implementaiton based on Agner Fog, see https://www.agner.org/random/ published under GPL.
404  // Accessible at https://git.wur.nl/Biometris/articles/CRISPR_ABM/-/blob/master/stoc1.cpp
405 
406  // We use the same arguments and order as the GSL function, for consistency,
407  // but here interally convert to the format used by the Agner Fog implementation.
408  // n = number of balls you take; m = number of red balls; N = total number of balls
409  size_t n = t;
410  size_t m = n1;
411  size_t N = n1 + n2;
412 
413  // Validity check
414  if( n > N ) {
415  throw std::invalid_argument(
416  "Invalid arguments for hypergeometric_distribution(), called with t == " +
417  std::to_string(t) + " > n1 + n2 == " + std::to_string( N ) +
418  ", as we cannot draw more values without replacement than there are values."
419  );
420  }
421 
422  // Symmetry transformations.
423  // We keep the values, so that we can undo the transformations later.
424  int fak = 1;
425  int64_t addd = 0;
426  if( m > N / 2 ) {
427  // invert m
428  m = N - m;
429  fak = -1;
430  addd = n;
431  }
432  if( n > N / 2 ) {
433  // invert n
434  n = N - n;
435  addd += fak * m;
436  fak = -fak;
437  }
438  if( n > m ) {
439  std::swap( n, m );
440  }
441  if( n == 0 ) {
442  // Cases with only one possible result end here
443  return addd;
444  }
445 
446  // Choose the method.
447  // This function uses inversion by chop-down search from the mode when
448  // parameters are small, and the ratio-of-uniforms method when the former
449  // method would be too slow or would give overflow.
450 
451  size_t x; // result
452  if( N > 680 || n > 70 ) {
453  // use ratio-of-uniforms method
455  } else {
456  // inversion method, using chop-down search from mode
458  }
459  // undo symmetry transformations
460  return x * fak + addd;
461 }
462 
463 size_t hypergeometric_distribution_gsl( size_t n1, size_t n2, size_t t )
464 {
465  // Unused at the moment, but kept here for reference.
466 
467  // The implementation follows the GSL function gsl_ran_hypergeometric(), under GPL 3
468  // https://www.gnu.org/software/gsl/doc/html/randist.html#the-hypergeometric-distribution
469  // See file gsl-2.7.1/randist/hyperg.c
470 
471  // An alternative more complex implementation is available by Agner Fog, see agner.org/random
472  // Accessible at https://git.wur.nl/Biometris/articles/CRISPR_ABM/-/blob/master/stoc1.cpp
473  // and published under GPL, which we implemented above, as it is faster for larger draws.
474 
475  // Boundary check
476  size_t const n = n1 + n2;
477  if( t > n ) {
478  t = n;
479  }
480 
481  // For now, we use a global random engine (thread safe).
482  auto& engine = Options::get().random_engine();
483  std::uniform_real_distribution<double> distrib( 0.0, 1.0 );
484 
485  size_t a = n1;
486  size_t b = n1 + n2;
487  size_t k = 0;
488  if( t < n / 2 ) {
489  for( size_t i = 0; i < t; i++ ) {
490  double const u = distrib( engine );
491  if( b * u < a ) {
492  k++;
493  if( k == n1 ) {
494  return k;
495  }
496  --a;
497  }
498  --b;
499  }
500  return k;
501  } else {
502  for( size_t i = 0; i < n - t; i++ ) {
503  double const u = distrib( engine );
504  if( b * u < a ) {
505  k++;
506  if( k == n1 ) {
507  return n1 - k;
508  }
509  --a;
510  }
511  --b;
512  }
513  return n1 - k;
514  }
515 }
516 
517 std::vector<size_t> multivariate_hypergeometric_distribution( std::vector<size_t> const& p, size_t n )
518 {
519  // Implementaiton based on Agner Fog, see https://www.agner.org/random/ published under GPL.
520  // Accessible at https://git.wur.nl/Biometris/articles/CRISPR_ABM/-/blob/master/stoc1.cpp
521 
522  // Prepare and check boundary conditions
523  auto x = std::vector<size_t>( p.size() );
524  if( p.empty() ) {
525  return x;
526  }
527 
528  // Compute total number of balls
529  size_t sum = 0;
530  for( auto e : p ) {
531  if( !std::isfinite(e) || e < 0.0 ) {
532  throw std::invalid_argument(
533  "Cannot compute multivariate hypergeometric distribution "
534  "if weights are not non-negative numbers: " + std::to_string( e )
535  );
536  }
537  sum += e;
538  }
539  if( n > sum ) {
540  throw std::invalid_argument(
541  "Cannot compute multivariate hypergeometric distribution "
542  "with n==" + std::to_string( n ) + " > sum(p)==" + std::to_string( sum ) +
543  ", as we cannot draw more values without replacement than there are values to draw."
544  );
545  }
546 
547  // Generate output by repeatedly drawing from the hypergeometric distribution
548  size_t i;
549  for( i = 0; i < p.size() - 1; ++i ) {
550  if( p[i] > 0.0 ) {
551  assert( sum >= p[i] );
552  x[i] = hypergeometric_distribution( p[i], sum - p[i], n );
553  n -= x[i];
554  sum -= p[i];
555  } else {
556  x[i] = 0;
557  }
558  }
559 
560  // Get the last value as the remainder
561  x[i] = n;
562  return x;
563 }
564 
565 } // namespace utils
566 } // namespace genesis
genesis::placement::swap
void swap(Sample &lhs, Sample &rhs)
Definition: sample.cpp:104
genesis::utils::hypergeometric_distribution_ratio_of_unifoms_
size_t hypergeometric_distribution_ratio_of_unifoms_(size_t n, size_t m, size_t N)
Definition: distribution.cpp:176
genesis::utils::log_factorial
double log_factorial(size_t n)
Return the logarithm (base e) of the factorial of n, that is log(n!).
Definition: binomial.cpp:445
binomial.hpp
genesis::utils::sum
double sum(const Histogram &h)
Definition: utils/math/histogram/stats.cpp:140
distribution.hpp
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: function/genome_locus.hpp:52
genesis::utils::multinomial_distribution
std::vector< size_t > multinomial_distribution(std::vector< size_t > const &p, size_t n)
Select a random sample following a multinomial distribution.
Definition: distribution.cpp:153
genesis::utils::multivariate_hypergeometric_distribution
std::vector< size_t > multivariate_hypergeometric_distribution(std::vector< size_t > const &p, size_t n)
Select a random sample following a multivariate hypergeometric distribution.
Definition: distribution.cpp:517
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::utils::Options::random_engine
std::default_random_engine & random_engine()
Return a thread-local engine for random number generation.
Definition: options.hpp:168
genesis::utils::mean
double mean(const Histogram &h)
Compute the bin-weighted arithmetic mean.
Definition: utils/math/histogram/stats.cpp:91
options.hpp
genesis::utils::hypergeometric_distribution_inversion_mode_
size_t hypergeometric_distribution_inversion_mode_(size_t n, size_t m, size_t N)
Definition: distribution.cpp:279
genesis::utils::Options::get
static Options & get()
Returns a single instance of this class.
Definition: options.hpp:68
genesis::utils::multinomial_distribution_
std::vector< size_t > multinomial_distribution_(std::vector< T > const &p, size_t n)
Definition: distribution.cpp:114
genesis::utils::hypergeometric_distribution_gsl
size_t hypergeometric_distribution_gsl(size_t n1, size_t n2, size_t t)
Definition: distribution.cpp:463
genesis::utils::hypergeometric_distribution
size_t hypergeometric_distribution(size_t n1, size_t n2, size_t t)
Select a random sample from a hypergeometric distribution.
Definition: distribution.cpp:401