A library for working with phylogenetic and population genetic data.
v0.32.0
subsample.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2024 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@sund.ku.dk>
20  University of Copenhagen, Globe Institute, Section for GeoGenetics
21  Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
22 */
23 
32 
37 
38 #include <cassert>
39 #include <cmath>
40 #include <stdexcept>
41 
42 namespace genesis {
43 namespace population {
44 
45 // =================================================================================================
46 // Scaling
47 // =================================================================================================
48 
50  SampleCounts& sample,
51  size_t target_depth,
52  bool skip_if_below_target_depth
53 ) {
54  // Get the total sum. If this does not exceed the max, we are done already.
55  size_t const total_sum = sample_counts_sum(sample);
56  if( skip_if_below_target_depth && total_sum <= target_depth ) {
57  return;
58  }
59 
60  // Scale the numbers, which rounds to the lower integer.
61  // We store the result in intermediate values first, as we might need the originals later.
62  double const scale = static_cast<double>(target_depth) / static_cast<double>(total_sum);
63  size_t a_count = static_cast<size_t>( static_cast<double>( sample.a_count ) * scale );
64  size_t c_count = static_cast<size_t>( static_cast<double>( sample.c_count ) * scale );
65  size_t g_count = static_cast<size_t>( static_cast<double>( sample.g_count ) * scale );
66  size_t t_count = static_cast<size_t>( static_cast<double>( sample.t_count ) * scale );
67  size_t n_count = static_cast<size_t>( static_cast<double>( sample.n_count ) * scale );
68  size_t d_count = static_cast<size_t>( static_cast<double>( sample.d_count ) * scale );
69 
70  // Now we might have some remainder due to the integer rounding, which we want to proportionally
71  // distribute across the numbers, so that the largest count gets most. We only processed 6
72  // numbers, so the remainder of the rounding is less than 6.
73  size_t const new_sum = a_count + c_count + g_count + t_count + n_count + d_count;
74  assert( new_sum <= target_depth );
75  size_t const remainder = target_depth - new_sum;
76  assert( remainder < 6 );
77 
78  // Now we distribute the remainder across the counts, starting at the highest, to stay
79  // as close as possible to proportional scaling. This is a bit expensive, but should be okay.
80  // Well yes, quick test shows that 1000 calls of this function take about 1ms... fast enough!
81  if( remainder > 0 ) {
82 
83  // First we compute the fractions of each nucleotide, and get a sorting order of this,
84  // based on the original counts. We also get pointers to the values for index access.
85  auto frac = std::array<double, 6>{{
86  static_cast<double>( sample.a_count ) / static_cast<double>( total_sum ),
87  static_cast<double>( sample.c_count ) / static_cast<double>( total_sum ),
88  static_cast<double>( sample.g_count ) / static_cast<double>( total_sum ),
89  static_cast<double>( sample.t_count ) / static_cast<double>( total_sum ),
90  static_cast<double>( sample.n_count ) / static_cast<double>( total_sum ),
91  static_cast<double>( sample.d_count ) / static_cast<double>( total_sum )
92  }};
93 
94  // We use a fixed length sorting algorithm here, as the standard sort is orders of magnitue
95  // slower, which would add significant runtime to using this function for every position
96  // in the genome.
97  // auto const sorting_order = utils::sort_indices( frac.begin(), frac.end() );
98  auto const sorting_order = sample_counts_sorting_order( frac );
99 
100  // We need index-based access to the counts, so we make a list of pointers.
101  auto const count_refs = std::array<size_t*, 6>{{
102  &a_count, &c_count, &g_count, &t_count, &n_count, &d_count
103  }};
104 
105  // Now we distribute, so that the remainder is split proportionally. We have 1-5 counts
106  // to distribute. We can think of this as splitting the unit interval into 1-5 intervals,
107  // and distribute one additional count per interval. To this end, we give extra counts
108  // to whichever nucleotide "dominates" that interval, i.e., holds the majority (more than
109  // half of the interval range). That can be done by simply checking which one has the
110  // largest fraction still, and reducing every time we use the fraction.
111  double const interval_len = 1.0 / static_cast<double>( remainder );
112  for( size_t i = 0; i < remainder; ++i ) {
113 
114  // Find the count that has the largest fraction still.
115  // We can stop early if the fraction is more than a whole interval, as in that case,
116  // it for sure will be the majority in the interval, as the fractions are sorted.
117  double max_f = 0.0;
118  size_t max_k = 0;
119  for( size_t k = 0; k < 6; ++k ) {
120  if( frac[sorting_order[k]] > max_f ) {
121  max_f = frac[sorting_order[k]];
122  max_k = k;
123  }
124  if( frac[sorting_order[k]] >= interval_len ) {
125  break;
126  }
127  }
128 
129  // Assign one of the remainder to that count, and reduce its fraction, so that in the
130  // next iteration, it does not contribute as much any more. This can set the fraction
131  // below zero, but that doesn't matter for the comparisons above. Any fraction that is
132  // reduced to below zero here had less than one intervals worth of range anyway, so
133  // it cannot be the majority in any further intervals later any more.
134  frac[sorting_order[max_k]] -= interval_len;
135  ++(*count_refs[sorting_order[max_k]]);
136  }
137  }
138 
139  // Now set the values of the sample to our computed counts.
140  sample.a_count = a_count;
141  sample.c_count = c_count;
142  sample.g_count = g_count;
143  sample.t_count = t_count;
144  sample.n_count = n_count;
145  sample.d_count = d_count;
146  assert( sample_counts_sum( sample ) == target_depth );
147 }
148 
150  SampleCounts& sample,
151  size_t max_depth
152 ) {
153  rescale_counts_( sample, max_depth, true );
154 }
155 
157  Variant& variant,
158  size_t max_depth
159 ) {
160  // Call the transformation for each sample in the variant.
161  for( auto& sample : variant.samples ) {
162  rescale_counts_( sample, max_depth, true );
163  }
164 }
165 
167  SampleCounts& sample,
168  size_t target_depth
169 ) {
170  rescale_counts_( sample, target_depth, false );
171 }
172 
174  Variant& variant,
175  size_t target_depth
176 ) {
177  // Call the transformation for each sample in the variant.
178  for( auto& sample : variant.samples ) {
179  rescale_counts_( sample, target_depth, false );
180  }
181 }
182 
183 // =================================================================================================
184 // Sampling
185 // =================================================================================================
186 
191 template<typename Distribution>
193  SampleCounts& sample,
194  size_t max_depth,
195  Distribution distribution,
196  bool skip_if_below_target_depth
197 ) {
198  // Get the total sum. If this does not exceed the max, we are done already.
199  size_t const total_sum = sample_counts_sum(sample);
200  if( skip_if_below_target_depth && total_sum <= max_depth ) {
201  return;
202  }
203 
204  // Make a random draw from a multivariate distrubiton with the counts.
205  auto const new_counts = distribution(
206  std::vector<size_t>{{
207  sample.a_count,
208  sample.c_count,
209  sample.g_count,
210  sample.t_count,
211  sample.n_count,
212  sample.d_count
213  }},
214  max_depth
215  );
216  assert( new_counts.size() == 6 );
217 
218  // Set the sample counts
219  sample.a_count = new_counts[0];
220  sample.c_count = new_counts[1];
221  sample.g_count = new_counts[2];
222  sample.t_count = new_counts[3];
223  sample.n_count = new_counts[4];
224  sample.d_count = new_counts[5];
225  assert( sample_counts_sum( sample ) == max_depth );
226 }
227 
229  SampleCounts& sample,
230  size_t max_depth
231 ) {
232  // Call the local helper function template, to avoid code duplication.
233  return resample_counts_<std::vector<size_t>(*)(std::vector<size_t> const&, size_t)>(
234  sample, max_depth, utils::multinomial_distribution, true
235  );
236 }
237 
239  Variant& variant,
240  size_t max_depth
241 ) {
242  // Call the transformation for each sample in the variant.
243  for( auto& sample : variant.samples ) {
244  subsample_counts_with_replacement( sample, max_depth );
245  }
246 }
247 
249  SampleCounts& sample,
250  size_t target_depth
251 ) {
252  // Call the local helper function template, to avoid code duplication.
253  return resample_counts_<std::vector<size_t>(*)(std::vector<size_t> const&, size_t)>(
254  sample, target_depth, utils::multinomial_distribution, false
255  );
256 }
257 
259  Variant& variant,
260  size_t target_depth
261 ) {
262  // Call the transformation for each sample in the variant.
263  for( auto& sample : variant.samples ) {
264  resample_counts( sample, target_depth );
265  }
266 }
267 
269  SampleCounts& sample,
270  size_t max_depth
271 ) {
272  // Call the local helper function template, to avoid code duplication.
273  return resample_counts_<std::vector<size_t>(*)(std::vector<size_t> const&, size_t)>(
274  sample, max_depth, utils::multivariate_hypergeometric_distribution, true
275  );
276 }
277 
279  Variant& variant,
280  size_t max_depth
281 ) {
282  // Call the transformation for each sample in the variant.
283  for( auto& sample : variant.samples ) {
284  subsample_counts_without_replacement( sample, max_depth );
285  }
286 }
287 
288 } // namespace population
289 } // namespace genesis
genesis::population::resample_counts
void resample_counts(SampleCounts &sample, size_t target_depth)
Resample all counts in a SampleCounts sample to a new target_depth.
Definition: subsample.cpp:248
algorithm.hpp
Provides some valuable algorithms that are not part of the C++ 11 STL.
genesis::population::rescale_counts_
void rescale_counts_(SampleCounts &sample, size_t target_depth, bool skip_if_below_target_depth)
Definition: subsample.cpp:49
subsample.hpp
functions.hpp
genesis::population::SampleCounts
One set of nucleotide sample counts, for example for a given sample that represents a pool of sequenc...
Definition: sample_counts.hpp:56
genesis::population::SampleCounts::a_count
size_type a_count
Count of all A nucleotides that are present in the sample.
Definition: sample_counts.hpp:66
distribution.hpp
genesis::utils::scale
void scale(Histogram &h, double factor)
Definition: operations.cpp:54
genesis::population::SampleCounts::t_count
size_type t_count
Count of all T nucleotides that are present in the sample.
Definition: sample_counts.hpp:81
genesis::population::resample_counts_
void resample_counts_(SampleCounts &sample, size_t max_depth, Distribution distribution, bool skip_if_below_target_depth)
Local helper function to avoid code duplication. Takes the distribution (with or without replacement)...
Definition: subsample.cpp:192
genesis::utils::multinomial_distribution
std::vector< size_t > multinomial_distribution(std::vector< size_t > const &p, size_t n)
Select a random sample following a multinomial distribution.
Definition: distribution.cpp:153
logging.hpp
Provides easy and fast logging functionality.
genesis::population::SampleCounts::n_count
size_type n_count
Count of all N (undetermined/any) nucleotides that are present in the sample.
Definition: sample_counts.hpp:86
genesis::population::SampleCounts::c_count
size_type c_count
Count of all C nucleotides that are present in the sample.
Definition: sample_counts.hpp:71
genesis::utils::multivariate_hypergeometric_distribution
std::vector< size_t > multivariate_hypergeometric_distribution(std::vector< size_t > const &p, size_t n)
Select a random sample following a multivariate hypergeometric distribution.
Definition: distribution.cpp:517
genesis::population::Variant
A single variant at a position in a chromosome, along with SampleCounts for a set of samples.
Definition: variant.hpp:65
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::subsample_counts_with_replacement
void subsample_counts_with_replacement(SampleCounts &sample, size_t max_depth)
Transform a SampleCounts sample by subsampling the nucleotide counts (A, C, G, T, as well as N and D)...
Definition: subsample.cpp:228
genesis::population::subsample_counts_without_replacement
void subsample_counts_without_replacement(SampleCounts &sample, size_t max_depth)
Transform a SampleCounts sample by subsampling the nucleotide counts (A, C, G, T, as well as N and D)...
Definition: subsample.cpp:268
genesis::population::Variant::samples
std::vector< SampleCounts > samples
Definition: variant.hpp:82
genesis::population::subscale_counts
void subscale_counts(SampleCounts &sample, size_t max_depth)
Transform a SampleCounts sample by sub-scaling the base counts (A, C, G, T, as well as N and D) to su...
Definition: subsample.cpp:149
genesis::population::SampleCounts::d_count
size_type d_count
Count of all deleted (*) nucleotides that are present in the sample.
Definition: sample_counts.hpp:91
genesis::population::rescale_counts
void rescale_counts(SampleCounts &sample, size_t target_depth)
Transform a SampleCounts sample by re-scaling the base counts (A, C, G, T, as well as N and D) to sum...
Definition: subsample.cpp:166
genesis::population::sample_counts_sum
constexpr size_t sample_counts_sum(SampleCounts const &sample)
Sum up all the base counts at this sample, that is, the sum of all A, C, G, T, as well as the N and D...
Definition: population/function/functions.hpp:318
genesis::population::sample_counts_sorting_order
std::array< size_t, 6 > sample_counts_sorting_order(std::array< T, 6 > const &v)
Return the sorting order of six values, for instance of the four nucleotides ACGT and the N and D cou...
Definition: population/function/functions.hpp:165
genesis::population::SampleCounts::g_count
size_type g_count
Count of all G nucleotides that are present in the sample.
Definition: sample_counts.hpp:76