A library for working with phylogenetic and population genetic data.
v0.32.0
fst_cathedral.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2024 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@sund.ku.dk>
20  University of Copenhagen, Globe Institute, Section for GeoGenetics
21  Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
22 */
23 
32 
35 
36 #include <cassert>
37 #include <cmath>
38 #include <cstdint>
39 #include <stdexcept>
40 #include <utility>
41 
42 namespace genesis {
43 namespace population {
44 
45 // =================================================================================================
46 // Fst Cathedral Accumulator
47 // =================================================================================================
48 
50 {
51  auto const pi_w_finite = std::isfinite( entry.pi_within );
52  auto const pi_b_finite = std::isfinite( entry.pi_between );
53  auto const pi_t_finite = std::isfinite( entry.pi_total );
54  return pi_w_finite && pi_b_finite && pi_t_finite;
55 }
56 
58 {
59  if( ! all_finite_( entry )) {
60  return;
61  }
62  pi_within_sum_ += entry.pi_within;
63  pi_between_sum_ += entry.pi_between;
64  pi_total_sum_ += entry.pi_total;
65  ++value_count_;
66 }
67 
69 {
70  // Boundary cases.
71  if( ! all_finite_( entry )) {
72  return;
73  }
74  if( value_count_ == 0 ) {
75  throw std::runtime_error(
76  "FstCathedralAccumulator: Cannot dissipate with value_count_==0"
77  );
78  }
79 
80  // Remove the values from the accumulator.
81  pi_within_sum_ -= entry.pi_within;
82  pi_between_sum_ -= entry.pi_between;
83  pi_total_sum_ -= entry.pi_total;
84  --value_count_;
85 
86  // Special case: all was removed. Even though we are using compensated summation,
87  // this serves as a small additional hard reset for cases where we know the state.
88  if( value_count_ == 0 ) {
89  reset();
90  }
91 }
92 
94 {
95  double result = 0.0;
96  switch( fst_estimator_ ) {
98  result = 1.0 - ( pi_within_sum_ / pi_total_sum_ );
99  break;
100  }
102  result = 1.0 - ( pi_within_sum_ / pi_between_sum_ );
103  break;
104  }
105  default: {
106  throw std::invalid_argument(
107  "FstCathedralAccumulator: Invalid FstPoolCalculatorUnbiased::Estimator"
108  );
109  }
110  }
111  return result;
112 }
113 
115 {
116  pi_within_sum_ = 0.0;
117  pi_between_sum_ = 0.0;
118  pi_total_sum_ = 0.0;
119 }
120 
121 // =================================================================================================
122 // Compute Data Functions
123 // =================================================================================================
124 
125 std::vector<FstCathedralPlotRecord> prepare_fst_cathedral_records_for_chromosome_(
126  std::string const& chromosome,
127  FstPoolProcessor const& processor,
129  std::vector<std::string> const& sample_names
130 ) {
131  // Make as many fst data objects as we have pairs of samples in the processor.
132  // Each item in the vector is one pair of samples, contaning their per-position partial fst data.
133  // We cannot resize the value field inside the results, as we do not know how large the
134  // chromosome is that we are using beforehand.
135  std::vector<FstCathedralPlotRecord> result;
136  result.resize( processor.size() );
137  assert( processor.size() == processor.calculators().size() );
138 
139  // If sample names are given, use those to name the results.
140  auto sample_name_pairs = fst_pool_processor_sample_names( processor, sample_names );
141  assert( sample_name_pairs.empty() || sample_name_pairs.size() == processor.size() );
142 
143  // We are at some chromosome now. We want to iterate while on that chromosome.
144  // We have a seq dict, so we know the length of the chromosome. First thought was to use that
145  // to pre-allocate the value vectors, but that might waaay over-allocate, as likely non-snp
146  // positions will be filtered out beforehand in the input iterator. So we don't do that.
147  auto const fst_name = fst_pool_unbiased_estimator_to_string( fst_estimator );
148  for( size_t i = 0; i < result.size(); ++i ) {
149  auto& record = result[i];
150 
151  record.chromosome_name = chromosome;
152  record.fst_estimator = fst_estimator;
153 
154  // If we have sample names, use them.
155  if( !sample_name_pairs.empty() ) {
156  assert( sample_name_pairs.size() == result.size() );
157  record.sample_name_1 = std::move( sample_name_pairs[i].first );
158  record.sample_name_2 = std::move( sample_name_pairs[i].second );
159  }
160 
161  // We also make a title, for user convenience, to be used in the plot by default.
162  // Could make that configurable, but good enough for now.
163  record.title = "Fst (" + fst_name + ")";
164  if( !record.sample_name_1.empty() && !record.sample_name_2.empty() ) {
165  record.title += " " + record.sample_name_1 + " vs " + record.sample_name_2;
166  record.plot_name = record.sample_name_1 + "." + record.sample_name_2;
167  } else {
168  record.plot_name = "plot";
169  }
170  if( !record.chromosome_name.empty() ) {
171  record.title += ", chromosome: " + record.chromosome_name;
172  }
173  }
174  return result;
175 }
176 
178  FstPoolProcessor const& processor,
179  std::vector<FstCathedralPlotRecord>& records,
180  size_t position
181 ) {
182  // We need to cast the calculators in the processor to get the correct derived type,
183  // so that we can access the partial pi values from there.
184  // Bit hacky, but good enough for now. Then, store the results.
185  assert( processor.size() == records.size() );
186  for( size_t i = 0; i < processor.size(); ++i ) {
187  auto const& raw_calc = processor.calculators()[i];
188  auto const* fst_calc = dynamic_cast<FstPoolCalculatorUnbiased const*>( raw_calc.get() );
189  if( ! fst_calc ) {
190  throw std::runtime_error(
191  "In compute_fst_cathedral_records_for_chromosome(): "
192  "Invalid FstPoolCalculator that is not FstPoolCalculatorUnbiased"
193  );
194  }
195 
196  // We are only using the sum at the moment, to keep it simple.
197  // The below function to get the pi values is a shortcut without window averaging,
198  // which always returns the sum anyway, so the policy here is ignored either way.
199  // But we check it, in order to make sure the user set up everything as intended.
200  if( fst_calc->get_window_average_policy() != WindowAveragePolicy::kSum ) {
201  throw std::runtime_error(
202  "In compute_fst_cathedral_records_for_chromosome(): "
203  "Invalid FstPoolCalculator that is not using WindowAveragePolicy::kSum"
204  );
205  }
206 
207  // Now add the entry for the current calculator to its respective records entry.
208  // We rely on the amortized complexity here - cannot pre-allocate the size,
209  // as we do not know how many positions will actually be in the input beforehand.
210  // For the window averaging, we do not need the window details or mask,
211  // so we use the overload here that does no window averaging.
212  auto const pis = fst_calc->get_pi_values();
213  records[i].entries.emplace_back(
214  position, pis.pi_within, pis.pi_between, pis.pi_total
215  );
216  }
217 }
218 
219 std::vector<FstCathedralPlotRecord> compute_fst_cathedral_records_for_chromosome(
221  FstPoolProcessor& processor,
223  std::vector<std::string> const& sample_names,
224  std::shared_ptr<genesis::sequence::SequenceDict> const& sequence_dict
225 ) {
226  // Boundary check.
227  if( ! iterator ) {
228  return {};
229  }
230 
231  // Prepare a vector of records, one for each fst calculator, with their respective meta-data.
232  std::string const chromosome = iterator->chromosome;
234  chromosome, processor, fst_estimator, sample_names
235  );
236  assert( result.size() == processor.size() );
237 
238  // Process all variants in the input as long as we are on the same chromosome,
239  // and run them through the processor, storing all results in the result.
240  size_t cur_pos = 0;
241  while( iterator && iterator->chromosome == chromosome ) {
242 
243  // Process a single Variant, so reset at every step.
244  auto const& variant = *iterator;
245  processor.reset();
246  processor.process( variant );
247 
248  // Make sure that we are in order. Otherwise the whole downstream approach fails.
249  if( variant.position <= cur_pos ) {
250  throw std::runtime_error(
251  "Unsorted positions in input: On chromosome \"" + chromosome + "\", position " +
252  std::to_string( variant.position ) + " follows position " +
253  std::to_string( cur_pos ) + ", which is not in strict ordering."
254  );
255  }
256  cur_pos = variant.position;
257 
258  // Create entries in the records of each processor, and move to next position.
259  fill_fst_cathedral_records_from_processor_( processor, result, variant.position );
260  ++iterator;
261  }
262 
263  // If we have an entry for it in the seq dict, we use that as the total length,
264  // so that downstream plots show the correct length. If not, we use what's in the data.
265  if( sequence_dict && sequence_dict->contains( chromosome )) {
266  cur_pos = sequence_dict->get( chromosome ).length;
267  }
268  for( auto& data : result ) {
269  data.chromosome_length = cur_pos;
270  }
271 
272  return result;
273 }
274 
275 std::vector<FstCathedralPlotRecord> compute_fst_cathedral_records(
276  VariantInputStream& iterator,
277  FstPoolProcessor& processor,
279  std::vector<std::string> const& sample_names,
280  std::shared_ptr<genesis::sequence::SequenceDict> const& sequence_dict
281 ) {
282  // We make one big result vector with all entries from all pairs of samples and chromosomes.
283  std::vector<FstCathedralPlotRecord> result;
284 
285  // Start the iteration, process each chromosome, and move over the results.
286  auto it = iterator.begin();
287  while( it ) {
289  it, processor, fst_estimator, sample_names, sequence_dict
290  );
291  assert( chr_result.size() == processor.size() );
292 
293  // Move the data for one chromosome (for each pair of samples) to the result.
294  result.reserve( result.size() + chr_result.size() );
295  for( size_t i = 0; i < chr_result.size(); ++i ) {
296  result.push_back( std::move( chr_result[i] ));
297  }
298  }
299  assert( it == iterator.end() );
300  return result;
301 }
302 
303 // =================================================================================================
304 // Storage Functions
305 // =================================================================================================
306 
308  FstCathedralPlotRecord const& record
309 ) {
310  using namespace genesis::utils;
311 
312  // Get the base class fields. This also sets up the document.
313  auto document = cathedral_plot_record_to_json_document( record );
314 
315  // We expect a top-level Json object, to be filled with our data.
316  auto& obj = document.get_object();
317  auto const fst_name = fst_pool_unbiased_estimator_to_string( record.fst_estimator );
318  obj["sampleName1"] = JsonDocument::string( record.sample_name_1 );
319  obj["sampleName2"] = JsonDocument::string( record.sample_name_2 );
320  obj["fstEstimator"] = JsonDocument::string( fst_name );
321  obj["entryCount"] = JsonDocument::number_unsigned( record.entries.size() );
322 
323  return document;
324 }
325 
326 } // namespace population
327 } // namespace genesis
genesis::population::FstPoolProcessor::size
size_t size() const
Get the total number of calculates, i.e., the number of pairs of samples for which we comute FST here...
Definition: fst_pool_processor.hpp:172
genesis::population::FstPoolProcessor
Helper class to iterate over Variants and process pairs of FST between their samples (SampleCounts),...
Definition: fst_pool_processor.hpp:67
genesis::sequence::SequenceDict::Entry::length
size_t length
Definition: sequence_dict.hpp:74
genesis::population::FstPoolProcessor::calculators
std::vector< std::unique_ptr< BaseFstPoolCalculator > > const & calculators() const
Definition: fst_pool_processor.hpp:391
genesis::population::FstCathedralPlotRecord::entries
std::vector< Entry > entries
Definition: fst_cathedral.hpp:92
genesis::sequence::SequenceDict::contains
bool contains(std::string const &name) const
Definition: sequence_dict.hpp:252
genesis::population::compute_fst_cathedral_records_for_chromosome
std::vector< FstCathedralPlotRecord > compute_fst_cathedral_records_for_chromosome(VariantInputStream::Iterator &iterator, FstPoolProcessor &processor, FstPoolCalculatorUnbiased::Estimator fst_estimator, std::vector< std::string > const &sample_names, std::shared_ptr< genesis::sequence::SequenceDict > const &sequence_dict)
Compute the components of per-position FST data for all pairs of samples in the given processor,...
Definition: fst_cathedral.cpp:219
genesis::population::FstCathedralPlotRecord::Entry::pi_between
double pi_between
Definition: fst_cathedral.hpp:87
genesis::utils::GenericInputStream::begin
Iterator begin()
Definition: generic_input_stream.hpp:852
genesis::population::fill_fst_cathedral_records_from_processor_
void fill_fst_cathedral_records_from_processor_(FstPoolProcessor const &processor, std::vector< FstCathedralPlotRecord > &records, size_t position)
Definition: fst_cathedral.cpp:177
genesis::population::FstCathedralAccumulator::dissipate
void dissipate(FstCathedralPlotRecord::Entry const &entry)
Definition: fst_cathedral.cpp:68
genesis::population::FstPoolProcessor::reset
void reset()
Definition: fst_pool_processor.hpp:179
genesis::population::FstCathedralPlotRecord::sample_name_2
std::string sample_name_2
Definition: fst_cathedral.hpp:96
genesis::utils::JsonDocument
Store a Json value of any kind.
Definition: json/document.hpp:114
genesis::population::FstPoolCalculatorUnbiased
Compute our unbiased F_ST statistic for pool-sequenced data for two ranges of SampleCountss.
Definition: fst_pool_unbiased.hpp:82
genesis::population::FstPoolCalculatorUnbiased::Estimator
Estimator
Definition: fst_pool_unbiased.hpp:90
genesis::population::fst_cathedral_plot_record_to_json_document
genesis::utils::JsonDocument fst_cathedral_plot_record_to_json_document(FstCathedralPlotRecord const &record)
Get a user-readable description of the data of a FstCathedralPlotRecord as a JsonDocument.
Definition: fst_cathedral.cpp:307
genesis::population::fst_pool_processor_sample_names
std::vector< std::pair< std::string, std::string > > fst_pool_processor_sample_names(FstPoolProcessor const &processor, std::vector< std::string > const &sample_names)
Return a list of sample name pairs for each calculator in an FstPoolProcessor.
Definition: fst_pool_processor.hpp:578
genesis::utils::GenericInputStream::end
Iterator end()
Definition: generic_input_stream.hpp:861
genesis::population::FstCathedralAccumulator::accumulate
void accumulate(FstCathedralPlotRecord::Entry const &entry)
Definition: fst_cathedral.cpp:57
genesis::utils
Definition: placement/formats/edge_color.hpp:42
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: function/genome_locus.hpp:52
window_average.hpp
document.hpp
genesis::population::WindowAveragePolicy::kSum
@ kSum
Simply report the total sum, with no averaging, i.e., the absolute value of the metric.
genesis::population::FstPoolCalculatorUnbiased::Estimator::kHudson
@ kHudson
genesis::population::FstCathedralAccumulator::reset
void reset()
Definition: fst_cathedral.cpp:114
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::utils::GenericInputStream::Iterator
Internal iterator over the data.
Definition: generic_input_stream.hpp:196
genesis::population::prepare_fst_cathedral_records_for_chromosome_
std::vector< FstCathedralPlotRecord > prepare_fst_cathedral_records_for_chromosome_(std::string const &chromosome, FstPoolProcessor const &processor, FstPoolCalculatorUnbiased::Estimator fst_estimator, std::vector< std::string > const &sample_names)
Definition: fst_cathedral.cpp:125
genesis::population::FstPoolProcessor::process
void process(Variant const &variant)
Definition: fst_pool_processor.hpp:213
genesis::population::FstCathedralPlotRecord::fst_estimator
FstPoolCalculatorUnbiased::Estimator fst_estimator
Definition: fst_cathedral.hpp:101
genesis::population::FstCathedralPlotRecord::Entry
Definition: fst_cathedral.hpp:76
genesis::population::fst_pool_unbiased_estimator_to_string
std::string fst_pool_unbiased_estimator_to_string(FstPoolCalculatorUnbiased::Estimator estimator)
Definition: fst_pool_unbiased.hpp:446
genesis::population::FstCathedralPlotRecord::sample_name_1
std::string sample_name_1
Definition: fst_cathedral.hpp:95
genesis::population::cathedral_plot_record_to_json_document
genesis::utils::JsonDocument cathedral_plot_record_to_json_document(CathedralPlotRecord const &record)
Get a user-readable description of the data of a CathedralPlotRecord as a JsonDocument.
Definition: cathedral_plot.cpp:196
genesis::population::FstCathedralPlotRecord
Data for making one FST cathedral plot, that is, one pair of samples and one chromosome.
Definition: fst_cathedral.hpp:74
genesis::sequence::SequenceDict::get
Entry const & get(std::string const &name) const
Definition: sequence_dict.hpp:233
fst_cathedral.hpp
genesis::population::FstCathedralPlotRecord::Entry::pi_within
double pi_within
Definition: fst_cathedral.hpp:86
genesis::population::FstCathedralPlotRecord::Entry::pi_total
double pi_total
Definition: fst_cathedral.hpp:88
genesis::population::all_finite_
bool all_finite_(FstCathedralPlotRecord::Entry const &entry)
Definition: fst_cathedral.cpp:49
genesis::population::FstPoolCalculatorUnbiased::Estimator::kNei
@ kNei
genesis::utils::GenericInputStream
Type erasure for iterators, using std::function to eliminate the underlying input type.
Definition: generic_input_stream.hpp:163
genesis::population::FstCathedralAccumulator::aggregate
double aggregate() const
Definition: fst_cathedral.cpp:93
genesis::population::compute_fst_cathedral_records
std::vector< FstCathedralPlotRecord > compute_fst_cathedral_records(VariantInputStream &iterator, FstPoolProcessor &processor, FstPoolCalculatorUnbiased::Estimator fst_estimator, std::vector< std::string > const &sample_names, std::shared_ptr< genesis::sequence::SequenceDict > const &sequence_dict)
Compute the components of per-position FST data for all pairs of samples in the given processor,...
Definition: fst_cathedral.cpp:275