A library for working with phylogenetic and population genetic data.
v0.32.0
measures.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2022 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
32 
42 
52 #include "genesis/tree/tree.hpp"
53 
58 
59 #include <algorithm>
60 #include <cassert>
61 #include <map>
62 #include <stdexcept>
63 
64 #ifdef GENESIS_OPENMP
65 # include <omp.h>
66 #endif
67 
68 namespace genesis {
69 namespace placement {
70 
71 // =================================================================================================
72 // Expected Distance between Placement Locations
73 // =================================================================================================
74 
75 double edpl( Pquery const& pquery, utils::Matrix<double> const& node_distances )
76 {
77  double result = 0.0;
78 
79  #pragma omp parallel for
80  for( size_t i = 0; i < pquery.placement_size(); ++i ) {
81 
82  #pragma omp parallel for
83  for( size_t j = i + 1; j < pquery.placement_size(); ++j ) {
84 
85  auto const& place_i = pquery.placement_at(i);
86  auto const& place_j = pquery.placement_at(j);
87 
88  auto const dist = placement_distance( place_i, place_j, node_distances );
89 
90  #pragma omp atomic
91  result += place_i.like_weight_ratio * place_j.like_weight_ratio * dist;
92  }
93  }
94  return 2 * result;
95 }
96 
97 std::vector<double> edpl( Sample const& sample, utils::Matrix<double> const& node_distances )
98 {
99  // Prepare result (facilitate copy elision).
100  auto result = std::vector<double>( sample.size(), 0 );
101 
102  // Fill result vector.
103  #pragma omp parallel for
104  for( size_t qi = 0; qi < sample.size(); ++qi ) {
105  auto const& pquery = sample.at( qi );
106  result[qi] = edpl( pquery, node_distances );
107  }
108  return result;
109 }
110 
111 double edpl( Sample const& sample, Pquery const& pquery )
112 {
113  auto const node_distances = node_branch_length_distance_matrix( sample.tree() );
114  return edpl( pquery, node_distances );
115 }
116 
117 std::vector<double> edpl( Sample const& sample )
118 {
119  auto const node_distances = node_branch_length_distance_matrix( sample.tree() );
120  return edpl( sample, node_distances );
121 }
122 
123 // =================================================================================================
124 // Pairwise Distance
125 // =================================================================================================
126 
128  const Sample& smp_a,
129  const Sample& smp_b,
130  bool with_pendant_length
131 ) {
132  if (!compatible_trees(smp_a, smp_b)) {
133  throw std::invalid_argument("pairwise_distance: Incompatible trees.");
134  }
135 
136  // Init.
137  double sum = 0.0;
138 
139  // Create PqueryPlain objects for every placement and copy all interesting data into it.
140  // This way, we won't have to do all the pointer dereferencing during the actual calculations,
141  // and furthermore, the data is close in memory. this gives a tremendous speedup!
142  std::vector<PqueryPlain> const pqueries_a = plain_queries( smp_a );
143  std::vector<PqueryPlain> const pqueries_b = plain_queries( smp_b );
144 
145  // Calculate a matrix containing the pairwise distance between all nodes. This way, we
146  // do not need to search a path between placements every time. We use the tree of the first smp
147  // here, ignoring branch lengths on tree b.
148  // FIXME this might be made better by using average or so in the future.
149  auto node_distances = node_branch_length_distance_matrix(smp_a.tree());
150 
151  for (const PqueryPlain& pqry_a : pqueries_a) {
152  for (const PqueryPlain& pqry_b : pqueries_b) {
153  auto dist = pquery_distance( pqry_a, pqry_b, node_distances, with_pendant_length );
154  dist *= pqry_a.multiplicity * pqry_b.multiplicity;
155  sum += dist;
156  }
157  }
158 
159  // Return normalized value.
161 }
162 
163 // =================================================================================================
164 // Variance
165 // =================================================================================================
166 
173 static double variance_partial_ (
174  PqueryPlain const& pqry_a,
175  std::vector<PqueryPlain> const& pqrys_b,
176  utils::Matrix<double> const& node_distances,
177  bool with_pendant_length
178 ) {
179  double partial = 0.0;
180 
181  for (const PqueryPlain& pqry_b : pqrys_b) {
182  // Check whether it is same pquery (a=b, nothing to do, as their distance is zero),
183  // or a pair of pqueries that was already calculated (a>b, skip it also).
184  if (pqry_a.index >= pqry_b.index) {
185  continue;
186  }
187  double dist = pquery_distance(pqry_a, pqry_b, node_distances, with_pendant_length);
188  dist *= pqry_a.multiplicity * pqry_b.multiplicity;
189  partial += dist * dist;
190  }
191 
192  return partial;
193 }
194 
195 double variance(
196  const Sample& smp,
197  bool with_pendant_length
198 ) {
199  // Init.
200  double variance = 0.0;
201 
202  // Create PqueryPlain objects for every placement and copy all interesting data into it.
203  // This way, we won't have to do all the pointer dereferencing during the actual calculations,
204  // and furthermore, the data is close in memory. This gives a tremendous speedup!
205  std::vector<PqueryPlain> vd_pqueries = plain_queries( smp );
206 
207  // Also, calculate a matrix containing the pairwise distance between all nodes. this way, we
208  // do not need to search a path between placements every time.
209  auto node_distances = node_branch_length_distance_matrix(smp.tree());
210 
211  // Do a pairwise calculation on all placements.
212  // int progress = 0;
213  #pragma omp parallel for
214  for( size_t i = 0; i < vd_pqueries.size(); ++i ) {
215  // LOG_PROG(++progress, vd_pqueries.size()) << "of Variance() finished.";
216 
217  PqueryPlain const& pqry_a = vd_pqueries[i];
218  variance += variance_partial_(pqry_a, vd_pqueries, node_distances, with_pendant_length);
219  }
220 
221  // Calculate normalizing factor. Should be the same value as given by placement_mass(),
222  // however this calculation is probably faster, as we already have the plain values at hand.
223  double mass = 0.0;
224  for( auto const& pqry : vd_pqueries ) {
225  for( auto const& place : pqry.placements ) {
226  mass += place.like_weight_ratio * pqry.multiplicity;
227  }
228  }
229 
230  // Return the normalized value.
231  return ((variance / mass) / mass);
232 }
233 
234 } // namespace placement
235 } // namespace genesis
genesis::placement::total_placement_mass_with_multiplicities
double total_placement_mass_with_multiplicities(Sample const &smp)
Get the mass of all PqueryPlacements of the Sample, using the multiplicities as factors.
Definition: masses.cpp:142
genesis::placement::Sample::tree
PlacementTree & tree()
Get the PlacementTree of this Sample.
Definition: sample.cpp:124
genesis::utils::sum
double sum(const Histogram &h)
Definition: utils/math/histogram/stats.cpp:140
genesis::placement::Sample::size
size_t size() const
Return the number of Pqueries that are stored in this Sample.
Definition: sample.cpp:138
genesis::placement::PqueryPlain
Simple POD struct that stores the information of a Pquery in a simple format for speeding up some cal...
Definition: plain.hpp:74
placement_tree.hpp
genesis::placement::pquery_distance
double pquery_distance(PqueryPlain const &pquery_a, PqueryPlain const &pquery_b, utils::Matrix< double > const &node_distances, bool with_pendant_length)
Calculate the weighted distance between two plain pqueries. It is mainly a helper method for distance...
Definition: placement/function/distances.cpp:51
genesis::placement::Sample
Manage a set of Pqueries along with the PlacementTree where the PqueryPlacements are placed on.
Definition: sample.hpp:68
tree.hpp
Header of Tree class.
tree_set.hpp
distances.hpp
tree_set.hpp
sample_set.hpp
genesis::placement::plain_queries
std::vector< PqueryPlain > plain_queries(Sample const &smp)
Return a plain representation of all pqueries of this map.
Definition: placement/function/helper.cpp:195
functions.hpp
Provides functions for working with Placements and Pqueries.
genesis::placement::PqueryPlain::multiplicity
double multiplicity
Definition: plain.hpp:77
genesis::utils::Matrix< double >
genesis::placement::pairwise_distance
double pairwise_distance(const Sample &smp_a, const Sample &smp_b, bool with_pendant_length)
Calculate the normalized pairwise distance between all placements of the two Samples.
Definition: measures.cpp:127
distances.hpp
Header of CommonTree distance methods.
genesis::placement::variance_partial_
static double variance_partial_(PqueryPlain const &pqry_a, std::vector< PqueryPlain > const &pqrys_b, utils::Matrix< double > const &node_distances, bool with_pendant_length)
Internal function that calculates the sum of distances contributed by one pquery for the variance....
Definition: measures.cpp:173
tree.hpp
genesis::placement::Pquery::placement_at
PqueryPlacement & placement_at(size_t index)
Return the PqueryPlacement at a certain index.
Definition: pquery.cpp:118
measures.hpp
Header for Placement Measures functions.
logging.hpp
Provides easy and fast logging functionality.
genesis::placement::Pquery
A pquery holds a set of PqueryPlacements and a set of PqueryNames.
Definition: pquery.hpp:82
masses.hpp
genesis::placement::Sample::at
Pquery & at(size_t index)
Return the Pquery at a certain index.
Definition: sample.cpp:190
matrix.hpp
genesis::placement::placement_distance
double placement_distance(PqueryPlacement const &place_a, PqueryPlacement const &place_b, utils::Matrix< double > const &node_distances)
Calculate the distance between two PqueryPlacements, using their position on the tree::TreeEdges,...
Definition: placement/function/distances.cpp:157
genesis::placement::variance
double variance(const Sample &smp, bool with_pendant_length)
Calculate the variance of the placements on a tree.
Definition: measures.cpp:195
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
operators.hpp
Tree operator functions.
genesis::placement::PqueryPlain::index
size_t index
Definition: plain.hpp:76
genesis::placement::Pquery::placement_size
size_t placement_size() const
Return the number of PqueryPlacements stored in this Pquery.
Definition: pquery.cpp:113
operators.hpp
functions.hpp
options.hpp
plain.hpp
Header of PqueryPlain class.
operators.hpp
Matrix operators.
genesis::tree::node_branch_length_distance_matrix
utils::Matrix< double > node_branch_length_distance_matrix(Tree const &tree)
Return a distance matrix containing pairwise distances between all Nodes, using the branch_length of ...
Definition: tree/common_tree/distances.cpp:57
sample.hpp
postorder.hpp
genesis::placement::edpl
double edpl(Pquery const &pquery, utils::Matrix< double > const &node_distances)
Calculate the EDPL uncertainty values for a Pquery.
Definition: measures.cpp:75
distances.hpp
Header of Tree distance methods.
genesis::placement::compatible_trees
bool compatible_trees(PlacementTree const &lhs, PlacementTree const &rhs)
Return whether two PlacementTrees are compatible.
Definition: placement/function/operators.cpp:63
helper.hpp