A toolkit for working with phylogenetic data.
v0.20.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
measures.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2018 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
42 
52 #include "genesis/tree/tree.hpp"
53 
58 
59 #include <algorithm>
60 #include <cassert>
61 #include <map>
62 #include <stdexcept>
63 
64 #ifdef GENESIS_OPENMP
65 # include <omp.h>
66 #endif
67 
68 #ifdef GENESIS_PTHREADS
69 # include <thread>
70 #endif
71 
72 namespace genesis {
73 namespace placement {
74 
75 // =================================================================================================
76 // Expected Distance between Placement Locations
77 // =================================================================================================
78 
79 double edpl( Pquery const& pquery, utils::Matrix<double> const& node_distances )
80 {
81  double result = 0.0;
82 
83  #pragma omp parallel for
84  for( size_t i = 0; i < pquery.placement_size(); ++i ) {
85 
86  #pragma omp parallel for
87  for( size_t j = i + 1; j < pquery.placement_size(); ++j ) {
88 
89  auto const& place_i = pquery.placement_at(i);
90  auto const& place_j = pquery.placement_at(j);
91 
92  auto const dist = placement_distance( place_i, place_j, node_distances );
93 
94  #pragma omp atomic
95  result += place_i.like_weight_ratio * place_j.like_weight_ratio * dist;
96  }
97  }
98  return 2 * result;
99 }
100 
101 std::vector<double> edpl( Sample const& sample, utils::Matrix<double> const& node_distances )
102 {
103  // Prepare result (facilitate copy elision).
104  auto result = std::vector<double>( sample.size(), 0 );
105 
106  // Fill result vector.
107  #pragma omp parallel for
108  for( size_t qi = 0; qi < sample.size(); ++qi ) {
109  auto const& pquery = sample.at( qi );
110  result[qi] = edpl( pquery, node_distances );
111  }
112  return result;
113 }
114 
115 double edpl( Sample const& sample, Pquery const& pquery )
116 {
117  auto const node_distances = node_branch_length_distance_matrix( sample.tree() );
118  return edpl( pquery, node_distances );
119 }
120 
121 std::vector<double> edpl( Sample const& sample )
122 {
123  auto const node_distances = node_branch_length_distance_matrix( sample.tree() );
124  return edpl( sample, node_distances );
125 }
126 
127 // =================================================================================================
128 // Pairwise Distance
129 // =================================================================================================
130 
132  const Sample& smp_a,
133  const Sample& smp_b,
134  bool with_pendant_length
135 ) {
136  if (!compatible_trees(smp_a, smp_b)) {
137  throw std::invalid_argument("pairwise_distance: Incompatible trees.");
138  }
139 
140  // Init.
141  double sum = 0.0;
142 
143  // Create PqueryPlain objects for every placement and copy all interesting data into it.
144  // This way, we won't have to do all the pointer dereferencing during the actual calculations,
145  // and furthermore, the data is close in memory. this gives a tremendous speedup!
146  std::vector<PqueryPlain> const pqueries_a = plain_queries( smp_a );
147  std::vector<PqueryPlain> const pqueries_b = plain_queries( smp_b );
148 
149  // Calculate a matrix containing the pairwise distance between all nodes. This way, we
150  // do not need to search a path between placements every time. We use the tree of the first smp
151  // here, ignoring branch lengths on tree b.
152  // FIXME this might be made better by using average or so in the future.
153  auto node_distances = node_branch_length_distance_matrix(smp_a.tree());
154 
155  for (const PqueryPlain& pqry_a : pqueries_a) {
156  for (const PqueryPlain& pqry_b : pqueries_b) {
157  auto dist = pquery_distance( pqry_a, pqry_b, node_distances, with_pendant_length );
158  dist *= pqry_a.multiplicity * pqry_b.multiplicity;
159  sum += dist;
160  }
161  }
162 
163  // Return normalized value.
165 }
166 
167 // =================================================================================================
168 // Variance
169 // =================================================================================================
170 
179  const PqueryPlain& pqry_a,
180  const std::vector<PqueryPlain>& pqrys_b,
181  const utils::Matrix<double>& node_distances,
182  bool with_pendant_length
183 ) {
184  double partial = 0.0;
185 
186  for (const PqueryPlain& pqry_b : pqrys_b) {
187  // Check whether it is same pquery (a=b, nothing to do, as their distance is zero),
188  // or a pair of pqueries that was already calculated (a>b, skip it also).
189  if (pqry_a.index >= pqry_b.index) {
190  continue;
191  }
192  double dist = pquery_distance(pqry_a, pqry_b, node_distances, with_pendant_length);
193  dist *= pqry_a.multiplicity * pqry_b.multiplicity;
194  partial += dist * dist;
195  }
196 
197  return partial;
198 }
199 
209  const int offset,
210  const int incr,
211  const std::vector<PqueryPlain>* pqrys,
212  const utils::Matrix<double>* node_distances,
213  double* partial,
214  bool with_pendant_length
215 ) {
216  // Use internal variables to avoid false sharing.
217  assert( partial && *partial == 0.0 );
218  double tmp_partial = 0.0;
219 
220  // Iterate over the pqueries, starting at offset and interleaved with incr for each thread.
221  for (size_t i = offset; i < pqrys->size(); i += incr) {
222  // LOG_PROG(i, pqrys->size()) << "of Variance() finished (Thread " << offset << ").";
223 
224  PqueryPlain const& pqry_a = (*pqrys)[i];
225  tmp_partial += variance_partial(pqry_a, *pqrys, *node_distances, with_pendant_length);
226  }
227 
228  // Return the results.
229  *partial = tmp_partial;
230 }
231 
232 double variance(
233  const Sample& smp,
234  bool with_pendant_length
235 ) {
236  // Init.
237  double variance = 0.0;
238 
239  // Create PqueryPlain objects for every placement and copy all interesting data into it.
240  // This way, we won't have to do all the pointer dereferencing during the actual calculations,
241  // and furthermore, the data is close in memory. This gives a tremendous speedup!
242  std::vector<PqueryPlain> vd_pqueries = plain_queries( smp );
243 
244  // Also, calculate a matrix containing the pairwise distance between all nodes. this way, we
245  // do not need to search a path between placements every time.
246  auto node_distances = node_branch_length_distance_matrix(smp.tree());
247 
248 #ifdef GENESIS_PTHREADS
249 
250  // Prepare storage for thread data.
251  int num_threads = utils::Options::get().number_of_threads();
252  std::vector<double> partials(num_threads, 0.0);
253  std::vector<std::thread> threads;
254 
255  // Start all threads.
256  for (int i = 0; i < num_threads; ++i) {
257  threads.emplace_back(
259  i, num_threads, &vd_pqueries, &node_distances,
260  &partials[i],
261  with_pendant_length
262  );
263  // threads[i] = new std::thread ();
264  }
265 
266  // Wait for all threads to finish, collect their results.
267  for (int i = 0; i < num_threads; ++i) {
268  threads[i].join();
269  variance += partials[i];
270  }
271 
272 #else
273 
274  // Do a pairwise calculation on all placements.
275  // int progress = 0;
276  for (const PqueryPlain& pqry_a : vd_pqueries) {
277  // LOG_PROG(++progress, vd_pqueries.size()) << "of Variance() finished.";
278  variance += variance_partial(pqry_a, vd_pqueries, node_distances, with_pendant_length);
279  }
280 
281 #endif
282 
283  // Calculate normalizing factor. Should be the same value as given by placement_mass(),
284  // however this calculation is probably faster, as we already have the plain values at hand.
285  double mass = 0.0;
286  for (const auto& pqry : vd_pqueries) {
287  for (const auto& place : pqry.placements) {
288  mass += place.like_weight_ratio * pqry.multiplicity;
289  }
290  }
291 
292  // Return the normalized value.
293  return ((variance / mass) / mass);
294 }
295 
296 } // namespace placement
297 } // namespace genesis
void offset(Histogram &h, double value)
Definition: operations.cpp:47
double pquery_distance(PqueryPlain const &pquery_a, PqueryPlain const &pquery_b, utils::Matrix< double > const &node_distances, bool with_pendant_length)
Calculate the weighted distance between two plain pqueries. It is mainly a helper method for distance...
std::vector< PqueryPlain > plain_queries(Sample const &smp)
Return a plain representation of all pqueries of this map.
size_t size() const
Return the number of Pqueries that are stored in this Sample.
Definition: sample.cpp:133
PlacementTree & tree()
Get the PlacementTree of this Sample.
Definition: sample.cpp:119
double pairwise_distance(const Sample &smp_a, const Sample &smp_b, bool with_pendant_length)
Calculate the normalized pairwise distance between all placements of the two Samples.
Definition: measures.cpp:131
double edpl(Pquery const &pquery, utils::Matrix< double > const &node_distances)
Calculate the EDPL uncertainty values for a Pquery.
Definition: measures.cpp:79
bool compatible_trees(PlacementTree const &lhs, PlacementTree const &rhs)
Return whether two PlacementTrees are compatible.
A pquery holds a set of PqueryPlacements and a set of PqueryNames.
Definition: pquery.hpp:82
Tree operator functions.
Pquery & at(size_t index)
Return the Pquery at a certain index.
Definition: sample.cpp:185
unsigned int number_of_threads() const
Returns the number of threads.
Definition: options.hpp:98
Provides functions for working with Placements and Pqueries.
double variance(const Sample &smp, bool with_pendant_length)
Calculate the variance of the placements on a tree.
Definition: measures.cpp:232
double sum(const Histogram &h)
PqueryPlacement & placement_at(size_t index)
Return the PqueryPlacement at a certain index.
Definition: pquery.cpp:118
Matrix operators.
size_t placement_size() const
Return the number of PqueryPlacements stored in this Pquery.
Definition: pquery.cpp:113
utils::Matrix< double > node_branch_length_distance_matrix(Tree const &tree)
Return a distance matrix containing pairwise distances between all Nodes, using the branch_length of ...
double total_placement_mass_with_multiplicities(Sample const &smp)
Get the mass of all PqueryPlacements of the Sample, using the multiplicities as factors.
Definition: masses.cpp:142
double variance_partial(const PqueryPlain &pqry_a, const std::vector< PqueryPlain > &pqrys_b, const utils::Matrix< double > &node_distances, bool with_pendant_length)
Internal function that calculates the sum of distances contributed by one pquery for the variance...
Definition: measures.cpp:178
Header of Default Tree distance methods.
Provides easy and fast logging functionality.
Header of PqueryPlain class.
void variance_thread(const int offset, const int incr, const std::vector< PqueryPlain > *pqrys, const utils::Matrix< double > *node_distances, double *partial, bool with_pendant_length)
Internal function that calculates the sum of distances for the variance that is contributed by a subs...
Definition: measures.cpp:208
double placement_distance(PqueryPlacement const &place_a, PqueryPlacement const &place_b, utils::Matrix< double > const &node_distances)
Calculate the distance between two PqueryPlacements, using their positin on the tree::TreeEdges, measured in branch length units.
Manage a set of Pqueries along with the PlacementTree where the PqueryPlacements are placed on...
Definition: sample.hpp:68
Header of Tree class.
Header of Tree distance methods.
static Options & get()
Returns a single instance of this class.
Definition: options.hpp:60
Simple POD struct that stores the information of a Pquery in a simple format for speeding up some cal...
Definition: plain.hpp:74
Header for Placement Measures functions.