A toolkit for working with phylogenetic data.
v0.19.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
measures.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2018 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
41 
51 #include "genesis/tree/tree.hpp"
52 
57 
58 #include <algorithm>
59 #include <cassert>
60 #include <map>
61 #include <stdexcept>
62 
63 #ifdef GENESIS_OPENMP
64 # include <omp.h>
65 #endif
66 
67 #ifdef GENESIS_PTHREADS
68 # include <thread>
69 #endif
70 
71 namespace genesis {
72 namespace placement {
73 
74 // =================================================================================================
75 // Expected Distance between Placement Locations
76 // =================================================================================================
77 
78 double edpl( Pquery const& pquery, utils::Matrix<double> const& node_distances )
79 {
80  double result = 0.0;
81 
82  #pragma omp parallel for
83  for( size_t i = 0; i < pquery.placement_size(); ++i ) {
84 
85  #pragma omp parallel for
86  for( size_t j = i + 1; j < pquery.placement_size(); ++j ) {
87 
88  auto const& place_i = pquery.placement_at(i);
89  auto const& place_j = pquery.placement_at(j);
90 
91  auto const dist = placement_distance( place_i, place_j, node_distances );
92 
93  #pragma omp atomic
94  result += place_i.like_weight_ratio * place_j.like_weight_ratio * dist;
95  }
96  }
97  return 2 * result;
98 }
99 
100 std::vector<double> edpl( Sample const& sample, utils::Matrix<double> const& node_distances )
101 {
102  // Prepare result (facilitate copy elision).
103  auto result = std::vector<double>( sample.size(), 0 );
104 
105  // Fill result vector.
106  #pragma omp parallel for
107  for( size_t qi = 0; qi < sample.size(); ++qi ) {
108  auto const& pquery = sample.at( qi );
109  result[qi] = edpl( pquery, node_distances );
110  }
111  return result;
112 }
113 
114 double edpl( Sample const& sample, Pquery const& pquery )
115 {
116  auto const node_distances = node_branch_length_distance_matrix( sample.tree() );
117  return edpl( pquery, node_distances );
118 }
119 
120 std::vector<double> edpl( Sample const& sample )
121 {
122  auto const node_distances = node_branch_length_distance_matrix( sample.tree() );
123  return edpl( sample, node_distances );
124 }
125 
126 // =================================================================================================
127 // Pairwise Distance
128 // =================================================================================================
129 
131  const Sample& smp_a,
132  const Sample& smp_b,
133  bool with_pendant_length
134 ) {
135  if (!compatible_trees(smp_a, smp_b)) {
136  throw std::invalid_argument("__FUNCTION__: Incompatible trees.");
137  }
138 
139  // Init.
140  double sum = 0.0;
141 
142  // Create PqueryPlain objects for every placement and copy all interesting data into it.
143  // This way, we won't have to do all the pointer dereferencing during the actual calculations,
144  // and furthermore, the data is close in memory. this gives a tremendous speedup!
145  std::vector<PqueryPlain> const pqueries_a = plain_queries( smp_a );
146  std::vector<PqueryPlain> const pqueries_b = plain_queries( smp_b );
147 
148  // Calculate a matrix containing the pairwise distance between all nodes. This way, we
149  // do not need to search a path between placements every time. We use the tree of the first smp
150  // here, ignoring branch lengths on tree b.
151  // FIXME this might be made better by using average or so in the future.
152  auto node_distances = node_branch_length_distance_matrix(smp_a.tree());
153 
154  for (const PqueryPlain& pqry_a : pqueries_a) {
155  for (const PqueryPlain& pqry_b : pqueries_b) {
156  sum += pquery_distance(pqry_a, pqry_b, node_distances, with_pendant_length);
157  }
158  }
159 
160  // Return normalized value.
161  return sum / total_placement_mass( smp_a ) / total_placement_mass( smp_b );
162 }
163 
164 // =================================================================================================
165 // Variance
166 // =================================================================================================
167 
176  const PqueryPlain& pqry_a,
177  const std::vector<PqueryPlain>& pqrys_b,
178  const utils::Matrix<double>& node_distances,
179  bool with_pendant_length
180 ) {
181  double partial = 0.0;
182 
183  for (const PqueryPlain& pqry_b : pqrys_b) {
184  // Check whether it is same pquery (a=b, nothing to do, as their distance is zero),
185  // or a pair of pqueries that was already calculated (a>b, skip it also).
186  if (pqry_a.index >= pqry_b.index) {
187  continue;
188  }
189  double const dist = pquery_distance(pqry_a, pqry_b, node_distances, with_pendant_length);
190  partial += dist * dist;
191  }
192 
193  return partial;
194 }
195 
205  const int offset,
206  const int incr,
207  const std::vector<PqueryPlain>* pqrys,
208  const utils::Matrix<double>* node_distances,
209  double* partial,
210  bool with_pendant_length
211 ) {
212  // Use internal variables to avoid false sharing.
213  assert( partial && *partial == 0.0 );
214  double tmp_partial = 0.0;
215 
216  // Iterate over the pqueries, starting at offset and interleaved with incr for each thread.
217  for (size_t i = offset; i < pqrys->size(); i += incr) {
218  // LOG_PROG(i, pqrys->size()) << "of Variance() finished (Thread " << offset << ").";
219 
220  PqueryPlain const& pqry_a = (*pqrys)[i];
221  tmp_partial += variance_partial(pqry_a, *pqrys, *node_distances, with_pendant_length);
222  }
223 
224  // Return the results.
225  *partial = tmp_partial;
226 }
227 
228 double variance(
229  const Sample& smp,
230  bool with_pendant_length
231 ) {
232  // Init.
233  double variance = 0.0;
234 
235  // Create PqueryPlain objects for every placement and copy all interesting data into it.
236  // This way, we won't have to do all the pointer dereferencing during the actual calculations,
237  // and furthermore, the data is close in memory. This gives a tremendous speedup!
238  std::vector<PqueryPlain> vd_pqueries = plain_queries( smp );
239 
240  // Also, calculate a matrix containing the pairwise distance between all nodes. this way, we
241  // do not need to search a path between placements every time.
242  auto node_distances = node_branch_length_distance_matrix(smp.tree());
243 
244 #ifdef GENESIS_PTHREADS
245 
246  // Prepare storage for thread data.
247  int num_threads = utils::Options::get().number_of_threads();
248  std::vector<double> partials(num_threads, 0.0);
249  std::vector<std::thread> threads;
250 
251  // Start all threads.
252  for (int i = 0; i < num_threads; ++i) {
253  threads.emplace_back(
255  i, num_threads, &vd_pqueries, &node_distances,
256  &partials[i],
257  with_pendant_length
258  );
259  // threads[i] = new std::thread ();
260  }
261 
262  // Wait for all threads to finish, collect their results.
263  for (int i = 0; i < num_threads; ++i) {
264  threads[i].join();
265  variance += partials[i];
266  }
267 
268 #else
269 
270  // Do a pairwise calculation on all placements.
271  // int progress = 0;
272  for (const PqueryPlain& pqry_a : vd_pqueries) {
273  // LOG_PROG(++progress, vd_pqueries.size()) << "of Variance() finished.";
274  variance += variance_partial(pqry_a, vd_pqueries, node_distances, with_pendant_length);
275  }
276 
277 #endif
278 
279  // Calculate normalizing factor. Should be the same value as given by placement_mass(),
280  // however this calculation is probably faster, as we already have the plain values at hand.
281  double mass = 0.0;
282  for (const auto& pqry : vd_pqueries) {
283  for (const auto& place : pqry.placements) {
284  mass += place.like_weight_ratio;
285  }
286  }
287 
288  // Return the normalized value.
289  return ((variance / mass) / mass);
290 }
291 
292 } // namespace placement
293 } // namespace genesis
void offset(Histogram &h, double value)
Definition: operations.cpp:47
double pquery_distance(PqueryPlain const &pquery_a, PqueryPlain const &pquery_b, utils::Matrix< double > const &node_distances, bool with_pendant_length)
Calculate the weighted distance between two plain pqueries. It is mainly a helper method for distance...
std::vector< PqueryPlain > plain_queries(Sample const &smp)
Return a plain representation of all pqueries of this map.
size_t size() const
Return the number of Pqueries that are stored in this Sample.
Definition: sample.cpp:133
PlacementTree & tree()
Get the PlacementTree of this Sample.
Definition: sample.cpp:119
double pairwise_distance(const Sample &smp_a, const Sample &smp_b, bool with_pendant_length)
Calculate the normalized pairwise distance between all placements of the two Samples.
Definition: measures.cpp:130
double edpl(Pquery const &pquery, utils::Matrix< double > const &node_distances)
Calculate the EDPL uncertainty values for a Pquery.
Definition: measures.cpp:78
bool compatible_trees(PlacementTree const &lhs, PlacementTree const &rhs)
Return whether two PlacementTrees are compatible.
A pquery holds a set of PqueryPlacements and a set of PqueryNames.
Definition: pquery.hpp:82
Tree operator functions.
Pquery & at(size_t index)
Return the Pquery at a certain index.
Definition: sample.cpp:185
unsigned int number_of_threads() const
Returns the number of threads.
Definition: options.hpp:97
Provides functions for working with Placements and Pqueries.
double variance(const Sample &smp, bool with_pendant_length)
Calculate the variance of the placements on a tree.
Definition: measures.cpp:228
double sum(const Histogram &h)
PqueryPlacement & placement_at(size_t index)
Return the PqueryPlacement at a certain index.
Definition: pquery.cpp:118
Matrix operators.
size_t placement_size() const
Return the number of PqueryPlacements stored in this Pquery.
Definition: pquery.cpp:113
utils::Matrix< double > node_branch_length_distance_matrix(Tree const &tree)
Return a distance matrix containing pairwise distances between all Nodes, using the branch_length of ...
double variance_partial(const PqueryPlain &pqry_a, const std::vector< PqueryPlain > &pqrys_b, const utils::Matrix< double > &node_distances, bool with_pendant_length)
Internal function that calculates the sum of distances contributed by one pquery for the variance...
Definition: measures.cpp:175
Header of Default Tree distance methods.
Provides easy and fast logging functionality.
Header of PqueryPlain class.
void variance_thread(const int offset, const int incr, const std::vector< PqueryPlain > *pqrys, const utils::Matrix< double > *node_distances, double *partial, bool with_pendant_length)
Internal function that calculates the sum of distances for the variance that is contributed by a subs...
Definition: measures.cpp:204
double placement_distance(PqueryPlacement const &place_a, PqueryPlacement const &place_b, utils::Matrix< double > const &node_distances)
Calculate the distance between two PqueryPlacements, using their positin on the tree::TreeEdges, measured in branch length units.
double total_placement_mass(Sample const &smp)
Get the summed mass of all PqueryPlacements in all Pqueries of the given Sample, where mass is measu...
Manage a set of Pqueries along with the PlacementTree where the PqueryPlacements are placed on...
Definition: sample.hpp:68
Header of Tree class.
Header of Tree distance methods.
static Options & get()
Returns a single instance of this class.
Definition: options.hpp:59
Simple POD struct that stores the information of a Pquery in a simple format for speeding up some cal...
Definition: plain.hpp:74
Header for Placement Measures functions.