A toolkit for working with phylogenetic data.
v0.18.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
measures.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2017 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
40 
50 #include "genesis/tree/tree.hpp"
51 
56 
57 #include <algorithm>
58 #include <cassert>
59 #include <map>
60 #include <stdexcept>
61 
62 #ifdef GENESIS_OPENMP
63 # include <omp.h>
64 #endif
65 
66 #ifdef GENESIS_PTHREADS
67 # include <thread>
68 #endif
69 
70 namespace genesis {
71 namespace placement {
72 
73 // =================================================================================================
74 // Helper Method pquery_distance
75 // =================================================================================================
76 
78  const PqueryPlain& pqry_a,
79  const PqueryPlain& pqry_b,
80  const utils::Matrix<double>& node_distances,
81  bool with_pendant_length
82 ) {
83  double sum = 0.0;
84  double pp, pd, dp, dist;
85 
86  // TODO once the proper dist measures (outside of this class) are in place, get rid of this function!
87 
88  for (const PqueryPlacementPlain& place_a : pqry_a.placements) {
89  for (const PqueryPlacementPlain& place_b : pqry_b.placements) {
90  if (place_a.edge_index == place_b.edge_index) {
91  // same branch case
92  dist = std::abs(place_a.proximal_length - place_b.proximal_length);
93 
94  } else {
95  // proximal-proximal case
96  pp = place_a.proximal_length
97  + node_distances(place_a.primary_node_index, place_b.primary_node_index)
98  + place_b.proximal_length;
99 
100  // proximal-distal case
101  pd = place_a.proximal_length
102  + node_distances(place_a.primary_node_index, place_b.secondary_node_index)
103  + place_b.branch_length - place_b.proximal_length;
104 
105  // distal-proximal case
106  dp = place_a.branch_length - place_a.proximal_length
107  + node_distances(place_a.secondary_node_index, place_b.primary_node_index)
108  + place_b.proximal_length;
109 
110  // find min of the three cases and
111  dist = std::min(pp, std::min(pd, dp));
112  }
113 
114  // If needed, use pendant length; normalize it to the weight ratios.
115  if (with_pendant_length) {
116  dist += place_a.pendant_length + place_b.pendant_length;
117  }
118  dist *= place_a.like_weight_ratio * place_b.like_weight_ratio;
119  sum += dist;
120  }
121  }
122 
123  return sum;
124 }
125 
127  PqueryPlacement const& place_a,
128  PqueryPlacement const& place_b,
129  utils::Matrix<double> const& node_distances
130 ) {
131  double pp, pd, dp, dist;
132 
133  if( place_a.edge().index() == place_b.edge().index() ) {
134  // same branch case
135  dist = std::abs( place_a.proximal_length - place_b.proximal_length );
136 
137  } else {
138  // proximal-proximal case
139  pp = place_a.proximal_length
140  + node_distances(
141  place_a.edge().primary_node().index(),
142  place_b.edge().primary_node().index()
143  )
144  + place_b.proximal_length;
145 
146  // proximal-distal case
147  pd = place_a.proximal_length
148  + node_distances(
149  place_a.edge().primary_node().index(),
150  place_b.edge().secondary_node().index()
151  )
152  + place_b.edge().data<tree::DefaultEdgeData>().branch_length
153  - place_b.proximal_length;
154 
155  // distal-proximal case
156  dp = place_a.edge().data<tree::DefaultEdgeData>().branch_length
157  - place_a.proximal_length
158  + node_distances(
159  place_a.edge().secondary_node().index(),
160  place_b.edge().primary_node().index()
161  )
162  + place_b.proximal_length;
163 
164  // find min of the three cases
165  dist = std::min(pp, std::min(pd, dp));
166  }
167 
168  return dist;
169 }
170 
171 // =================================================================================================
172 // Expected Distance between Placement Locations
173 // =================================================================================================
174 
176  Pquery const& pquery,
177  utils::Matrix<double> const& node_distances
178 ) {
179  double result = 0.0;
180 
181  for( size_t i = 0; i < pquery.placement_size(); ++i ) {
182  for( size_t j = i + 1; j < pquery.placement_size(); ++j ) {
183  auto const& place_i = pquery.placement_at(i);
184  auto const& place_j = pquery.placement_at(j);
185 
186  auto const dist = placement_distance( place_i, place_j, node_distances );
187  result += place_i.like_weight_ratio * place_j.like_weight_ratio * dist;
188  }
189  }
190  return 2 * result;
191 }
192 
193 double expected_distance_between_placement_locations( Sample const& sample, Pquery const& pquery )
194 {
195  auto node_distances = node_branch_length_distance_matrix( sample.tree() );
196  return expected_distance_between_placement_locations( pquery, node_distances );
197 }
198 
199 double edpl( Sample const& sample, Pquery const& pquery )
200 {
201  return expected_distance_between_placement_locations( sample, pquery );
202 }
203 
204 std::vector<double> expected_distance_between_placement_locations( Sample const& sample )
205 {
206  // Prepare result (facilitate copy elision).
207  std::vector<double> result;
208  result.reserve( sample.size() );
209 
210  // Get pairwise dists between all nodes of the tree.
211  auto const node_distances = node_branch_length_distance_matrix( sample.tree() );
212 
213  // Fill result vector.
214  for( auto const& pquery : sample ) {
215  result.push_back( expected_distance_between_placement_locations( pquery, node_distances ));
216  }
217  return result;
218 }
219 
220 std::vector<double> edpl( Sample const& sample )
221 {
223 }
224 
225 // =================================================================================================
226 // Pairwise Distance
227 // =================================================================================================
228 
230  const Sample& smp_a,
231  const Sample& smp_b,
232  bool with_pendant_length
233 ) {
234  if (!compatible_trees(smp_a, smp_b)) {
235  throw std::invalid_argument("__FUNCTION__: Incompatible trees.");
236  }
237 
238  // Init.
239  double sum = 0.0;
240 
241  // Create PqueryPlain objects for every placement and copy all interesting data into it.
242  // This way, we won't have to do all the pointer dereferencing during the actual calculations,
243  // and furthermore, the data is close in memory. this gives a tremendous speedup!
244  std::vector<PqueryPlain> const pqueries_a = plain_queries( smp_a );
245  std::vector<PqueryPlain> const pqueries_b = plain_queries( smp_b );
246 
247  // Calculate a matrix containing the pairwise distance between all nodes. This way, we
248  // do not need to search a path between placements every time. We use the tree of the first smp
249  // here, ignoring branch lengths on tree b.
250  // FIXME this might be made better by using average or so in the future.
251  auto node_distances = node_branch_length_distance_matrix(smp_a.tree());
252 
253  for (const PqueryPlain& pqry_a : pqueries_a) {
254  for (const PqueryPlain& pqry_b : pqueries_b) {
255  sum += pquery_distance(pqry_a, pqry_b, node_distances, with_pendant_length);
256  }
257  }
258 
259  // Return normalized value.
260  return sum / total_placement_mass( smp_a ) / total_placement_mass( smp_b );
261 }
262 
263 // =================================================================================================
264 // Variance
265 // =================================================================================================
266 
275  const PqueryPlain& pqry_a,
276  const std::vector<PqueryPlain>& pqrys_b,
277  const utils::Matrix<double>& node_distances,
278  bool with_pendant_length
279 ) {
280  double partial = 0.0;
281 
282  for (const PqueryPlain& pqry_b : pqrys_b) {
283  // Check whether it is same pquery (a=b, nothing to do, as their distance is zero),
284  // or a pair of pqueries that was already calculated (a>b, skip it also).
285  if (pqry_a.index >= pqry_b.index) {
286  continue;
287  }
288  double const dist = pquery_distance(pqry_a, pqry_b, node_distances, with_pendant_length);
289  partial += dist * dist;
290  }
291 
292  return partial;
293 }
294 
304  const int offset,
305  const int incr,
306  const std::vector<PqueryPlain>* pqrys,
307  const utils::Matrix<double>* node_distances,
308  double* partial,
309  bool with_pendant_length
310 ) {
311  // Use internal variables to avoid false sharing.
312  assert( partial && *partial == 0.0 );
313  double tmp_partial = 0.0;
314 
315  // Iterate over the pqueries, starting at offset and interleaved with incr for each thread.
316  for (size_t i = offset; i < pqrys->size(); i += incr) {
317  // LOG_PROG(i, pqrys->size()) << "of Variance() finished (Thread " << offset << ").";
318 
319  PqueryPlain const& pqry_a = (*pqrys)[i];
320  tmp_partial += variance_partial(pqry_a, *pqrys, *node_distances, with_pendant_length);
321  }
322 
323  // Return the results.
324  *partial = tmp_partial;
325 }
326 
327 double variance(
328  const Sample& smp,
329  bool with_pendant_length
330 ) {
331  // Init.
332  double variance = 0.0;
333 
334  // Create PqueryPlain objects for every placement and copy all interesting data into it.
335  // This way, we won't have to do all the pointer dereferencing during the actual calculations,
336  // and furthermore, the data is close in memory. This gives a tremendous speedup!
337  std::vector<PqueryPlain> vd_pqueries = plain_queries( smp );
338 
339  // Also, calculate a matrix containing the pairwise distance between all nodes. this way, we
340  // do not need to search a path between placements every time.
341  auto node_distances = node_branch_length_distance_matrix(smp.tree());
342 
343 #ifdef GENESIS_PTHREADS
344 
345  // Prepare storage for thread data.
346  int num_threads = utils::Options::get().number_of_threads();
347  std::vector<double> partials(num_threads, 0.0);
348  std::vector<std::thread> threads;
349 
350  // Start all threads.
351  for (int i = 0; i < num_threads; ++i) {
352  threads.emplace_back(
354  i, num_threads, &vd_pqueries, &node_distances,
355  &partials[i],
356  with_pendant_length
357  );
358  // threads[i] = new std::thread ();
359  }
360 
361  // Wait for all threads to finish, collect their results.
362  for (int i = 0; i < num_threads; ++i) {
363  threads[i].join();
364  variance += partials[i];
365  }
366 
367 #else
368 
369  // Do a pairwise calculation on all placements.
370  // int progress = 0;
371  for (const PqueryPlain& pqry_a : vd_pqueries) {
372  // LOG_PROG(++progress, vd_pqueries.size()) << "of Variance() finished.";
373  variance += variance_partial(pqry_a, vd_pqueries, node_distances, with_pendant_length);
374  }
375 
376 #endif
377 
378  // Calculate normalizing factor. Should be the same value as given by placement_mass(),
379  // however this calculation is probably faster, as we already have the plain values at hand.
380  double mass = 0.0;
381  for (const auto& pqry : vd_pqueries) {
382  for (const auto& place : pqry.placements) {
383  mass += place.like_weight_ratio;
384  }
385  }
386 
387  // Return the normalized value.
388  return ((variance / mass) / mass);
389 }
390 
391 } // namespace placement
392 } // namespace genesis
void offset(Histogram &h, double value)
Definition: operations.cpp:47
size_t index() const
Return the index of this Edge.
Definition: edge.cpp:45
std::vector< PqueryPlain > plain_queries(Sample const &smp)
Return a plain representation of all pqueries of this map.
size_t size() const
Return the number of Pqueries that are stored in this Sample.
Definition: sample.cpp:133
double proximal_length
Distance of this placement to the next node towards the root.
PlacementTree & tree()
Get the PlacementTree of this Sample.
Definition: sample.cpp:119
double pairwise_distance(const Sample &smp_a, const Sample &smp_b, bool with_pendant_length)
Calculate the normalized pairwise distance between all placements of the two Samples.
Definition: measures.cpp:229
bool compatible_trees(PlacementTree const &lhs, PlacementTree const &rhs)
Return whether two PlacementTrees are compatible.
A pquery holds a set of PqueryPlacements and a set of PqueryNames.
Definition: pquery.hpp:82
Tree operator functions.
const PlacementTreeEdge & edge() const
Get the PlacementTreeEdge where this PqueryPlacement is placed.
Definition: placement.cpp:58
unsigned int number_of_threads() const
Returns the number of threads.
Definition: options.hpp:97
Provides functions for working with Placements and Pqueries.
One placement position of a Pquery on a Tree.
double variance(const Sample &smp, bool with_pendant_length)
Calculate the variance of the placements on a tree.
Definition: measures.cpp:327
double sum(const Histogram &h)
PqueryPlacement & placement_at(size_t index)
Return the PqueryPlacement at a certain index.
Definition: pquery.cpp:164
double pquery_distance(const PqueryPlain &pqry_a, const PqueryPlain &pqry_b, const utils::Matrix< double > &node_distances, bool with_pendant_length)
Calculates the normalized distance between two plain pqueries. It is mainly a helper method for dista...
Definition: measures.cpp:77
size_t index() const
Return the index of this Node.
Definition: node.cpp:48
Default class containing the commonly needed data for tree edges.
size_t placement_size() const
Return the number of PqueryPlacements stored in this Pquery.
Definition: pquery.cpp:154
Simple POD struct for a Placement used for speeding up some calculations.
Definition: plain.hpp:51
utils::Matrix< double > node_branch_length_distance_matrix(Tree const &tree)
Return a distance matrix containing pairwise distances between all Nodes, using the branch_length of ...
TreeNode & secondary_node()
Return the TreeNode of this TreeEdge that points away from the root.
Definition: edge.cpp:101
double variance_partial(const PqueryPlain &pqry_a, const std::vector< PqueryPlain > &pqrys_b, const utils::Matrix< double > &node_distances, bool with_pendant_length)
Internal function that calculates the sum of distances contributed by one pquery for the variance...
Definition: measures.cpp:274
Header of Default Tree distance methods.
Provides easy and fast logging functionality.
Header of PqueryPlain class.
void variance_thread(const int offset, const int incr, const std::vector< PqueryPlain > *pqrys, const utils::Matrix< double > *node_distances, double *partial, bool with_pendant_length)
Internal function that calculates the sum of distances for the variance that is contributed by a subs...
Definition: measures.cpp:303
double edpl(Sample const &sample, Pquery const &pquery)
Shortcut alias for expected_distance_between_placement_locations().
Definition: measures.cpp:199
double placement_distance(PqueryPlacement const &place_a, PqueryPlacement const &place_b, utils::Matrix< double > const &node_distances)
Definition: measures.cpp:126
double total_placement_mass(Sample const &smp)
Get the summed mass of all PqueryPlacements in all Pqueries of the given Sample, where mass is measu...
Manage a set of Pqueries along with the PlacementTree where the PqueryPlacements are placed on...
Definition: sample.hpp:68
Matrix operators.
TreeNode & primary_node()
Return the TreeNode of this TreeEdge that points towards the root.
Definition: edge.cpp:85
Header of Tree class.
Header of Tree distance methods.
std::vector< PqueryPlacementPlain > placements
Definition: plain.hpp:77
double expected_distance_between_placement_locations(Pquery const &pquery, utils::Matrix< double > const &node_distances)
Calculate the EDPL uncertainty values for a Pquery.
Definition: measures.cpp:175
EdgeDataType & data()
Definition: edge.hpp:118
static Options & get()
Returns a single instance of this class.
Definition: options.hpp:59
Simple POD struct that stores the information of a Pquery in a simple format for speeding up some cal...
Definition: plain.hpp:74
Header for Placement Measures functions.