A toolkit for working with phylogenetic data.
v0.18.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
nhd.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2017 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
38 
42 
50 
51 #include <cassert>
52 
53 #ifdef GENESIS_OPENMP
54 # include <omp.h>
55 #endif
56 
57 namespace genesis {
58 namespace placement {
59 
60 // =================================================================================================
61 // Local Helper Functions
62 // =================================================================================================
63 
64 // -------------------------------------------------------------------------------------------------
65 // make_empty_node_distance_histograms
66 // -------------------------------------------------------------------------------------------------
67 
71 std::vector< utils::Histogram > make_empty_node_distance_histograms (
72  tree::Tree const& tree,
73  utils::Matrix<double> const& node_distances,
74  utils::Matrix<signed char> const& node_sides,
75  size_t const histogram_bins
76 ) {
77  auto const node_count = tree.node_count();
78  if( tree.empty() ) {
79  throw std::runtime_error( "Tree is empty. Cannot use Node Histogram Distance." );
80  }
81  if( node_distances.rows() != node_count || node_distances.cols() != node_count ) {
82  throw std::runtime_error( "Node Distance Matrix has wrong size." );
83  }
84  if( node_sides.rows() != node_count || node_sides.cols() != node_count ) {
85  throw std::runtime_error( "Node Sides Matrix has wrong size." );
86  }
87 
88  // Prepare a vector of histograms for each node of the tree.
89  // We init with dummy histograms, so that we can better parallelize later...
90  // Stupid, but okay for now.
91  auto histograms = std::vector< utils::Histogram >();
92  histograms.resize( node_count, utils::Histogram( histogram_bins ));
93 
94  // Make histograms that have enough room on both sides.
95  #pragma omp parallel for
96  for( size_t node_idx = 0; node_idx < node_count; ++node_idx ) {
97 
98  // Find furthest nodes on root and non-root sides.
99  // For now, we use both positive values, and later reverse the sign of the min entry.
100  double min = 0.0;
101  double max = 0.0;
102  for( size_t col_idx = 0; col_idx < node_count; ++col_idx ) {
103 
104  // No need to get a distance from the node to itself.
105  if( col_idx == node_idx ) {
106  // assert( node_sides( node_idx, col_idx ) == 0 );
107  continue;
108  }
109 
110  // Get distance between nodes.
111  auto const dist = node_distances( node_idx, col_idx );
112 
113  // If we are at a node that is on the root side, optimize max.
114  if( node_sides( node_idx, col_idx ) == 1 && dist > max ) {
115  max = dist;
116  }
117 
118  // If we are at a node that is on a non-root side, optimize min.
119  if( node_sides( node_idx, col_idx ) == -1 && dist > min ) {
120  min = dist;
121  }
122  }
123 
124  // If this fails, the tree is not usable.
125  if( min == 0.0 && max == 0.0 ) {
126  throw std::runtime_error(
127  "Tree only has branch lengths with value 0. Cannot use Node Histogram Distance."
128  );
129  }
130 
131  histograms[node_idx] = utils::Histogram( histogram_bins, -min, max );
132  }
133 
134  return histograms;
135 }
136 
137 // -------------------------------------------------------------------------------------------------
138 // fill_node_distance_histograms
139 // -------------------------------------------------------------------------------------------------
140 
145  Sample const& sample,
146  utils::Matrix<double> const& node_distances,
147  utils::Matrix<signed char> const& node_sides,
148  std::vector< utils::Histogram >& histograms
149 ) {
150  auto const node_count = sample.tree().node_count();
151  if( histograms.size() != node_count ) {
152  throw std::runtime_error( "Number of histograms does not equal number of tree nodes." );
153  }
154  if( node_distances.rows() != node_count || node_distances.cols() != node_count ) {
155  throw std::runtime_error( "Node Distance Matrix has wrong size." );
156  }
157  if( node_sides.rows() != node_count || node_sides.cols() != node_count ) {
158  throw std::runtime_error( "Node Sides Matrix has wrong size." );
159  }
160 
161  // We use this order of loops, as it should be faster to touch each pquery once, and
162  // instead update the histograms multiple times, instead of fill the histogram for each
163  // node spearately, which would require multiple iterations of the pqueries.
164  for( auto const& pquery : sample ) {
165  double const mult = total_multiplicity( pquery );
166 
167  for( auto const& placement : pquery.placements() ) {
168  for( size_t node_index = 0; node_index < sample.tree().node_count(); ++node_index ) {
169 
170  // Get the distance from the placement to the current histogram node.
171  double const p_dist = placement.proximal_length
172  + node_distances( node_index, placement.edge().primary_node().index() );
173  double const d_dist = placement.edge().data<PlacementEdgeData>().branch_length
174  - placement.proximal_length
175  + node_distances( node_index, placement.edge().secondary_node().index() );
176  double const dist = std::min( p_dist, d_dist );
177 
178  // Get the side of the placement relative to the current node.
179  // Value 1 means, it is on the root side. Values 0 and -1 mean a non root side.
180  // Use this to determine the sign used to mark the position in the histogram.
181  auto const side = node_sides( node_index, placement.edge().primary_node().index() );
182  double const sign = ( side == 1 ? 1.0 : -1.0 );
183 
184  // Sanity checks. They can fail if this function is used from the outside,
185  // but within our local context, they should hold.
186  // Nope, not true. It fails for example with negative proximal length,
187  // or proximal lengths > branch length, which can happen for numerical reasons,
188  // or because of bugs in the placement algortihm.
189  // So, do not check, but leave the sanity check up to the user.
190  // assert( dist >= 0.0 );
191  // assert( sign * dist >= histograms[ node_index ].range_min() );
192  // assert( sign * dist <= histograms[ node_index ].range_max() );
193 
194  // Accumulate at the distance, using the lwr and multiplicity as weight, so that the
195  // total weight of a pquery sums up to the multiplicity.
196  histograms[ node_index ].accumulate( sign * dist, mult * placement.like_weight_ratio );
197  }
198  }
199  }
200 
201  // Normalize.
202  for( auto& hist : histograms ) {
203  normalize( hist );
204  }
205 }
206 
207 // -------------------------------------------------------------------------------------------------
208 // Sample node_distance_histograms
209 // -------------------------------------------------------------------------------------------------
210 
214 std::vector< utils::Histogram > node_distance_histograms (
215  Sample const& sample,
216  size_t const histogram_bins,
217  bool use_negative_axis = true
218 ) {
219  // Calculate the pairwise distance between all pairs of nodes.
220  auto const node_distances = node_branch_length_distance_matrix( sample.tree() );
221 
222  // For each node, calculate which other nodes are on the root side subtree and which are not.
223  auto const node_sides = ( use_negative_axis
224  ? node_root_direction_matrix( sample.tree() )
225  : utils::Matrix<signed char>( sample.tree().node_count(), sample.tree().node_count(), 1 )
226  );
227 
228  // Make the histograms, fill them, return them.
229  auto histograms = make_empty_node_distance_histograms (
230  sample.tree(), node_distances, node_sides, histogram_bins
231  );
232  fill_node_distance_histograms ( sample, node_distances, node_sides, histograms );
233  return histograms;
234 }
235 
236 // -------------------------------------------------------------------------------------------------
237 // Sample Set node_distance_histograms
238 // -------------------------------------------------------------------------------------------------
239 
243 std::vector< std::vector< utils::Histogram >> node_distance_histograms (
244  SampleSet const& sample_set,
245  size_t const histogram_bins,
246  bool use_negative_axis = true
247 ) {
248  auto const set_size = sample_set.size();
249 
250  // Edge case.
251  if( set_size == 0 ) {
252  return {};
253  }
254 
255  // Prepare lookup for the trees. This assumes identical trees for all samples.
256  auto const node_distances = node_branch_length_distance_matrix( sample_set[0].sample.tree() );
257  auto const node_sides = ( use_negative_axis
258  ? node_root_direction_matrix( sample_set[0].sample.tree() )
260  sample_set[0].sample.tree().node_count(), sample_set[0].sample.tree().node_count(), 1
261  )
262  );
263 
264  // Prepare histograms for all samples, by copying empty histograms for the first sample.
265  auto result = std::vector< std::vector< utils::Histogram >>(
266  set_size,
268  sample_set[ 0 ].sample.tree(), node_distances, node_sides, histogram_bins
269  )
270  );
271 
272  // Calculate the histograms for all samples.
273  #pragma omp parallel for schedule(dynamic)
274  for( size_t i = 0; i < set_size; ++i ) {
275 
276  // Check compatibility.
277  // It suffices to check adjacent pairs of samples, as compatibility is transitive.
278  if( i > 0 ) {
279  if( ! compatible_trees( sample_set[ i - 1 ].sample, sample_set[ i ].sample )) {
280  throw std::invalid_argument(
281  "Trees in SampleSet not compatible for calculating Node Histogram Distance."
282  );
283  }
284  }
285 
286  // Fill the histograms for every node of the sample.
287  fill_node_distance_histograms ( sample_set[ i ].sample, node_distances, node_sides, result[ i ] );
288  assert( result[ i ].size() == sample_set[ i ].sample.tree().node_count() );
289  }
290 
291  return result;
292 }
293 
294 // -------------------------------------------------------------------------------------------------
295 // node_histogram_emd
296 // -------------------------------------------------------------------------------------------------
297 
302  std::vector< utils::Histogram > const& lhs,
303  std::vector< utils::Histogram > const& rhs,
304  size_t const node_count
305 ) {
306  // Sum up the emd distances of the histograms for each node of the tree in the
307  // two samples.
308  double dist = 0.0;
309  assert( lhs.size() == rhs.size() );
310  for( size_t k = 0; k < lhs.size(); ++k ) {
311  dist += earth_movers_distance( lhs[ k ], rhs[ k ], false );
312  }
313  assert( dist >= 0.0 );
314 
315  // Return normalized distance.
316  dist /= static_cast< double >( node_count );
317  return dist;
318 }
319 
320 // =================================================================================================
321 // Sample
322 // =================================================================================================
323 
325  Sample const& sample_a,
326  Sample const& sample_b,
327  size_t const histogram_bins,
328  bool use_negative_axis
329 ) {
330  if( ! compatible_trees( sample_a, sample_b ) ) {
331  throw std::invalid_argument( "Incompatible trees." );
332  }
333 
334  // Get the histograms describing the distances from placements to all nodes.
335  auto const hist_vec_a = node_distance_histograms( sample_a, histogram_bins, use_negative_axis );
336  auto const hist_vec_b = node_distance_histograms( sample_b, histogram_bins, use_negative_axis );
337 
338  // If the trees are compatible (as ensured in the beginning of this function), they need to have
339  // the same number of nodes. Thus, also there should be this number of histograms in the vectors.
340  assert( hist_vec_a.size() == sample_a.tree().node_count() );
341  assert( hist_vec_b.size() == sample_b.tree().node_count() );
342  assert( hist_vec_a.size() == hist_vec_b.size() );
343 
344  return node_histogram_emd( hist_vec_a, hist_vec_b, sample_a.tree().node_count() );
345 }
346 
347 // =================================================================================================
348 // Sample Set
349 // =================================================================================================
350 
352  SampleSet const& sample_set,
353  size_t const histogram_bins,
354  bool use_negative_axis
355 ) {
356  // Init distance matrix.
357  auto const set_size = sample_set.size();
358  auto result = utils::Matrix<double>( set_size, set_size, 0.0 );
359 
360  // Get the histograms to calculate the distance.
361  auto const hist_vecs = node_distance_histograms( sample_set, histogram_bins, use_negative_axis );
362 
363  // Parallel specialized code.
364  #ifdef GENESIS_OPENMP
365 
366  // We only need to calculate the upper triangle. Get the number of indices needed
367  // to describe this triangle.
368  size_t const max_k = utils::triangular_size( set_size );
369 
370  // Calculate distance matrix for every pair of samples.
371  #pragma omp parallel for
372  for( size_t k = 0; k < max_k; ++k ) {
373 
374  // For the given linear index, get the actual position in the Matrix.
375  auto const ij = utils::triangular_indices( k, set_size );
376  auto const i = ij.first;
377  auto const j = ij.second;
378 
379  // Calculate and store distance.
380  auto const node_count = sample_set[ i ].sample.tree().node_count();
381  auto const dist = node_histogram_emd( hist_vecs[ i ], hist_vecs[ j ], node_count );
382  result(i, j) = dist;
383  result(j, i) = dist;
384  }
385 
386  // If no threads are available at all, use serial version.
387  #else
388 
389  // Calculate distance matrix for every pair of samples.
390  for( size_t i = 0; i < set_size; ++i ) {
391 
392  // The result is symmetric - we only calculate the upper triangle.
393  for( size_t j = i + 1; j < set_size; ++j ) {
394 
395  // Calculate and store distance.
396  auto const node_count = sample_set[ i ].sample.tree().node_count();
397  auto const dist = node_histogram_emd( hist_vecs[ i ], hist_vecs[ j ], node_count );
398  result(i, j) = dist;
399  result(j, i) = dist;
400  }
401  }
402 
403  #endif
404 
405  return result;
406 }
407 
408 // =================================================================================================
409 // Previous Implementation
410 // =================================================================================================
411 
412 /*
413  * These functions are not accessible from the outside.
414  * We keep them here for now, just to be able to look them up quickly.
415  * Probably, they can be removed in the future.
416  */
417 
418 // -------------------------------------------------------------------------------------------------
419 // Helper Functions
420 // -------------------------------------------------------------------------------------------------
421 
427 std::vector< utils::Histogram > node_distance_histograms_old (
428  Sample const& sample,
429  size_t const histogram_bins
430 ) {
431  // Prepare a vector of histograms for each node of the tree.
432  auto hist_vec = std::vector< utils::Histogram >();
433  hist_vec.reserve( sample.tree().node_count() );
434 
435  // Calculate the pairwise distance between all pairs of nodes.
436  auto const node_dists = node_branch_length_distance_matrix( sample.tree() );
437 
438  // Calculate the longest distance from any node. This is used as upper bound for the histograms.
439  auto const diameters = furthest_leaf_distance_vector( sample.tree(), node_dists );
440 
441  // Init the histograms, using the diameter at each node as max values.
442  for( size_t node_index = 0; node_index < sample.tree().node_count(); ++node_index ) {
443  hist_vec.push_back( utils::Histogram( histogram_bins, 0.0, diameters[ node_index ].second ));
444  }
445 
446  // We use this order of loops, as it should be faster to touch each pquery once, and
447  // instead update the histograms multiple times, instead of calculating a histogram for each
448  // node spearately, which would require multiple iterations of the pqueries.
449  for( auto const& pquery : sample ) {
450  double const mult = total_multiplicity( pquery );
451 
452  for( auto const& placement : pquery.placements() ) {
453  for( size_t node_index = 0; node_index < sample.tree().node_count(); ++node_index ) {
454 
455  double const p_dist = placement.proximal_length
456  + node_dists( node_index, placement.edge().primary_node().index() );
457 
458  double const d_dist = placement.edge().data<PlacementEdgeData>().branch_length
459  - placement.proximal_length
460  + node_dists( node_index, placement.edge().secondary_node().index() );
461 
462  double const dist = std::min( p_dist, d_dist );
463  assert( dist >= 0.0 && dist <= diameters[ node_index ].second );
464 
465  // Accumulate at the distance, using the lwr and multiplicity as weight, so that the
466  // total weight of a pquery sums up to the multiplicity.
467  hist_vec[ node_index ].accumulate( dist, placement.like_weight_ratio * mult );
468  }
469  }
470  }
471 
472  // Normalize.
473  for( auto& hist : hist_vec ) {
474  normalize( hist );
475  }
476 
477  return hist_vec;
478 }
479 
480 // -------------------------------------------------------------------------------------------------
481 // Between Samples
482 // -------------------------------------------------------------------------------------------------
483 
485  Sample const& sample_a,
486  Sample const& sample_b,
487  size_t const histogram_bins
488 ) {
489  if( ! compatible_trees( sample_a, sample_b ) ) {
490  throw std::invalid_argument( "Incompatible trees." );
491  }
492 
493  // Get the histograms describing the distances from placements to all nodes.
494  auto const hist_vec_a = node_distance_histograms_old( sample_a, histogram_bins );
495  auto const hist_vec_b = node_distance_histograms_old( sample_b, histogram_bins );
496 
497  // If the trees are compatible (as ensured in the beginning of this function), they need to have
498  // the same number of nodes. Thus, also there should be this number of histograms in the vectors.
499  assert( hist_vec_a.size() == sample_a.tree().node_count() );
500  assert( hist_vec_b.size() == sample_b.tree().node_count() );
501  assert( hist_vec_a.size() == hist_vec_b.size() );
502 
503  return node_histogram_emd( hist_vec_a, hist_vec_b, sample_a.tree().node_count() );
504 }
505 
506 // -------------------------------------------------------------------------------------------------
507 // Full Sample Set
508 // -------------------------------------------------------------------------------------------------
509 
511  SampleSet const& sample_set,
512  size_t const histogram_bins
513 ) {
514  auto const set_size = sample_set.size();
515 
516  // Prepare histograms for all samples.
517  auto hist_vecs = std::vector< std::vector< utils::Histogram >>( set_size );
518 
519  // Calculate the histograms for all samples.
520  #pragma omp parallel for schedule(dynamic)
521  for( size_t i = 0; i < set_size; ++i ) {
522 
523  // Check compatibility.
524  // It suffices to check adjacent pairs of samples, as compatibility is transitive.
525  if( i > 0 ) {
526  if( ! compatible_trees( sample_set[ i - 1 ].sample, sample_set[ i ].sample )) {
527  throw std::invalid_argument(
528  "Trees in SampleSet not compatible for calculating Node Histogram Distance."
529  );
530  }
531  }
532 
533  // Calculate the histograms for every node of the sample.
534  hist_vecs[ i ] = node_distance_histograms_old( sample_set[ i ].sample, histogram_bins );
535  assert( hist_vecs[ i ].size() == sample_set[ i ].sample.tree().node_count() );
536  }
537 
538  // Init distance matrix.
539  auto result = utils::Matrix<double>( set_size, set_size, 0.0 );
540 
541  // Parallel specialized code.
542  #ifdef GENESIS_OPENMP
543 
544  // We only need to calculate the upper triangle. Get the number of indices needed
545  // to describe this triangle.
546  size_t const max_k = utils::triangular_size( set_size );
547 
548  // Calculate distance matrix for every pair of samples.
549  #pragma omp parallel for
550  for( size_t k = 0; k < max_k; ++k ) {
551 
552  // For the given linear index, get the actual position in the Matrix.
553  auto const ij = utils::triangular_indices( k, set_size );
554  auto const i = ij.first;
555  auto const j = ij.second;
556 
557  // Calculate and store distance.
558  auto const node_count = sample_set[ i ].sample.tree().node_count();
559  auto const dist = node_histogram_emd( hist_vecs[ i ], hist_vecs[ j ], node_count );
560  result(i, j) = dist;
561  result(j, i) = dist;
562  }
563 
564  // If no threads are available at all, use serial version.
565  #else
566 
567  // Calculate distance matrix for every pair of samples.
568  for( size_t i = 0; i < set_size; ++i ) {
569 
570  // The result is symmetric - we only calculate the upper triangle.
571  for( size_t j = i + 1; j < set_size; ++j ) {
572 
573  // Calculate and store distance.
574  auto const node_count = sample_set[ i ].sample.tree().node_count();
575  auto const dist = node_histogram_emd( hist_vecs[ i ], hist_vecs[ j ], node_count );
576  result(i, j) = dist;
577  result(j, i) = dist;
578  }
579  }
580 
581  #endif
582 
583  return result;
584 }
585 
586 } // namespace placement
587 } // namespace genesis
double node_histogram_distance(Sample const &sample_a, Sample const &sample_b, size_t const histogram_bins, bool use_negative_axis)
Calculate the Node Histogram Distance of two Samples.
Definition: nhd.cpp:324
double node_histogram_emd(std::vector< utils::Histogram > const &lhs, std::vector< utils::Histogram > const &rhs, size_t const node_count)
Local helper function to calculate the sum of histogram emds between two sets of histograms.
Definition: nhd.cpp:301
size_t cols() const
Definition: matrix.hpp:156
Header of Histogram class.
Data class for PlacementTreeEdges. Stores the branch length of the edge, and the edge_num, as defined in the jplace standard.
PlacementTree & tree()
Get the PlacementTree of this Sample.
Definition: sample.cpp:119
bool compatible_trees(PlacementTree const &lhs, PlacementTree const &rhs)
Return whether two PlacementTrees are compatible.
std::vector< T > const & data() const
Definition: matrix.hpp:166
utils::Matrix< signed char > node_root_direction_matrix(Tree const &tree)
Calculate a Matrix that indicates the nodes on the root side of a given node.
double earth_movers_distance(Sample const &lhs, Sample const &rhs, double const p, bool const with_pendant_length)
Calculate the earth mover's distance between two Samples.
Histogram class for accumulating and summarizing data.
Definition: histogram.hpp:68
Provides functions for working with Placements and Pqueries.
std::vector< utils::Histogram > node_distance_histograms_old(Sample const &sample, size_t const histogram_bins)
Local helper function to calculate the histograms of distances from all Nodes of the Tree of a Sample...
Definition: nhd.cpp:427
std::vector< utils::Histogram > make_empty_node_distance_histograms(tree::Tree const &tree, utils::Matrix< double > const &node_distances, utils::Matrix< signed char > const &node_sides, size_t const histogram_bins)
Create a set of Histograms without any weights for a given Tree.
Definition: nhd.cpp:71
Header of Histogram operations functions.
size_t node_count() const
Return the number of TreeNodes of the Tree.
Definition: tree/tree.cpp:350
double total_multiplicity(Pquery const &pqry)
Return the sum of all multiplicities of the Pquery.
Header of Histogram operator functions.
Class for representing phylogenetic trees.
Definition: tree/tree.hpp:95
size_t triangular_size(size_t n)
Calculate the number of linear indices needed for a triangular Matrix of size n.
utils::Matrix< double > node_branch_length_distance_matrix(Tree const &tree)
Return a distance matrix containing pairwise distances between all Nodes, using the branch_length of ...
bool empty() const
Return whether the Tree is empty (i.e., has no nodes, edges and links).
Definition: tree/tree.cpp:222
Store a set of Samples with associated names.
Definition: sample_set.hpp:52
double node_histogram_distance_old(Sample const &sample_a, Sample const &sample_b, size_t const histogram_bins)
Definition: nhd.cpp:484
void fill_node_distance_histograms(Sample const &sample, utils::Matrix< double > const &node_distances, utils::Matrix< signed char > const &node_sides, std::vector< utils::Histogram > &histograms)
Fill the placements of a Sample into Histograms.
Definition: nhd.cpp:144
Header of Default Tree distance methods.
Header of Histogram Accumulator class.
size_t size() const
Return the size of the SampleSet, i.e., the number of Samples.
Definition: sample_set.cpp:135
void normalize(Histogram &h, double total)
Definition: operations.cpp:61
std::vector< std::pair< TreeNode const *, double > > furthest_leaf_distance_vector(Tree const &tree)
Opposite of closest_leaf_distance_vector().
std::vector< utils::Histogram > node_distance_histograms(Sample const &sample, size_t const histogram_bins, bool use_negative_axis=true)
Local helper function that calculates the Histograms for a Sample.
Definition: nhd.cpp:214
std::pair< size_t, size_t > triangular_indices(size_t k, size_t n)
Given a linear index in a upper triangular Matrix, find the corresponding Matrix indices.
Manage a set of Pqueries along with the PlacementTree where the PqueryPlacements are placed on...
Definition: sample.hpp:68
size_t rows() const
Definition: matrix.hpp:151
Header of Histogram distance functions.
Matrix operators.
Header of Tree distance methods.