A toolkit for working with phylogenetic data.
v0.20.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
nhd.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2018 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
40 
44 
47 
48 #include <algorithm>
49 #include <cassert>
50 #include <cmath>
51 #include <stdexcept>
52 
53 #ifdef GENESIS_OPENMP
54 # include <omp.h>
55 #endif
56 
57 namespace genesis {
58 namespace placement {
59 
60 // =================================================================================================
61 // Local Helper Functions
62 // =================================================================================================
63 
64 // -------------------------------------------------------------------------------------------------
65 // make_empty_node_distance_histogram_set
66 // -------------------------------------------------------------------------------------------------
67 
72  tree::Tree const& tree,
73  utils::Matrix<double> const& node_distances,
74  utils::Matrix<signed char> const& node_sides,
75  size_t const histogram_bins
76 ) {
77  auto const node_count = tree.node_count();
78  if( tree.empty() ) {
79  throw std::runtime_error( "Tree is empty. Cannot use Node Histogram Distance." );
80  }
81  if( node_distances.rows() != node_count || node_distances.cols() != node_count ) {
82  throw std::runtime_error( "Node Distance Matrix has wrong size." );
83  }
84  if( node_sides.rows() != node_count || node_sides.cols() != node_count ) {
85  throw std::runtime_error( "Node Sides Matrix has wrong size." );
86  }
87 
88  // Prepare a vector of histograms for each node of the tree.
89  // We init with default values, so that we can better parallelize later.
90  NodeDistanceHistogramSet histogram_set;
91  histogram_set.histograms.resize( node_count );
92 
93  // Make histograms that have enough room on both sides.
94  #pragma omp parallel for
95  for( size_t node_idx = 0; node_idx < node_count; ++node_idx ) {
96 
97  // Find furthest nodes on root and non-root sides.
98  // For now, we use both positive values, and later reverse the sign of the min entry.
99  double min = 0.0;
100  double max = 0.0;
101  for( size_t col_idx = 0; col_idx < node_count; ++col_idx ) {
102 
103  // No need to get a distance from the node to itself.
104  if( col_idx == node_idx ) {
105  // assert( node_sides( node_idx, col_idx ) == 0 );
106  continue;
107  }
108 
109  // Get distance between nodes.
110  auto const dist = node_distances( node_idx, col_idx );
111 
112  // If we are at a node that is on the root side, optimize max.
113  if( node_sides( node_idx, col_idx ) == 1 && dist > max ) {
114  max = dist;
115  }
116 
117  // If we are at a node that is on a non-root side, optimize min.
118  if( node_sides( node_idx, col_idx ) == -1 && dist > min ) {
119  min = dist;
120  }
121  }
122 
123  // If this fails, the tree is not usable.
124  if( min == 0.0 && max == 0.0 ) {
125  throw std::runtime_error(
126  "Tree only has branch lengths with value 0. Cannot use Node Histogram Distance."
127  );
128  }
129 
130  histogram_set.histograms[ node_idx ].min = -min;
131  histogram_set.histograms[ node_idx ].max = max;
132  histogram_set.histograms[ node_idx ].bins = std::vector<double>( histogram_bins, 0.0 );
133  }
134 
135  return histogram_set;
136 }
137 
138 // -------------------------------------------------------------------------------------------------
139 // fill_node_distance_histograms
140 // -------------------------------------------------------------------------------------------------
141 
146  Sample const& sample,
147  utils::Matrix<double> const& node_distances,
148  utils::Matrix<signed char> const& node_sides,
149  NodeDistanceHistogramSet& histogram_set
150 ) {
151  // Basic checks.
152  auto const node_count = sample.tree().node_count();
153  if( histogram_set.histograms.size() != node_count ) {
154  throw std::runtime_error( "Number of histograms does not equal number of tree nodes." );
155  }
156  if( node_distances.rows() != node_count || node_distances.cols() != node_count ) {
157  throw std::runtime_error( "Node Distance Matrix has wrong size." );
158  }
159  if( node_sides.rows() != node_count || node_sides.cols() != node_count ) {
160  throw std::runtime_error( "Node Sides Matrix has wrong size." );
161  }
162 
163  // Convert placements to plain form. We are later going to loop over them for every node of the
164  // tree, so this plain form speeds things up a lot there.
165  auto placements = std::vector<PqueryPlacementPlain>( total_placement_count(sample) );
166  size_t pcnt = 0;
167  for( auto const& pquery : sample ) {
168  double const mult = total_multiplicity( pquery );
169 
170  for( auto const& placement : pquery.placements() ) {
171  auto& place = placements[pcnt];
172 
173  place.edge_index = placement.edge().index();
174  place.primary_node_index = placement.edge().primary_node().index();
175  place.secondary_node_index = placement.edge().secondary_node().index();
176 
177  auto const& placement_data = placement.edge().data<PlacementEdgeData>();
178  place.branch_length = placement_data.branch_length;
179  place.pendant_length = placement.pendant_length;
180  place.proximal_length = placement.proximal_length;
181 
182  // We cheat a bit and store the multiplicity right here with the LWR.
183  // That is okay, because the data is only used within the function.
184  place.like_weight_ratio = placement.like_weight_ratio * mult;
185 
186  ++pcnt;
187  }
188  }
189 
190  // Fill the histogram of every node.
191  for( size_t node_index = 0; node_index < sample.tree().node_count(); ++node_index ) {
192 
193  // Prepare.
194  auto& histogram = histogram_set.histograms[ node_index ];
195  const auto min = histogram.min;
196  const auto max = histogram.max;
197  const auto bins = histogram.bins.size();
198  const double bin_width = ( max - min ) / static_cast<double>( bins );
199  double sum = 0.0;
200 
201  // Add all placements to the histogram for the current node.
202  for( auto const& placement : placements ) {
203 
204  // Get the distance from the placement to the current histogram node.
205  double const p_dist = placement.proximal_length
206  + node_distances( node_index, placement.primary_node_index )
207  ;
208  double const d_dist = placement.branch_length
209  - placement.proximal_length
210  + node_distances( node_index, placement.secondary_node_index )
211  ;
212  double const dist = std::min( p_dist, d_dist );
213 
214  // Get the side of the placement relative to the current node.
215  // Value 1 means, it is on the root side. Values 0 and -1 mean a non root side.
216  // Use this to determine the sign used to mark the position in the histogram.
217  auto const side = node_sides( node_index, placement.primary_node_index );
218  double const sign = ( side == 1 ? 1.0 : -1.0 );
219 
220  // Calcualte the bin index.
221  auto const x = sign * dist;
222  size_t bin = 0;
223  if( x < min ) {
224  bin = 0;
225  } else if( x >= max ) {
226  bin = bins - 1;
227  } else {
228  bin = static_cast<size_t>(( x - min ) / bin_width );
229  assert( bin < bins );
230  }
231 
232  // Accumulate the weight at the bin.
233  histogram.bins[ bin ] += placement.like_weight_ratio;
234  sum += placement.like_weight_ratio;
235  }
236 
237  // Normalize.
238  for( auto& val : histogram.bins ) {
239  val /= sum;
240  }
241  }
242 }
243 
244 // =================================================================================================
245 // Basic Functions
246 // =================================================================================================
247 
248 // -------------------------------------------------------------------------------------------------
249 // Sample node_distance_histogram_set
250 // -------------------------------------------------------------------------------------------------
251 
253  Sample const& sample,
254  utils::Matrix<double> const& node_distances,
255  utils::Matrix<signed char> const& node_sides,
256  size_t const histogram_bins
257 ) {
258  // Make the histograms, fill them, return them.
259  auto histograms = make_empty_node_distance_histogram_set(
260  sample.tree(), node_distances, node_sides, histogram_bins
261  );
262  fill_node_distance_histogram_set( sample, node_distances, node_sides, histograms );
263  return histograms;
264 }
265 
266 // -------------------------------------------------------------------------------------------------
267 // node_histogram_distance
268 // -------------------------------------------------------------------------------------------------
269 
271  NodeDistanceHistogram const& lhs,
272  NodeDistanceHistogram const& rhs
273 ) {
274  if( lhs.bins.size() != rhs.bins.size() || lhs.min != rhs.min || lhs.max != rhs.max ){
275  throw std::runtime_error(
276  "Cannot calculate distance between NodeDistanceHistograms of different dimensions."
277  );
278  }
279 
280  // Prepare.
281  auto entries = std::vector<double>( lhs.bins.size(), 0.0 );
282  double result = 0.0;
283  const double bin_width = ( lhs.max - lhs.min ) / static_cast<double>( lhs.bins.size() );
284 
285  // Loop and "move" masses.
286  for( size_t i = 0; i < lhs.bins.size() - 1; ++i) {
287  entries[i + 1] = lhs.bins[i] + entries[i] - rhs.bins[i];
288  result += std::abs( entries[i + 1] ) * bin_width;
289  }
290 
291  return result;
292 }
293 
295  NodeDistanceHistogramSet const& lhs,
296  NodeDistanceHistogramSet const& rhs
297 ) {
298  if( lhs.histograms.size() != rhs.histograms.size() ){
299  throw std::runtime_error(
300  "Cannot calculate distance between NodeDistanceHistogramSets of different size."
301  );
302  }
303 
304  // Sum up the emd distances of the histograms for each node of the tree in the
305  // two samples.
306  double dist = 0.0;
307  for( size_t k = 0; k < lhs.histograms.size(); ++k ) {
308  dist += node_histogram_distance( lhs.histograms[ k ], rhs.histograms[ k ] );
309  }
310  assert( dist >= 0.0 );
311 
312  // Return normalized distance.
313  dist /= static_cast< double >( lhs.histograms.size() );
314  return dist;
315 }
316 
318  std::vector<NodeDistanceHistogramSet> const& histogram_sets
319 ) {
320  // Init distance matrix.
321  auto const set_size = histogram_sets.size();
322  auto result = utils::Matrix<double>( set_size, set_size, 0.0 );
323 
324  // Parallel specialized code.
325  #ifdef GENESIS_OPENMP
326 
327  // We only need to calculate the upper triangle. Get the number of indices needed
328  // to describe this triangle.
329  size_t const max_k = utils::triangular_size( set_size );
330 
331  // Calculate distance matrix for every pair of samples.
332  #pragma omp parallel for
333  for( size_t k = 0; k < max_k; ++k ) {
334 
335  // For the given linear index, get the actual position in the Matrix.
336  auto const ij = utils::triangular_indices( k, set_size );
337  auto const i = ij.first;
338  auto const j = ij.second;
339 
340  // Calculate and store distance.
341  auto const dist = node_histogram_distance( histogram_sets[ i ], histogram_sets[ j ] );
342  result(i, j) = dist;
343  result(j, i) = dist;
344  }
345 
346  // If no threads are available at all, use serial version.
347  #else
348 
349  // Calculate distance matrix for every pair of samples.
350  for( size_t i = 0; i < set_size; ++i ) {
351 
352  // The result is symmetric - we only calculate the upper triangle.
353  for( size_t j = i + 1; j < set_size; ++j ) {
354 
355  // Calculate and store distance.
356  auto const dist = node_histogram_distance( histogram_sets[ i ], histogram_sets[ j ] );
357  result(i, j) = dist;
358  result(j, i) = dist;
359  }
360  }
361 
362  #endif
363 
364  return result;
365 }
366 
367 // =================================================================================================
368 // High Level Functions
369 // =================================================================================================
370 
371 // -------------------------------------------------------------------------------------------------
372 // Sample
373 // -------------------------------------------------------------------------------------------------
374 
376  Sample const& sample,
377  size_t const histogram_bins
378 ) {
379  // Calculate the pairwise distance between all pairs of nodes.
380  auto const node_distances = node_branch_length_distance_matrix( sample.tree() );
381 
382  // For each node, calculate which other nodes are on the root side subtree and which are not.
383  auto const node_sides = node_root_direction_matrix( sample.tree() );
384 
385  // Make the histograms, fill them, return them.
386  auto histograms = make_empty_node_distance_histogram_set(
387  sample.tree(), node_distances, node_sides, histogram_bins
388  );
389  fill_node_distance_histogram_set( sample, node_distances, node_sides, histograms );
390  return histograms;
391 }
392 
394  Sample const& sample_a,
395  Sample const& sample_b,
396  size_t const histogram_bins
397 ) {
398  if( ! compatible_trees( sample_a, sample_b ) ) {
399  throw std::invalid_argument( "Incompatible trees." );
400  }
401 
402  // Get the histograms describing the distances from placements to all nodes.
403  auto const hist_vec_a = node_distance_histogram_set( sample_a, histogram_bins );
404  auto const hist_vec_b = node_distance_histogram_set( sample_b, histogram_bins );
405 
406  // If the trees are compatible (as ensured in the beginning of this function), they need to have
407  // the same number of nodes. Thus, also there should be this number of histograms in the vectors.
408  assert( hist_vec_a.histograms.size() == sample_a.tree().node_count() );
409  assert( hist_vec_b.histograms.size() == sample_b.tree().node_count() );
410  assert( hist_vec_a.histograms.size() == hist_vec_b.histograms.size() );
411 
412  return node_histogram_distance( hist_vec_a, hist_vec_b );
413 }
414 
415 // -------------------------------------------------------------------------------------------------
416 // Sample Set
417 // -------------------------------------------------------------------------------------------------
418 
422 std::vector<NodeDistanceHistogramSet> node_distance_histogram_set(
423  SampleSet const& sample_set,
424  size_t const histogram_bins
425 ) {
426  auto const set_size = sample_set.size();
427 
428  // Edge case.
429  if( set_size == 0 ) {
430  return {};
431  }
432 
433  // Prepare lookup for the trees. This assumes identical trees for all samples.
434  auto const node_distances = node_branch_length_distance_matrix( sample_set[0].sample.tree() );
435  auto const node_sides = node_root_direction_matrix( sample_set[0].sample.tree() );
436 
437  // Prepare histograms for all samples, by copying empty histograms for the first sample.
438  auto const empty_hist = make_empty_node_distance_histogram_set(
439  sample_set[ 0 ].sample.tree(), node_distances, node_sides, histogram_bins
440  );
441  auto result = std::vector<NodeDistanceHistogramSet>( set_size, empty_hist );
442 
443  // Calculate the histograms for all samples.
444  #pragma omp parallel for schedule(dynamic)
445  for( size_t i = 0; i < set_size; ++i ) {
446 
447  // Check compatibility.
448  // It suffices to check adjacent pairs of samples, as compatibility is transitive.
449  if( i > 0 ) {
450  if( ! compatible_trees( sample_set[ i - 1 ].sample, sample_set[ i ].sample )) {
451  throw std::invalid_argument(
452  "Trees in SampleSet not compatible for calculating Node Histogram Distance."
453  );
454  }
455  }
456 
457  // Fill the histograms for every node of the sample.
459  sample_set[ i ].sample, node_distances, node_sides, result[ i ]
460  );
461  assert( result[ i ].histograms.size() == sample_set[ i ].sample.tree().node_count() );
462  }
463 
464  return result;
465 }
466 
468  SampleSet const& sample_set,
469  size_t const histogram_bins
470 ) {
471  // Get the histograms and calculate the distance.
472  auto const hist_vecs = node_distance_histogram_set( sample_set, histogram_bins );
473  return node_histogram_distance( hist_vecs );
474 }
475 
476 } // namespace placement
477 } // namespace genesis
Data class for PlacementTreeEdges. Stores the branch length of the edge, and the edge_num, as defined in the jplace standard.
PlacementTree & tree()
Get the PlacementTree of this Sample.
Definition: sample.cpp:119
bool compatible_trees(PlacementTree const &lhs, PlacementTree const &rhs)
Return whether two PlacementTrees are compatible.
std::vector< double > bins
Definition: nhd.hpp:79
utils::Matrix< signed char > node_root_direction_matrix(Tree const &tree)
Calculate a Matrix that indicates the nodes on the root side of a given node.
std::vector< NodeDistanceHistogram > histograms
Definition: nhd.hpp:87
void fill_node_distance_histogram_set(Sample const &sample, utils::Matrix< double > const &node_distances, utils::Matrix< signed char > const &node_sides, NodeDistanceHistogramSet &histogram_set)
Fill the placements of a Sample into Histograms.
Definition: nhd.cpp:145
Provides functions for working with Placements and Pqueries.
NodeDistanceHistogramSet node_distance_histogram_set(Sample const &sample, utils::Matrix< double > const &node_distances, utils::Matrix< signed char > const &node_sides, size_t const histogram_bins)
Calcualte the NodeDistanceHistogramSet representing a single Sample, given the necessary matrices of ...
Definition: nhd.cpp:252
double sum(const Histogram &h)
double node_histogram_distance(NodeDistanceHistogram const &lhs, NodeDistanceHistogram const &rhs)
Definition: nhd.cpp:270
size_t node_count() const
Return the number of TreeNodes of the Tree.
Definition: tree/tree.cpp:350
Matrix operators.
Collection of NodeDistanceHistograms that describes one Sample.
Definition: nhd.hpp:85
double total_multiplicity(Pquery const &pqry)
Return the sum of all multiplicities of the Pquery.
Definition: masses.cpp:65
Class for representing phylogenetic trees.
Definition: tree/tree.hpp:95
size_t triangular_size(size_t n)
Calculate the number of linear indices needed for a triangular Matrix of size n.
utils::Matrix< double > node_branch_length_distance_matrix(Tree const &tree)
Return a distance matrix containing pairwise distances between all Nodes, using the branch_length of ...
bool empty() const
Return whether the Tree is empty (i.e., has no nodes, edges and links).
Definition: tree/tree.cpp:222
Store a set of Samples with associated names.
Definition: sample_set.hpp:52
Header of Default Tree distance methods.
Header of PqueryPlain class.
size_t size() const
Return the size of the SampleSet, i.e., the number of Samples.
Definition: sample_set.hpp:234
Simple histogram data structure with equal sized bins.
Definition: nhd.hpp:75
double branch_length
Branch length of the edge.
std::pair< size_t, size_t > triangular_indices(size_t k, size_t n)
Given a linear index in a upper triangular Matrix, find the corresponding Matrix indices.
Manage a set of Pqueries along with the PlacementTree where the PqueryPlacements are placed on...
Definition: sample.hpp:68
size_t node_count(Tree const &tree)
Return the number of Nodes of a Tree. Same as Tree::node_count().
Header of Tree distance methods.
NodeDistanceHistogramSet make_empty_node_distance_histogram_set(tree::Tree const &tree, utils::Matrix< double > const &node_distances, utils::Matrix< signed char > const &node_sides, size_t const histogram_bins)
Create a set of Histograms without any weights for a given Tree.
Definition: nhd.cpp:71
size_t total_placement_count(Sample const &smp)
Get the total number of PqueryPlacements in all Pqueries of the given Sample.