A toolkit for working with phylogenetic data.
v0.19.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
nhd.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2018 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
39 
43 
46 
47 #include <algorithm>
48 #include <cassert>
49 #include <cmath>
50 #include <stdexcept>
51 
52 #ifdef GENESIS_OPENMP
53 # include <omp.h>
54 #endif
55 
56 namespace genesis {
57 namespace placement {
58 
59 // =================================================================================================
60 // Local Helper Functions
61 // =================================================================================================
62 
63 // -------------------------------------------------------------------------------------------------
64 // make_empty_node_distance_histogram_set
65 // -------------------------------------------------------------------------------------------------
66 
71  tree::Tree const& tree,
72  utils::Matrix<double> const& node_distances,
73  utils::Matrix<signed char> const& node_sides,
74  size_t const histogram_bins
75 ) {
76  auto const node_count = tree.node_count();
77  if( tree.empty() ) {
78  throw std::runtime_error( "Tree is empty. Cannot use Node Histogram Distance." );
79  }
80  if( node_distances.rows() != node_count || node_distances.cols() != node_count ) {
81  throw std::runtime_error( "Node Distance Matrix has wrong size." );
82  }
83  if( node_sides.rows() != node_count || node_sides.cols() != node_count ) {
84  throw std::runtime_error( "Node Sides Matrix has wrong size." );
85  }
86 
87  // Prepare a vector of histograms for each node of the tree.
88  // We init with default values, so that we can better parallelize later.
89  NodeDistanceHistogramSet histogram_set;
90  histogram_set.histograms.resize( node_count );
91 
92  // Make histograms that have enough room on both sides.
93  #pragma omp parallel for
94  for( size_t node_idx = 0; node_idx < node_count; ++node_idx ) {
95 
96  // Find furthest nodes on root and non-root sides.
97  // For now, we use both positive values, and later reverse the sign of the min entry.
98  double min = 0.0;
99  double max = 0.0;
100  for( size_t col_idx = 0; col_idx < node_count; ++col_idx ) {
101 
102  // No need to get a distance from the node to itself.
103  if( col_idx == node_idx ) {
104  // assert( node_sides( node_idx, col_idx ) == 0 );
105  continue;
106  }
107 
108  // Get distance between nodes.
109  auto const dist = node_distances( node_idx, col_idx );
110 
111  // If we are at a node that is on the root side, optimize max.
112  if( node_sides( node_idx, col_idx ) == 1 && dist > max ) {
113  max = dist;
114  }
115 
116  // If we are at a node that is on a non-root side, optimize min.
117  if( node_sides( node_idx, col_idx ) == -1 && dist > min ) {
118  min = dist;
119  }
120  }
121 
122  // If this fails, the tree is not usable.
123  if( min == 0.0 && max == 0.0 ) {
124  throw std::runtime_error(
125  "Tree only has branch lengths with value 0. Cannot use Node Histogram Distance."
126  );
127  }
128 
129  histogram_set.histograms[ node_idx ].min = -min;
130  histogram_set.histograms[ node_idx ].max = max;
131  histogram_set.histograms[ node_idx ].bins = std::vector<double>( histogram_bins, 0.0 );
132  }
133 
134  return histogram_set;
135 }
136 
137 // -------------------------------------------------------------------------------------------------
138 // fill_node_distance_histograms
139 // -------------------------------------------------------------------------------------------------
140 
145  Sample const& sample,
146  utils::Matrix<double> const& node_distances,
147  utils::Matrix<signed char> const& node_sides,
148  NodeDistanceHistogramSet& histogram_set
149 ) {
150  // Basic checks.
151  auto const node_count = sample.tree().node_count();
152  if( histogram_set.histograms.size() != node_count ) {
153  throw std::runtime_error( "Number of histograms does not equal number of tree nodes." );
154  }
155  if( node_distances.rows() != node_count || node_distances.cols() != node_count ) {
156  throw std::runtime_error( "Node Distance Matrix has wrong size." );
157  }
158  if( node_sides.rows() != node_count || node_sides.cols() != node_count ) {
159  throw std::runtime_error( "Node Sides Matrix has wrong size." );
160  }
161 
162  // Convert placements to plain form. We are later going to loop over them for every node of the
163  // tree, so this plain form speeds things up a lot there.
164  auto placements = std::vector<PqueryPlacementPlain>( total_placement_count(sample) );
165  size_t pcnt = 0;
166  for( auto const& pquery : sample ) {
167  double const mult = total_multiplicity( pquery );
168 
169  for( auto const& placement : pquery.placements() ) {
170  auto& place = placements[pcnt];
171 
172  place.edge_index = placement.edge().index();
173  place.primary_node_index = placement.edge().primary_node().index();
174  place.secondary_node_index = placement.edge().secondary_node().index();
175 
176  auto const& placement_data = placement.edge().data<PlacementEdgeData>();
177  place.branch_length = placement_data.branch_length;
178  place.pendant_length = placement.pendant_length;
179  place.proximal_length = placement.proximal_length;
180 
181  // We cheat a bit and store the multiplicity right here with the LWR.
182  // That is okay, because the data is only used within the function.
183  place.like_weight_ratio = placement.like_weight_ratio * mult;
184 
185  ++pcnt;
186  }
187  }
188 
189  // Fill the histogram of every node.
190  for( size_t node_index = 0; node_index < sample.tree().node_count(); ++node_index ) {
191 
192  // Prepare.
193  auto& histogram = histogram_set.histograms[ node_index ];
194  const auto min = histogram.min;
195  const auto max = histogram.max;
196  const auto bins = histogram.bins.size();
197  const double bin_width = ( max - min ) / static_cast<double>( bins );
198  double sum = 0.0;
199 
200  // Add all placements to the histogram for the current node.
201  for( auto const& placement : placements ) {
202 
203  // Get the distance from the placement to the current histogram node.
204  double const p_dist = placement.proximal_length
205  + node_distances( node_index, placement.primary_node_index )
206  ;
207  double const d_dist = placement.branch_length
208  - placement.proximal_length
209  + node_distances( node_index, placement.secondary_node_index )
210  ;
211  double const dist = std::min( p_dist, d_dist );
212 
213  // Get the side of the placement relative to the current node.
214  // Value 1 means, it is on the root side. Values 0 and -1 mean a non root side.
215  // Use this to determine the sign used to mark the position in the histogram.
216  auto const side = node_sides( node_index, placement.primary_node_index );
217  double const sign = ( side == 1 ? 1.0 : -1.0 );
218 
219  // Calcualte the bin index.
220  auto const x = sign * dist;
221  size_t bin = 0;
222  if( x < min ) {
223  bin = 0;
224  } else if( x >= max ) {
225  bin = bins - 1;
226  } else {
227  bin = static_cast<size_t>(( x - min ) / bin_width );
228  assert( bin < bins );
229  }
230 
231  // Accumulate the weight at the bin.
232  histogram.bins[ bin ] += placement.like_weight_ratio;
233  sum += placement.like_weight_ratio;
234  }
235 
236  // Normalize.
237  for( auto& val : histogram.bins ) {
238  val /= sum;
239  }
240  }
241 }
242 
243 // =================================================================================================
244 // Basic Functions
245 // =================================================================================================
246 
247 // -------------------------------------------------------------------------------------------------
248 // Sample node_distance_histogram_set
249 // -------------------------------------------------------------------------------------------------
250 
252  Sample const& sample,
253  utils::Matrix<double> const& node_distances,
254  utils::Matrix<signed char> const& node_sides,
255  size_t const histogram_bins
256 ) {
257  // Make the histograms, fill them, return them.
258  auto histograms = make_empty_node_distance_histogram_set(
259  sample.tree(), node_distances, node_sides, histogram_bins
260  );
261  fill_node_distance_histogram_set( sample, node_distances, node_sides, histograms );
262  return histograms;
263 }
264 
265 // -------------------------------------------------------------------------------------------------
266 // node_histogram_distance
267 // -------------------------------------------------------------------------------------------------
268 
270  NodeDistanceHistogram const& lhs,
271  NodeDistanceHistogram const& rhs
272 ) {
273  if( lhs.bins.size() != rhs.bins.size() || lhs.min != rhs.min || lhs.max != rhs.max ){
274  throw std::runtime_error(
275  "Cannot calculate distance between NodeDistanceHistograms of different dimensions."
276  );
277  }
278 
279  // Prepare.
280  auto entries = std::vector<double>( lhs.bins.size(), 0.0 );
281  double result = 0.0;
282  const double bin_width = ( lhs.max - lhs.min ) / static_cast<double>( lhs.bins.size() );
283 
284  // Loop and "move" masses.
285  for( size_t i = 0; i < lhs.bins.size() - 1; ++i) {
286  entries[i + 1] = lhs.bins[i] + entries[i] - rhs.bins[i];
287  result += std::abs( entries[i + 1] ) * bin_width;
288  }
289 
290  return result;
291 }
292 
294  NodeDistanceHistogramSet const& lhs,
295  NodeDistanceHistogramSet const& rhs
296 ) {
297  if( lhs.histograms.size() != rhs.histograms.size() ){
298  throw std::runtime_error(
299  "Cannot calculate distance between NodeDistanceHistogramSets of different size."
300  );
301  }
302 
303  // Sum up the emd distances of the histograms for each node of the tree in the
304  // two samples.
305  double dist = 0.0;
306  for( size_t k = 0; k < lhs.histograms.size(); ++k ) {
307  dist += node_histogram_distance( lhs.histograms[ k ], rhs.histograms[ k ] );
308  }
309  assert( dist >= 0.0 );
310 
311  // Return normalized distance.
312  dist /= static_cast< double >( lhs.histograms.size() );
313  return dist;
314 }
315 
317  std::vector<NodeDistanceHistogramSet> const& histogram_sets
318 ) {
319  // Init distance matrix.
320  auto const set_size = histogram_sets.size();
321  auto result = utils::Matrix<double>( set_size, set_size, 0.0 );
322 
323  // Parallel specialized code.
324  #ifdef GENESIS_OPENMP
325 
326  // We only need to calculate the upper triangle. Get the number of indices needed
327  // to describe this triangle.
328  size_t const max_k = utils::triangular_size( set_size );
329 
330  // Calculate distance matrix for every pair of samples.
331  #pragma omp parallel for
332  for( size_t k = 0; k < max_k; ++k ) {
333 
334  // For the given linear index, get the actual position in the Matrix.
335  auto const ij = utils::triangular_indices( k, set_size );
336  auto const i = ij.first;
337  auto const j = ij.second;
338 
339  // Calculate and store distance.
340  auto const dist = node_histogram_distance( histogram_sets[ i ], histogram_sets[ j ] );
341  result(i, j) = dist;
342  result(j, i) = dist;
343  }
344 
345  // If no threads are available at all, use serial version.
346  #else
347 
348  // Calculate distance matrix for every pair of samples.
349  for( size_t i = 0; i < set_size; ++i ) {
350 
351  // The result is symmetric - we only calculate the upper triangle.
352  for( size_t j = i + 1; j < set_size; ++j ) {
353 
354  // Calculate and store distance.
355  auto const dist = node_histogram_distance( histogram_sets[ i ], histogram_sets[ j ] );
356  result(i, j) = dist;
357  result(j, i) = dist;
358  }
359  }
360 
361  #endif
362 
363  return result;
364 }
365 
366 // =================================================================================================
367 // High Level Functions
368 // =================================================================================================
369 
370 // -------------------------------------------------------------------------------------------------
371 // Sample
372 // -------------------------------------------------------------------------------------------------
373 
375  Sample const& sample,
376  size_t const histogram_bins
377 ) {
378  // Calculate the pairwise distance between all pairs of nodes.
379  auto const node_distances = node_branch_length_distance_matrix( sample.tree() );
380 
381  // For each node, calculate which other nodes are on the root side subtree and which are not.
382  auto const node_sides = node_root_direction_matrix( sample.tree() );
383 
384  // Make the histograms, fill them, return them.
385  auto histograms = make_empty_node_distance_histogram_set(
386  sample.tree(), node_distances, node_sides, histogram_bins
387  );
388  fill_node_distance_histogram_set( sample, node_distances, node_sides, histograms );
389  return histograms;
390 }
391 
393  Sample const& sample_a,
394  Sample const& sample_b,
395  size_t const histogram_bins
396 ) {
397  if( ! compatible_trees( sample_a, sample_b ) ) {
398  throw std::invalid_argument( "Incompatible trees." );
399  }
400 
401  // Get the histograms describing the distances from placements to all nodes.
402  auto const hist_vec_a = node_distance_histogram_set( sample_a, histogram_bins );
403  auto const hist_vec_b = node_distance_histogram_set( sample_b, histogram_bins );
404 
405  // If the trees are compatible (as ensured in the beginning of this function), they need to have
406  // the same number of nodes. Thus, also there should be this number of histograms in the vectors.
407  assert( hist_vec_a.histograms.size() == sample_a.tree().node_count() );
408  assert( hist_vec_b.histograms.size() == sample_b.tree().node_count() );
409  assert( hist_vec_a.histograms.size() == hist_vec_b.histograms.size() );
410 
411  return node_histogram_distance( hist_vec_a, hist_vec_b );
412 }
413 
414 // -------------------------------------------------------------------------------------------------
415 // Sample Set
416 // -------------------------------------------------------------------------------------------------
417 
421 std::vector<NodeDistanceHistogramSet> node_distance_histogram_set(
422  SampleSet const& sample_set,
423  size_t const histogram_bins
424 ) {
425  auto const set_size = sample_set.size();
426 
427  // Edge case.
428  if( set_size == 0 ) {
429  return {};
430  }
431 
432  // Prepare lookup for the trees. This assumes identical trees for all samples.
433  auto const node_distances = node_branch_length_distance_matrix( sample_set[0].sample.tree() );
434  auto const node_sides = node_root_direction_matrix( sample_set[0].sample.tree() );
435 
436  // Prepare histograms for all samples, by copying empty histograms for the first sample.
437  auto const empty_hist = make_empty_node_distance_histogram_set(
438  sample_set[ 0 ].sample.tree(), node_distances, node_sides, histogram_bins
439  );
440  auto result = std::vector<NodeDistanceHistogramSet>( set_size, empty_hist );
441 
442  // Calculate the histograms for all samples.
443  #pragma omp parallel for schedule(dynamic)
444  for( size_t i = 0; i < set_size; ++i ) {
445 
446  // Check compatibility.
447  // It suffices to check adjacent pairs of samples, as compatibility is transitive.
448  if( i > 0 ) {
449  if( ! compatible_trees( sample_set[ i - 1 ].sample, sample_set[ i ].sample )) {
450  throw std::invalid_argument(
451  "Trees in SampleSet not compatible for calculating Node Histogram Distance."
452  );
453  }
454  }
455 
456  // Fill the histograms for every node of the sample.
458  sample_set[ i ].sample, node_distances, node_sides, result[ i ]
459  );
460  assert( result[ i ].histograms.size() == sample_set[ i ].sample.tree().node_count() );
461  }
462 
463  return result;
464 }
465 
467  SampleSet const& sample_set,
468  size_t const histogram_bins
469 ) {
470  // Get the histograms and calculate the distance.
471  auto const hist_vecs = node_distance_histogram_set( sample_set, histogram_bins );
472  return node_histogram_distance( hist_vecs );
473 }
474 
475 } // namespace placement
476 } // namespace genesis
Data class for PlacementTreeEdges. Stores the branch length of the edge, and the edge_num, as defined in the jplace standard.
PlacementTree & tree()
Get the PlacementTree of this Sample.
Definition: sample.cpp:119
bool compatible_trees(PlacementTree const &lhs, PlacementTree const &rhs)
Return whether two PlacementTrees are compatible.
std::vector< double > bins
Definition: nhd.hpp:79
utils::Matrix< signed char > node_root_direction_matrix(Tree const &tree)
Calculate a Matrix that indicates the nodes on the root side of a given node.
std::vector< NodeDistanceHistogram > histograms
Definition: nhd.hpp:87
void fill_node_distance_histogram_set(Sample const &sample, utils::Matrix< double > const &node_distances, utils::Matrix< signed char > const &node_sides, NodeDistanceHistogramSet &histogram_set)
Fill the placements of a Sample into Histograms.
Definition: nhd.cpp:144
Provides functions for working with Placements and Pqueries.
NodeDistanceHistogramSet node_distance_histogram_set(Sample const &sample, utils::Matrix< double > const &node_distances, utils::Matrix< signed char > const &node_sides, size_t const histogram_bins)
Calcualte the NodeDistanceHistogramSet representing a single Sample, given the necessary matrices of ...
Definition: nhd.cpp:251
double sum(const Histogram &h)
double node_histogram_distance(NodeDistanceHistogram const &lhs, NodeDistanceHistogram const &rhs)
Definition: nhd.cpp:269
size_t node_count() const
Return the number of TreeNodes of the Tree.
Definition: tree/tree.cpp:350
Matrix operators.
Collection of NodeDistanceHistograms that describes one Sample.
Definition: nhd.hpp:85
double total_multiplicity(Pquery const &pqry)
Return the sum of all multiplicities of the Pquery.
Class for representing phylogenetic trees.
Definition: tree/tree.hpp:95
size_t triangular_size(size_t n)
Calculate the number of linear indices needed for a triangular Matrix of size n.
utils::Matrix< double > node_branch_length_distance_matrix(Tree const &tree)
Return a distance matrix containing pairwise distances between all Nodes, using the branch_length of ...
bool empty() const
Return whether the Tree is empty (i.e., has no nodes, edges and links).
Definition: tree/tree.cpp:222
Store a set of Samples with associated names.
Definition: sample_set.hpp:52
Header of Default Tree distance methods.
Header of PqueryPlain class.
size_t size() const
Return the size of the SampleSet, i.e., the number of Samples.
Definition: sample_set.hpp:234
Simple histogram data structure with equal sized bins.
Definition: nhd.hpp:75
double branch_length
Branch length of the edge.
std::pair< size_t, size_t > triangular_indices(size_t k, size_t n)
Given a linear index in a upper triangular Matrix, find the corresponding Matrix indices.
Manage a set of Pqueries along with the PlacementTree where the PqueryPlacements are placed on...
Definition: sample.hpp:68
size_t node_count(Tree const &tree)
Return the number of Nodes of a Tree. Same as Tree::node_count().
Header of Tree distance methods.
NodeDistanceHistogramSet make_empty_node_distance_histogram_set(tree::Tree const &tree, utils::Matrix< double > const &node_distances, utils::Matrix< signed char > const &node_sides, size_t const histogram_bins)
Create a set of Histograms without any weights for a given Tree.
Definition: nhd.cpp:70
size_t total_placement_count(Sample const &smp)
Get the total number of PqueryPlacements in all Pqueries of the given Sample.