A toolkit for working with phylogenetic data.
v0.24.0
sample_set.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2019 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
38 
39 #include <ostream>
40 
41 namespace genesis {
42 namespace placement {
43 
44 // =================================================================================================
45 // Sample Set Functions
46 // =================================================================================================
47 
48 Sample* find_sample( SampleSet& sample_set, std::string const& name )
49 {
50  for( size_t i = 0; i < sample_set.size(); ++i ) {
51  if( sample_set.name_at(i) == name ) {
52  return &sample_set[i];
53  }
54  }
55  return nullptr;
56 }
57 
58 Sample const* find_sample( SampleSet const& sample_set, std::string const& name )
59 {
60  for( size_t i = 0; i < sample_set.size(); ++i ) {
61  if( sample_set.name_at(i) == name ) {
62  return &sample_set[i];
63  }
64  }
65  return nullptr;
66 }
67 
68 Sample merge_all( SampleSet const& sample_set )
69 {
70  // The following operations do a lot of traversals on all trees: first some for the
71  // average_branch_length_tree, then for the merging again. This could be turned into
72  // less traversals by copying code and doing all in one run. However, at the current point, this
73  // method will be called once in the beginning of a program run, and thus it is not necessary to
74  // optimize for speed. Instead, we opt for clean, separated and easy code here.
75 
76  if( sample_set.size() == 0 ) {
77  return Sample();
78  }
79 
80  // Create a new Sample and initialize it with the average branch length tree of all
81  // maps in this set, but without any placements.
82  auto res = Sample( average_branch_length_tree( sample_set ));
83 
84  // Copy the rest of the data from the first tree to the averaged tree.
85  // This is necessary, because the tree copy constructor does not do this for us.
86  // TODO fix this!
87  for (size_t i = 0; i < res.tree().node_count(); ++i) {
88  res.tree().node_at(i).data<PlacementNodeData>().name
89  = sample_set[0].tree().node_at(i).data<PlacementNodeData>().name;
90  }
91  for (size_t i = 0; i < res.tree().edge_count(); ++i) {
92  res.tree().edge_at(i).data<PlacementEdgeData>().reset_edge_num(
93  sample_set[0].tree().edge_at(i).data<PlacementEdgeData>().edge_num()
94  );
95  }
96 
97  // Add the placements from all maps of this set.
98  // In the merge method, we also check for identical topology (again), but mainly for identical
99  // taxa names and edge_nums, which is important for correct merging.
100  for (auto& smp : sample_set) {
101  copy_pqueries( smp, res );
102  }
103 
104  return res;
105 }
106 
107 size_t total_pquery_count( SampleSet const& sample_set )
108 {
109  size_t s = 0;
110  for( auto const& sample : sample_set ) {
111  s += sample.size();
112  }
113  return s;
114 }
115 
116 // =================================================================================================
117 // Tree Functions
118 // =================================================================================================
119 
121 {
122  return average_branch_length_tree( tree_set( sample_set ));
123 }
124 
125 bool all_identical_trees( SampleSet const& sample_set )
126 {
127  auto node_comparator = [] (
128  PlacementTreeNode const& node_l,
129  PlacementTreeNode const& node_r
130  ) {
131  auto l_ptr = dynamic_cast< PlacementNodeData const* >( node_l.data_ptr() );
132  auto r_ptr = dynamic_cast< PlacementNodeData const* >( node_r.data_ptr() );
133  if( l_ptr == nullptr || r_ptr == nullptr ) {
134  return false;
135  }
136  return (( l_ptr->name == r_ptr->name ) && ( node_l.index() == node_r.index() ));
137  };
138 
139  auto edge_comparator = [] (
140  PlacementTreeEdge const& edge_l,
141  PlacementTreeEdge const& edge_r
142  ) {
143  auto l_ptr = dynamic_cast< PlacementEdgeData const* >( edge_l.data_ptr() );
144  auto r_ptr = dynamic_cast< PlacementEdgeData const* >( edge_r.data_ptr() );
145  if( l_ptr == nullptr || r_ptr == nullptr ) {
146  return false;
147  }
148  return ( l_ptr->edge_num() == r_ptr->edge_num() ) &&
149  ( edge_l.primary_node().index() == edge_r.primary_node().index() ) &&
150  ( edge_l.secondary_node().index() == edge_r.secondary_node().index());
151  };
152 
153  return equal( tree_set( sample_set ), node_comparator, edge_comparator );
154 }
155 
156 tree::TreeSet tree_set( SampleSet const& sample_set )
157 {
158  tree::TreeSet tset;
159  for( size_t i = 0; i < sample_set.size(); ++i ) {
160  tset.add( sample_set[i].tree(), sample_set.name_at(i) );
161  }
162  return tset;
163 }
164 
165 void adjust_branch_lengths( SampleSet& sample_set, tree::Tree const& source )
166 {
167  for( auto& smp : sample_set ) {
168  adjust_branch_lengths( smp, source );
169  }
170 }
171 
173 {
174  adjust_branch_lengths( sample_set, average_branch_length_tree( sample_set ));
175 }
176 
177 // =================================================================================================
178 // Output
179 // =================================================================================================
180 
181 std::ostream& operator << ( std::ostream& out, SampleSet const& sample_set )
182 {
183  // TODO this was meant for full output. turn it into a printer instead!
184  bool full = false;
185 
186  for( size_t i = 0; i < sample_set.size(); ++i ) {
187  out << std::to_string(i) << ": " << sample_set.name_at(i) << "\n";
188  if (full) {
189  out << sample_set[i] << "\n";
190  }
191  }
192  return out;
193 }
194 
195 } // namespace placement
196 } // namespace genesis
size_t size() const
Return the size of the SampleSet, i.e., the number of Samples.
Definition: sample_set.hpp:234
void add(Tree const &tree, std::string const &name="")
Add a Tree with a name to the TreeSet.
Definition: tree_set.hpp:88
Data class for PlacementTreeEdges. Stores the branch length of the edge, and the edge_num, as defined in the jplace standard.
void copy_pqueries(Sample const &source, Sample &target)
Copy all Pqueries from the source Sample (left parameter) to the target Sample (right parameter)...
CommonTree functions.
Provides functions for working with Placements and Pqueries.
Sample merge_all(SampleSet const &sample_set)
Returns a Sample where all Samples of a SampleSet have been merged into.
Definition: sample_set.cpp:68
TreeNode & secondary_node()
Return the TreeNode of this TreeEdge that points away from the root.
Definition: edge.hpp:162
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
tree::Tree average_branch_length_tree(SampleSet const &sample_set)
Return the Tree that has edges with the average branch length of the respective edges of the Trees in...
Definition: sample_set.cpp:120
TreeNode & primary_node()
Return the TreeNode of this TreeEdge that points towards the root.
Definition: edge.hpp:146
tree::TreeSet tree_set(SampleSet const &sample_set)
Return a TreeSet containing all the trees of the SampleSet.
Definition: sample_set.cpp:156
Sample * find_sample(SampleSet &sample_set, std::string const &name)
Get the first Sample in a SampleSet that has a given name, or nullptr if not found.
Definition: sample_set.cpp:48
BaseEdgeData * data_ptr()
Return a pointer to the data.
Definition: edge.hpp:246
Class for representing phylogenetic trees.
Definition: tree/tree.hpp:97
void adjust_to_average_branch_lengths(SampleSet &sample_set)
Set the branch lengths of all Samples in the sample_set to the respecitve average branch length of th...
Definition: sample_set.cpp:172
bool all_identical_trees(SampleSet const &sample_set)
Returns true iff all Trees of the Samples in the set are identical.
Definition: sample_set.cpp:125
Store a set of Samples with associated names.
Definition: sample_set.hpp:54
std::string const & name_at(size_t index) const
Definition: sample_set.hpp:144
void adjust_branch_lengths(Sample &sample, tree::Tree const &source)
Take the branch lengths of the source Tree and use them as the new branch lengths of the sample...
size_t total_pquery_count(SampleSet const &sample_set)
Return the total number of Pqueries in the Samples of the SampleSet.
Definition: sample_set.cpp:107
Data class for PlacementTreeNodes. Stores a node name.
Manage a set of Pqueries along with the PlacementTree where the PqueryPlacements are placed on...
Definition: sample.hpp:68
std::shared_ptr< BaseOutputTarget > to_string(std::string &target_string)
Obtain an output target for writing to a string.
size_t index() const
Return the index of this Node.
Definition: node.hpp:102
bool equal(Tree const &lhs, Tree const &rhs, std::function< bool(TreeNode const &, TreeNode const &) > node_comparator, std::function< bool(TreeEdge const &, TreeEdge const &) > edge_comparator)
Compare two trees for equality given binary comparator functionals for their nodes and edges...
BaseNodeData * data_ptr()
Return a pointer to the data.
Definition: node.hpp:232
std::ostream & operator<<(std::ostream &out, Sample const &smp)
Print a table of all Pqueries with their Placements and Names to the stream.