A toolkit for working with phylogenetic data.
v0.24.0
serializer.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2020 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
39 
#include <exception>
#include <stdexcept>

#ifdef GENESIS_OPENMP
#   include <omp.h>
#endif
45 
46 namespace genesis {
47 namespace placement {
48 
49 // =================================================================================================
50 // Version
51 // =================================================================================================
52 
57 unsigned char SampleSerializer::version = 1;
58 
59 // =================================================================================================
60 // Save
61 // =================================================================================================
62 
66 void SampleSerializer::save( Sample const& map, std::string const& file_name )
67 {
68  // Prepare.
69  utils::Serializer ser (file_name);
70  if (!ser) {
71  throw std::invalid_argument("Serialization failed.");
72  }
73 
74  // Write header.
75  char magic[] = "BPLACE\0\0";
76  ser.put_raw(magic, 8);
77  ser.put_int<unsigned char>(version);
78 
79  // Write tree.
80  // TODO if there is a tree serialization in the future, this one could be used here, and in
81  // addition to edge numbers, the edge indices can be stored, so that deserialization is easier.
82  auto nw = PlacementTreeNewickWriter();
83  nw.enable_names(true);
84  nw.enable_branch_lengths(true);
85  std::string tree;
86  nw.write( map.tree(), utils::to_string( tree ));
87  ser.put_string( std::move( tree ));
88 
89  // Write pqueries.
90  ser.put_int(map.size());
91  for (auto& pqry : map.pqueries()) {
92 
93  // Write placements.
94  ser.put_int(pqry.placement_size());
95  for( auto const& place : pqry.placements() ) {
96  // We set the edge index instead of edge num. This is faster, simpler to resorte, and
97  // consinstend with Pquery.add_placement() parameters.
98  ser.put_int (place.edge().index());
99 
100  ser.put_float( place.likelihood );
101  ser.put_float( place.like_weight_ratio );
102  ser.put_float( place.proximal_length );
103  ser.put_float( place.pendant_length );
104  }
105 
106  // Write names.
107  ser.put_int(pqry.name_size());
108  for( auto const& name : pqry.names() ) {
109  ser.put_string (name.name);
110  ser.put_float (name.multiplicity);
111  }
112  }
113 }
114 
115 // =================================================================================================
116 // Load
117 // =================================================================================================
118 
122 Sample SampleSerializer::load( std::string const& file_name )
123 {
124  // Create returned object.
125  Sample map;
126 
127  // Prepare, check stream status.
128  utils::Deserializer des( file_name );
129  if( ! des ) {
130  throw std::invalid_argument( "Deserialization failed: Cannot open file." );
131  }
132 
133  // Read and check header.
134  std::string magic = des.get_raw_string(8);
135  if (strncmp (magic.c_str(), "BPLACE\0\0", 8) != 0) {
136  throw std::invalid_argument("Wrong file format: \"" + magic + "\".");
137  }
138  auto ver = des.get_int<unsigned char>();
139  if (ver != version) {
140  throw std::invalid_argument("Wrong serialization version: " + std::to_string(ver));
141  }
142 
143  // Read and check tree.
144  auto tree_string = des.get_string();
145  map.tree() = PlacementTreeNewickReader().read( utils::from_string( tree_string ));
146 
147  // Read pqueries.
148  size_t num_pqueries = des.get_int<size_t>();
149  for (size_t i = 0; i < num_pqueries; ++i) {
150  Pquery& pqry = map.add();
151 
152  // Read placements.
153  size_t num_place = des.get_int<size_t>();
154  for (size_t p = 0; p < num_place; ++p) {
155  // Get edge index, add the placement there.
156  size_t edge_idx = des.get_int<size_t>();
157  auto& edge = map.tree().edge_at( edge_idx );
158  auto& place = pqry.add_placement( edge );
159 
160  place.likelihood = des.get_float<double>();
161  place.like_weight_ratio = des.get_float<double>();
162  place.proximal_length = des.get_float<double>();
163  place.pendant_length = des.get_float<double>();
164  }
165 
166  // Read names.
167  size_t num_names = des.get_int<size_t>();
168  for (size_t n = 0; n < num_names; ++n) {
169  auto name = pqry.add_name( des.get_string() );
170  name.multiplicity = des.get_float<double>();
171  }
172  }
173 
174  if (!des.finished()) {
175  throw std::invalid_argument("Deserialization failed: File longer than expected.");
176  }
177 
178  return map;
179 }
180 
181 SampleSet SampleSerializer::load( std::vector<std::string> const& file_names )
182 {
183  SampleSet sample_set;
184  load( file_names, sample_set );
185  return sample_set;
186 }
187 
188 void SampleSerializer::load( std::vector<std::string> const& file_names, SampleSet& sample_set )
189 {
190  #if defined( GENESIS_OPENMP )
191 
192  // Make a vector of default-constructed Samples of the needed size.
193  // We do this so that the order of input jplace files is kept.
194  auto tmp = std::vector<Sample>( file_names.size() );
195 
196  // Parallel loading.
197  #pragma omp parallel for
198  for( size_t i = 0; i < file_names.size(); ++i ) {
199  tmp[ i ] = load( file_names[i] );
200  }
201 
202  // Move to target SampleSet.
203  for( size_t i = 0; i < file_names.size(); ++i ) {
204  auto const name = utils::file_filename( utils::file_basename( file_names[i] ) );
205  sample_set.add( std::move( tmp[i] ), name );
206  }
207 
208  #else
209 
210  for( auto const& fn : file_names ) {
211  auto const name = utils::file_filename( utils::file_basename(fn) );
212  sample_set.add( load( fn ), name );
213  }
214 
215  #endif
216 }
217 
218 } // namespace placement
219 } // namespace genesis
static unsigned char version
Version of this serialization helper. Is written to the stream and read again to make sure that diffe...
PlacementTree & tree()
Get the PlacementTree of this Sample.
Definition: sample.cpp:119
static Sample load(std::string const &file_name)
Loads a Sample from a binary file that was written by using save().
Definition: serializer.cpp:122
Tree read(std::shared_ptr< utils::BaseInputSource > source) const
Read a single Tree from an input source containing a Newick tree.
A pquery holds a set of PqueryPlacements and a set of PqueryNames.
Definition: pquery.hpp:82
PqueryName & add_name(std::string name="", double multiplicity=1.0)
Create a new PqueryName using the provided parameters, add it to the Pquery and return it...
Definition: pquery.cpp:181
std::string get_string()
Read a string from the stream, provided that its length is written preceding it, as done by put_strin...
std::string file_filename(std::string const &filename)
Remove extension if present.
Definition: fs.cpp:696
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
utils::Range< iterator_pqueries > pqueries()
Return a Range iterator to the Pqueries.
Definition: sample.cpp:259
void put_raw(char const *data, size_t n)
Write raw data, provided as a char array of length n, to the stream.
void put_float(const T v)
Write a floating point number to the stream.
std::string file_basename(std::string const &filename)
Remove directory name from file name if present.
Definition: fs.cpp:685
Header of Serializer and Deserializer class.
double multiplicity
Multiplicity of the name.
Definition: name.hpp:131
void add(Sample const &smp, std::string const &name="")
Add a Sample with a name to the SampleSet.
Definition: sample_set.hpp:94
PqueryPlacement & add_placement(PlacementTreeEdge &edge)
Create a new PqueryPlacement at a given PlacementTreeEdge, add it to the Pquery and return it...
Definition: pquery.cpp:92
Store a set of Samples with associated names.
Definition: sample_set.hpp:54
std::string get_raw_string(size_t n)
Read n bytes from the stream and return them as a string.
size_t size() const
Return the number of Pqueries that are stored in this Sample.
Definition: sample.cpp:133
Header of Serializer and Deserializer class.
TreeEdge & edge_at(size_t index)
Return the TreeEdge at a certain index.
Definition: tree/tree.hpp:238
void put_string(const std::string &v)
Write a string, preceded by its length, to the stream. Use get_string() to read it.
Manage a set of Pqueries along with the PlacementTree where the PqueryPlacements are placed on...
Definition: sample.hpp:68
void put_int(const T v)
Write an integer number to the stream.
std::shared_ptr< BaseOutputTarget > to_string(std::string &target_string)
Obtain an output target for writing to a string.
T get_int()
Read an integer number from the stream and return it.
Header of SampleSerializer class.
T get_float()
Read a floating point number from the stream and return it.
static void save(Sample const &map, std::string const &file_name)
Saves the Sample to a binary file that can later be read by using load().
Definition: serializer.cpp:66
Pquery & add()
Create an empty Pquery, add it to the Sample and return it.
Definition: sample.cpp:147
double likelihood
Total likelihood of the tree with this placement attached to it.
std::shared_ptr< BaseInputSource > from_string(std::string const &input_string)
Obtain an input source for reading from a string.