A toolkit for working with phylogenetic data.
v0.20.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
serializer.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2017 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
38 
#include <exception>
#include <stdexcept>

#ifdef GENESIS_OPENMP
#    include <omp.h>
#endif
44 
45 namespace genesis {
46 namespace placement {
47 
48 // =================================================================================================
49 // Version
50 // =================================================================================================
51 
// Format version of the binary Sample serialization. save() writes this value directly after
// the magic bytes, and load() rejects any file whose stored version differs from it.
56 unsigned char SampleSerializer::version = 1;
57 
58 // =================================================================================================
59 // Save
60 // =================================================================================================
61 
65 void SampleSerializer::save( Sample const& map, std::string const& file_name )
66 {
67  // Prepare.
68  utils::Serializer ser (file_name);
69  if (!ser) {
70  throw std::invalid_argument("Serialization failed.");
71  }
72 
73  // Write header.
74  char magic[] = "BPLACE\0\0";
75  ser.put_raw(magic, 8);
76  ser.put_int<unsigned char>(version);
77 
78  // Write tree.
79  // TODO if there is a tree serialization in the future, this one could be used here, and in
80  // addition to edge numbers, the edge indices can be stored, so that deserialization is easier.
81  auto nw = PlacementTreeNewickWriter();
82  nw.enable_names(true);
83  nw.enable_branch_lengths(true);
84  ser.put_string(nw.to_string(map.tree()));
85 
86  // Write pqueries.
87  ser.put_int(map.size());
88  for (auto& pqry : map.pqueries()) {
89 
90  // Write placements.
91  ser.put_int(pqry.placement_size());
92  for( auto const& place : pqry.placements() ) {
93  // We set the edge index instead of edge num. This is faster, simpler to resorte, and
94  // consinstend with Pquery.add_placement() parameters.
95  ser.put_int (place.edge().index());
96 
97  ser.put_float (place.likelihood);
98  ser.put_float (place.like_weight_ratio);
99  ser.put_float (place.proximal_length);
100  ser.put_float (place.pendant_length);
101  ser.put_int (place.parsimony);
102  }
103 
104  // Write names.
105  ser.put_int(pqry.name_size());
106  for( auto const& name : pqry.names() ) {
107  ser.put_string (name.name);
108  ser.put_float (name.multiplicity);
109  }
110  }
111 }
112 
113 // =================================================================================================
114 // Load
115 // =================================================================================================
116 
120 Sample SampleSerializer::load( std::string const& file_name )
121 {
122  // Create returned object.
123  Sample map;
124 
125  // Prepare, check stream status.
126  utils::Deserializer des( file_name );
127  if( ! des ) {
128  throw std::invalid_argument( "Deserialization failed: Cannot open file." );
129  }
130 
131  // Read and check header.
132  std::string magic = des.get_raw_string(8);
133  if (strncmp (magic.c_str(), "BPLACE\0\0", 8) != 0) {
134  throw std::invalid_argument("Wrong file format: \"" + magic + "\".");
135  }
136  auto ver = des.get_int<unsigned char>();
137  if (ver != version) {
138  throw std::invalid_argument("Wrong serialization version: " + std::to_string(ver));
139  }
140 
141  // Read and check tree.
142  auto tree_string = des.get_string();
143  map.tree() = PlacementTreeNewickReader().from_string( tree_string );
144 
145  // Read pqueries.
146  size_t num_pqueries = des.get_int<size_t>();
147  for (size_t i = 0; i < num_pqueries; ++i) {
148  Pquery& pqry = map.add();
149 
150  // Read placements.
151  size_t num_place = des.get_int<size_t>();
152  for (size_t p = 0; p < num_place; ++p) {
153  // Get edge index, add the placement there.
154  size_t edge_idx = des.get_int<size_t>();
155  auto& edge = map.tree().edge_at( edge_idx );
156  auto& place = pqry.add_placement( edge );
157 
158  place.likelihood = des.get_float<double>();
159  place.like_weight_ratio = des.get_float<double>();
160  place.proximal_length = des.get_float<double>();
161  place.pendant_length = des.get_float<double>();
162  place.parsimony = des.get_int<int>();
163  }
164 
165  // Read names.
166  size_t num_names = des.get_int<size_t>();
167  for (size_t n = 0; n < num_names; ++n) {
168  auto name = pqry.add_name( des.get_string() );
169  name.multiplicity = des.get_float<double>();
170  }
171  }
172 
173  if (!des.finished()) {
174  throw std::invalid_argument("Deserialization failed: File longer than expected.");
175  }
176 
177  return map;
178 }
179 
180 SampleSet SampleSerializer::load( std::vector<std::string> const& file_names )
181 {
182  SampleSet sample_set;
183  load( file_names, sample_set );
184  return sample_set;
185 }
186 
187 void SampleSerializer::load( std::vector<std::string> const& file_names, SampleSet& sample_set )
188 {
189  #if defined( GENESIS_OPENMP )
190 
191  // Make a vector of default-constructed Samples of the needed size.
192  // We do this so that the order of input jplace files is kept.
193  auto tmp = std::vector<Sample>( file_names.size() );
194 
195  // Parallel loading.
196  #pragma omp parallel for
197  for( size_t i = 0; i < file_names.size(); ++i ) {
198  tmp[ i ] = load( file_names[i] );
199  }
200 
201  // Move to target SampleSet.
202  for( size_t i = 0; i < file_names.size(); ++i ) {
203  auto const name = utils::file_filename( utils::file_basename( file_names[i] ) );
204  sample_set.add( std::move( tmp[i] ), name );
205  }
206 
207  #else
208 
209  for( auto const& fn : file_names ) {
210  auto const name = utils::file_filename( utils::file_basename(fn) );
211  sample_set.add( load( fn ), name );
212  }
213 
214  #endif
215 }
216 
217 } // namespace placement
218 } // namespace genesis
size_t size() const
Return the number of Pqueries that are stored in this Sample.
Definition: sample.cpp:133
static unsigned char version
Version of this serialization helper. Is written to the stream and read again to make sure that diffe...
PlacementTree & tree()
Get the PlacementTree of this Sample.
Definition: sample.cpp:119
static Sample load(std::string const &file_name)
Loads a Sample from a binary file that was written by using save().
Definition: serializer.cpp:120
std::string file_filename(std::string const &filename)
Remove extension if present.
Definition: fs.cpp:296
A pquery holds a set of PqueryPlacements and a set of PqueryNames.
Definition: pquery.hpp:82
PqueryName & add_name(std::string name="", double multiplicity=1.0)
Create a new PqueryName using the provided parameters, add it to the Pquery and return it...
Definition: pquery.cpp:181
std::string get_string()
Read a string from the stream, provided that its length is written preceding it, as done by put_string().
Tree from_string(std::string const &tree_string) const
Read a Tree from a string containing a Newick tree.
std::string to_string(T const &v)
Return a string representation of a given value.
Definition: string.hpp:381
utils::Range< iterator_pqueries > pqueries()
Return a Range iterator to the Pqueries .
Definition: sample.cpp:259
void put_raw(char const *data, size_t n)
Write raw data, provided as a char array of length n, to the stream.
void put_float(const T v)
Write a floating point number to the stream.
Header of Serializer and Deserializer class.
double multiplicity
Multiplicity of the name.
Definition: name.hpp:131
PqueryPlacement & add_placement(PlacementTreeEdge &edge)
Create a new PqueryPlacement at a given PlacementTreeEdge, add it to the Pquery and return it...
Definition: pquery.cpp:92
Store a set of Samples with associated names.
Definition: sample_set.hpp:52
std::string get_raw_string(size_t n)
Read n bytes from the stream and return them as a string.
std::string file_basename(std::string const &filename)
Remove directory name from file name if present.
Definition: fs.cpp:285
void put_string(const std::string &v)
Write a string, preceded by its length, to the stream. Use get_string() to read it.
void add(Sample const &smp)
Add a Sample to the SampleSet.
Definition: sample_set.hpp:113
Manage a set of Pqueries along with the PlacementTree where the PqueryPlacements are placed on...
Definition: sample.hpp:68
TreeEdge & edge_at(size_t index)
Return the TreeEdge at a certain index.
Definition: tree/tree.cpp:324
void put_int(const T v)
Write an integer number to the stream.
T get_int()
Read an integer number from the stream and return it.
Header of SampleSerializer class.
T get_float()
Read a floating point number from the stream and return it.
static void save(Sample const &map, std::string const &file_name)
Saves the Sample to a binary file that can later be read by using load().
Definition: serializer.cpp:65
Pquery & add()
Create an empty Pquery, add it to the Sample and return it.
Definition: sample.cpp:147
double likelihood
Total likelihood of the tree with this placement attached to it.