A toolkit for working with phylogenetic data.
v0.19.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
jplace_reader.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2017 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
37 
50 
51 #include <algorithm>
52 #include <assert.h>
53 #include <cctype>
54 #include <functional>
55 #include <sstream>
56 #include <stdexcept>
57 #include <string>
58 #include <utility>
59 #include <vector>
60 
61 #ifdef GENESIS_OPENMP
62 # include <omp.h>
63 #endif
64 
65 namespace genesis {
66 namespace placement {
67 
68 // =================================================================================================
69 // Reading
70 // =================================================================================================
71 
72 // -------------------------------------------------------------------------
73 // Reading from Stream
74 // -------------------------------------------------------------------------
75 
76 Sample JplaceReader::from_stream( std::istream& is ) const
77 {
78  auto doc = utils::JsonReader().from_stream( is );
79  return from_document( doc );
80 }
81 
82 // -------------------------------------------------------------------------
83 // Reading from File
84 // -------------------------------------------------------------------------
85 
86 Sample JplaceReader::from_file( std::string const& fn ) const
87 {
88  auto doc = utils::JsonReader().from_file( fn );
89  return from_document( doc );
90 }
91 
92 // -------------------------------------------------------------------------
93 // Reading from String
94 // -------------------------------------------------------------------------
95 
96 Sample JplaceReader::from_string( std::string const& jplace ) const
97 {
98  auto doc = utils::JsonReader().from_string( jplace );
99  return from_document( doc );
100 }
101 
102 // -------------------------------------------------------------------------
103 // Reading from Document
104 // -------------------------------------------------------------------------
105 
107 {
108  Sample smp;
109 
110  if( ! doc.is_object() ) {
111  throw std::runtime_error( "Json value is not a Json document." );
112  }
113 
114  process_json_version( doc );
115  process_json_metadata( doc, smp );
116 
117  process_json_tree( doc, smp );
118  auto fields = process_json_fields( doc );
119  process_json_placements( doc, smp, fields);
120 
121  return smp;
122 }
123 
124 // -------------------------------------------------------------------------
125 // Reading from Files
126 // -------------------------------------------------------------------------
127 
128 SampleSet JplaceReader::from_files (const std::vector<std::string>& fns ) const
129 {
130  SampleSet set;
131  from_files( fns, set );
132  return set;
133 }
134 
135 // -------------------------------------------------------------------------
136 // Reading from Strings
137 // -------------------------------------------------------------------------
138 
139 SampleSet JplaceReader::from_strings (const std::vector<std::string>& jps ) const
140 {
141  SampleSet set;
142  from_files( jps, set );
143  return set;
144 }
145 
146 // -------------------------------------------------------------------------
147 // Reading from Files
148 // -------------------------------------------------------------------------
149 
150 void JplaceReader::from_files( std::vector<std::string> const& fns, SampleSet& set ) const
151 {
152  #if defined( GENESIS_OPENMP )
153 
154  // Make a vector of default-constructed Samples of the needed size.
155  // We do this so that the order of input jplace files is kept.
156  auto tmp = std::vector<Sample>( fns.size() );
157 
158  // Parallel parsing.
159  #pragma omp parallel for
160  for( size_t i = 0; i < fns.size(); ++i ) {
161  tmp[ i ] = from_file( fns[i] );
162  }
163 
164  // Move to target SampleSet.
165  for( size_t i = 0; i < fns.size(); ++i ) {
166  auto const name = utils::file_filename( utils::file_basename( fns[i] ) );
167  set.add( std::move( tmp[i] ), name );
168  }
169 
170  #else
171 
172  for( auto const& fn : fns ) {
173  auto const name = utils::file_filename( utils::file_basename(fn) );
174  set.add( from_file( fn ), name );
175  }
176 
177  #endif
178 }
179 
180 // -------------------------------------------------------------------------
181 // Reading from Strings
182 // -------------------------------------------------------------------------
183 
184 void JplaceReader::from_strings( std::vector<std::string> const& jps, SampleSet& set ) const
185 {
186  #if defined( GENESIS_OPENMP )
187 
188  // Make a vector of default-constructed Samples of the needed size.
189  // We do this so that the order of input jplace files is kept.
190  auto tmp = std::vector<Sample>( jps.size() );
191 
192  // Parallel parsing.
193  #pragma omp parallel for
194  for( size_t i = 0; i < jps.size(); ++i ) {
195  tmp[ i ] = from_string( jps[i] );
196  }
197 
198  // Move to target SampleSet.
199  size_t cnt = set.size();
200  for( size_t i = 0; i < jps.size(); ++i ) {
201  auto const name = std::string("jplace_") + std::to_string(cnt);
202  set.add( std::move( tmp[i] ), name );
203  ++cnt;
204  }
205 
206  #else
207 
208  size_t cnt = set.size();
209  for( auto const& jplace : jps ) {
210  set.add( from_string( jplace ), std::string("jplace_") + std::to_string(cnt) );
211  ++cnt;
212  }
213 
214  #endif
215 }
216 
217 // =================================================================================================
218 // Processing
219 // =================================================================================================
220 
221 // -------------------------------------------------------------------------
222 // Processing Version
223 // -------------------------------------------------------------------------
224 
225 void JplaceReader::process_json_version( utils::JsonDocument const& doc ) const
226 {
227  // Check if there is a version key.
228  auto v_it = doc.find( "version" );
229  if( v_it == doc.end() || !( v_it->is_string() || v_it->is_number() ) ) {
230  LOG_WARN << "Jplace document does not contain a valid version number at key 'version'. "
231  << "Now continuing to parse in the hope that it still works.";
232  return;
233  }
234 
235  // The key can be a number (3) or a string ("3"), check both.
236  std::string doc_version;
237  if( v_it->is_number() ) {
238  if( v_it->is_number_float() ) {
239  doc_version = std::to_string( v_it->get_number_float() );
240  } else {
241  doc_version = std::to_string( v_it->get_number_unsigned() );
242  }
243  }
244  if( v_it->is_string() ) {
245  doc_version = v_it->get_string();
246  }
247 
248  // Check if the version is correct.
249  if( ! check_version( doc_version )) {
250  LOG_WARN << "Jplace document has version '" << doc_version << "', however this parser "
251  << "is written for version " << version() << " of the Jplace format. "
252  << "Now continuing to parse in the hope that it still works.";
253  }
254 }
255 
256 // -------------------------------------------------------------------------
257 // Processing Metadata
258 // -------------------------------------------------------------------------
259 
260 void JplaceReader::process_json_metadata( utils::JsonDocument const& doc, Sample& smp ) const
261 {
262  // Check if there is metadata.
263  auto meta_it = doc.find( "metadata" );
264  if( meta_it != doc.end() && meta_it->is_object() ) {
265  for( auto it = meta_it->begin(); it != meta_it->end(); ++it ) {
266 
267  // Only use metadata that is a string. Everything else is ignored.
268  if( it.value().is_string() ) {
269  smp.metadata[ it.key() ] = it.value().get_string();
270  }
271  }
272  }
273 }
274 
275 // -------------------------------------------------------------------------
276 // Processing Tree
277 // -------------------------------------------------------------------------
278 
279 void JplaceReader::process_json_tree( utils::JsonDocument const& doc, Sample& smp ) const
280 {
281  // Find and process the reference tree.
282  auto tree_it = doc.find( "tree" );
283  if( tree_it == doc.end() || ! tree_it->is_string()
284  ) {
285  throw std::runtime_error(
286  "Jplace document does not contain a valid Newick tree at key 'tree'."
287  );
288  }
289  smp.tree() = PlacementTreeNewickReader().from_string( tree_it->get_string() );
290  if( ! has_correct_edge_nums( smp.tree() )) {
291  LOG_WARN << "Jplace document has a Newick tree where the edge_num tags are non standard. "
292  << "They are expected to be assigned in ascending order via postorder traversal. "
293  << "Now continuing to parse, as we can cope with this.";
294  }
295 }
296 
297 // -------------------------------------------------------------------------
298 // Processing Fields
299 // -------------------------------------------------------------------------
300 
301 std::vector<std::string> JplaceReader::process_json_fields( utils::JsonDocument const& doc ) const
302 {
303  // get the field names and store them in array fields
304  auto fields_it = doc.find( "fields" );
305 
306  if( fields_it == doc.end() || ! fields_it->is_array() ) {
307  throw std::runtime_error( "Jplace document does not contain field names at key 'fields'." );
308  }
309  std::vector<std::string> fields;
310  bool has_edge_num = false;
311  for( auto const& fields_val : *fields_it ) {
312  if( ! fields_val.is_string() ) {
313  throw std::runtime_error(
314  "Jplace document contains a value of type '" + fields_val.type_name()
315  + "' instead of a string with a field name at key 'fields'."
316  );
317  }
318 
319  // check field validity
320  std::string const& field = fields_val.get_string();
321  if (field == "edge_num" || field == "likelihood" || field == "like_weight_ratio" ||
322  field == "distal_length" || field == "pendant_length" || field == "proximal_length" ||
323  field == "parsimony"
324  ) {
325  for (std::string fn : fields) {
326  if (fn == field) {
327  throw std::runtime_error(
328  "Jplace document contains field name '" + field
329  + "' more than once at key 'fields'."
330  );
331  }
332  }
333  } else {
334  LOG_WARN << "Jplace document contains a field name '" << field << "' "
335  << "at key 'fields', which is not used by this parser and thus ignored.";
336  }
337  fields.push_back(field);
338  has_edge_num |= (field == "edge_num");
339  }
340  if (!has_edge_num) {
341  throw std::runtime_error(
342  "Jplace document does not contain necessary field 'edge_num' at key 'fields'."
343  );
344  }
345  if (
346  std::end(fields) != std::find(std::begin(fields), std::end(fields), "distal_length") &&
347  std::end(fields) != std::find(std::begin(fields), std::end(fields), "proximal_length")
348  ) {
349  LOG_WARN << "Jplace document contains both fields 'distal_length', and 'proximal_length'. "
350  << "Currently, only one value is used internally to represent both, which might "
351  << "lead to inconsistency if the sum of both is not equal to the branch length.";
352  }
353 
354  return fields;
355 }
356 
357 // -------------------------------------------------------------------------
358 // Processing Placements
359 // -------------------------------------------------------------------------
360 
361 void JplaceReader::process_json_placements(
362  utils::JsonDocument& doc,
363  Sample& smp,
364  std::vector<std::string> fields
365 ) const {
366  // create a map from edge nums to the actual edge pointers, for later use when processing
367  // the pqueries. we do not use Sample::EdgeNumMap() here, because we need to do extra
368  // checking for validity first!
369  std::unordered_map<size_t, PlacementTreeEdge*> edge_num_map;
370  for (
371  PlacementTree::ConstIteratorEdges it = smp.tree().begin_edges();
372  it != smp.tree().end_edges();
373  ++it
374  ) {
375  auto& edge = *it;
376  auto& edge_data = edge->data<PlacementEdgeData>();
377  if (edge_num_map.count( edge_data.edge_num()) > 0) {
378  throw std::runtime_error(
379  "Jplace document contains a tree where the edge_num tag '"
380  + std::to_string( edge_data.edge_num() ) + "' is used more than once."
381  );
382  }
383  edge_num_map.emplace( edge_data.edge_num(), edge.get() );
384  }
385 
386  // Find and process the pqueries.
387  auto place_it = doc.find( "placements" );
388  if( place_it == doc.end() || ! place_it->is_array() ) {
389  throw std::runtime_error(
390  "Jplace document does not contain pqueries at key 'placements'."
391  );
392  }
393  for( auto& pqry_obj : *place_it ) {
394  if( ! pqry_obj.is_object() ) {
395  throw std::runtime_error(
396  "Jplace document contains a value of type '" + pqry_obj.type_name()
397  + "' instead of an object with a pquery at key 'placements'."
398  );
399  }
400  auto pqry_p_arr = pqry_obj.find( "p" );
401  if( pqry_p_arr == pqry_obj.end() || ! pqry_p_arr->is_array() ) {
402  throw std::runtime_error(
403  "Jplace document contains a pquery at key 'placements' that does not contain an "
404  + std::string( "array of placements at sub-key 'p'." )
405  );
406  }
407 
408  // Create new pquery.
409  auto pqry = Pquery();
410 
411  // Process the placements and store them in the pquery.
412  for( auto const& pqry_fields : *pqry_p_arr ) {
413  if( ! pqry_fields.is_array() ) {
414  throw std::runtime_error(
415  "Jplace document contains a pquery with invalid placement at key 'p'."
416  );
417  }
418 
419  if (pqry_fields.size() != fields.size()) {
420  throw std::runtime_error(
421  "Jplace document contains a placement fields array with different size "
422  + std::string( "than the fields name array." )
423  );
424  }
425 
426  // Process all fields of the placement.
427  auto pqry_place = PqueryPlacement();
428  double distal_length = -1.0;
429  for (size_t i = 0; i < pqry_fields.size(); ++i) {
430  // Up to version 3 of the jplace specification, the p-fields in a jplace document
431  // only contain numbers (float or int), so we can do this check here once for all
432  // fields, instead of repetition for every field. If in the future there are fields
433  // with non-number type, this check has to go into the single field assignments.
434  if (!pqry_fields.at(i).is_number()) {
435  throw std::runtime_error(
436  "Jplace document contains pquery where field " + fields[i]
437  + " is of type '" + pqry_fields.at(i).type_name()
438  + "' instead of a number."
439  );
440  }
441 
442  // Switch on the field name to set the correct value.
443  double pqry_place_val = pqry_fields.at(i).get_number<double>();
444  if (fields[i] == "edge_num") {
445  size_t val_int = pqry_fields.at(i).get_number<size_t>();
446 
447  if (edge_num_map.count( val_int ) == 0) {
448  throw std::runtime_error(
449  "Jplace document contains a pquery where field 'edge_num' has value '"
450  + std::to_string( val_int ) + "', which is not marked in the "
451  + "given tree as an edge_num."
452  );
453  }
454  pqry_place.reset_edge( *edge_num_map.at( val_int ) );
455 
456  } else if (fields[i] == "likelihood") {
457  pqry_place.likelihood = pqry_place_val;
458 
459  } else if (fields[i] == "like_weight_ratio") {
460  pqry_place.like_weight_ratio = pqry_place_val;
461 
462  } else if (fields[i] == "distal_length") {
463  distal_length = pqry_place_val;
464 
465  } else if (fields[i] == "proximal_length") {
466  pqry_place.proximal_length = pqry_place_val;
467 
468  } else if (fields[i] == "pendant_length") {
469  pqry_place.pendant_length = pqry_place_val;
470 
471  } else if (fields[i] == "parsimony") {
472  pqry_place.parsimony = pqry_place_val;
473  }
474  }
475 
476  // The jplace format uses distal length, but we use proximal, so we need to convert here.
477  // We have to do this here (unlike all the other values, which are set in the loop
478  // above), because it may happen that the edge_num field was not yet set while
479  // processing. Also, we only set it if it was actually available in the fields and not
480  // overwritten by the (more appropriate) field for the proximal length.
481  if (distal_length >= 0.0 && pqry_place.proximal_length == 0.0) {
482  auto const& edge_data = pqry_place.edge().data<PlacementEdgeData>();
483  pqry_place.proximal_length = edge_data.branch_length - distal_length;
484  }
485 
486  auto invalid_number_checker = [this] (
487  double& actual,
488  std::function<bool (double, double)> comparator,
489  double expected,
490  std::string error_message
491  ) {
492  if(
493  comparator( actual, expected ) && (
496  )) {
497  LOG_WARN << error_message;
498  }
499 
500  if(
501  comparator( actual, expected ) && (
504  )) {
505  actual = expected;
506  }
507 
508  if(
509  comparator( actual, expected ) && (
511  )) {
512  throw std::runtime_error( error_message );
513  }
514  };
515 
516  // Check validity of placement values.
517  invalid_number_checker(
518  pqry_place.like_weight_ratio,
519  std::less<double>(),
520  0.0,
521  "Invalid placement with like_weight_ratio < 0.0."
522  );
523  invalid_number_checker(
524  pqry_place.like_weight_ratio,
525  std::greater<double>(),
526  1.0,
527  "Invalid placement with like_weight_ratio > 1.0."
528  );
529  invalid_number_checker(
530  pqry_place.pendant_length,
531  std::less<double>(),
532  0.0,
533  "Invalid placement with pendant_length < 0.0."
534  );
535  invalid_number_checker(
536  pqry_place.proximal_length,
537  std::less<double>(),
538  0.0,
539  "Invalid placement with proximal_length < 0.0."
540  );
541  invalid_number_checker(
542  pqry_place.proximal_length,
543  std::greater<double>(),
544  pqry_place.edge().data<PlacementEdgeData>().branch_length,
545  "Invalid placement with proximal_length > branch_length."
546  );
547 
548  // Add the placement to the query and vice versa.
549  pqry.add_placement( pqry_place );
550  }
551 
552  // Check name/named multiplicity validity.
553  if( pqry_obj.count("n") > 0 && pqry_obj.count("nm") > 0 ) {
554  throw std::runtime_error(
555  "Jplace document contains a pquery with both an 'n' and an 'nm' key."
556  );
557  }
558  if( pqry_obj.count("n") == 0 && pqry_obj.count("nm") == 0 ) {
559  throw std::runtime_error(
560  "Jplace document contains a pquery with neither an 'n' nor an 'nm' key."
561  );
562  }
563 
564  // Process names.
565  if( pqry_obj.count("n") > 0 ) {
566  if( ! pqry_obj[ "n" ].is_array() ) {
567  throw std::runtime_error(
568  "Jplace document contains a pquery with key 'n' that is not array."
569  );
570  }
571 
572  for( auto const& pqry_n_val : pqry_obj[ "n" ] ) {
573  if( ! pqry_n_val.is_string() ) {
574  throw std::runtime_error(
575  "Jplace document contains a pquery where key 'n' has a non-string field."
576  );
577  }
578 
579  // Add the name with a default multiplicity.
580  pqry.add_name( pqry_n_val.get_string() );
581  }
582  }
583 
584  // Process named multiplicities.
585  if( pqry_obj.count("nm") > 0 ) {
586  if ( ! pqry_obj[ "nm" ].is_array() ) {
587  throw std::runtime_error(
588  "Jplace document contains a pquery with key 'nm' that is not array."
589  );
590  }
591 
592  for( auto const& pqry_nm_val : pqry_obj[ "nm" ] ) {
593  if( ! pqry_nm_val.is_array() ) {
594  throw std::runtime_error(
595  "Jplace document contains a pquery where key 'nm' has a non-array field."
596  );
597  }
598 
599  if( pqry_nm_val.size() != 2 ) {
600  throw std::runtime_error(
601  std::string( "Jplace document contains a pquery where key 'nm' has an " )
602  + "array field with size != 2 (one for the name, one for the multiplicity)."
603  );
604  }
605  if( ! pqry_nm_val.at(0).is_string() ) {
606  throw std::runtime_error(
607  std::string( "Jplace document contains a pquery where key 'nm' has an " )
608  + "array whose first value is not a string for the name."
609  );
610  }
611  if( ! pqry_nm_val.at(1).is_number() ) {
612  throw std::runtime_error(
613  std::string( "Jplace document contains a pquery where key 'nm' has an " )
614  + "array whose second value is not a number for the multiplicity."
615  );
616  }
617 
618  auto pqry_name = PqueryName();
619  pqry_name.name = pqry_nm_val.at(0).get_string();
620  pqry_name.multiplicity = pqry_nm_val.at(1).get_number<double>();
621  if (pqry_name.multiplicity < 0.0) {
622  LOG_WARN << "Jplace document contains pquery with negative multiplicity at "
623  << "name '" << pqry_name.name << "'.";
624  }
625 
626  pqry.add_name( pqry_name );
627  }
628  }
629 
630  // Finally, add the pquery to the smp object.
631  smp.add( pqry );
632  pqry_obj.clear();
633  }
634 }
635 
636 // =================================================================================================
637 // Jplace Version
638 // =================================================================================================
639 
640 std::string JplaceReader::version ()
641 {
642  return "3";
643 }
644 
645 bool JplaceReader::check_version ( size_t version )
646 {
647  return version == 2 || version == 3;
648 }
649 
650 bool JplaceReader::check_version ( std::string const& version )
651 {
652  auto v = utils::trim( version );
653  return v == "2" || v == "3";
654 }
655 
656 // =================================================================================================
657 // Properties
658 // =================================================================================================
659 
661 {
662  return invalid_number_behaviour_;
663 }
664 
666 {
667  invalid_number_behaviour_ = val;
668  return *this;
669 }
670 
671 } // namespace placement
672 } // namespace genesis
SampleSet from_files(std::vector< std::string > const &fns) const
Read a list of files and parse them as a Jplace document into a SampleSet.
typename ContainerType< TreeEdge >::const_iterator ConstIteratorEdges
Definition: tree/tree.hpp:132
JsonDocument from_file(const std::string &filename) const
Take a JSON document file path and parse its contents into a JsonDocument.
std::string trim(std::string const &s, std::string const &delimiters)
Return a copy of the input string, with trimmed white spaces.
Definition: string.cpp:311
Read Json data into a JsonDocument.
Sample from_document(utils::JsonDocument &doc) const
Take a JsonDocument and parse it as a Jplace document into a Sample.
std::string file_filename(std::string const &filename)
Remove extension if present.
Definition: fs.cpp:296
static bool check_version(size_t version)
Checks whether the version of the jplace format works with this parser.
InvalidNumberBehaviour invalid_number_behaviour() const
Return the currenlty set InvalidNumberBehaviour.
#define LOG_WARN
Log a warning. See genesis::utils::LoggingLevel.
Definition: logging.hpp:95
static std::string version()
Returns the version number that this class is written for. Currently, this is "3".
Throw an std::runtime_error when encountering an invalid number.
std::string to_string(T const &v)
Return a string representation of a given value.
Definition: string.hpp:373
Provides some valuable additions to STD.
JsonDocument from_string(const std::string &json) const
Take a string containing a JSON document and parse its contents into a JsonDocument.
Store a set of Samples with associated names.
Definition: sample_set.hpp:52
Sample from_file(std::string const &fn) const
Read a file and parse it as a Jplace document into a Sample.
std::string file_basename(std::string const &filename)
Remove directory name from file name if present.
Definition: fs.cpp:285
Provides some commonly used string utility functions.
Provides functions for accessing the file system.
iterator find(typename JsonDocument::ObjectType::key_type key)
Find an element in a JSON object.
Provides easy and fast logging functionality.
size_t size() const
Return the size of the SampleSet, i.e., the number of Samples.
Definition: sample_set.hpp:234
iterator end()
Return an iterator to one past the last element.
void add(Sample const &smp)
Add a Sample to the SampleSet.
Definition: sample_set.hpp:113
Store a Json value of any kind.
Manage a set of Pqueries along with the PlacementTree where the PqueryPlacements are placed on...
Definition: sample.hpp:68
Sample from_string(std::string const &jplace) const
Parse a string as a Jplace document into a Sample.
Correct invalid numbers to the next best correct number.
JsonDocument from_stream(std::istream &input_stream) const
Read from a stream containing a JSON document and parse its contents into a JsonDocument.
bool has_correct_edge_nums(PlacementTree const &tree)
Verify that the tree has correctly set edge nums.
bool is_object() const
Return true iff the JSON value is an object.
Sample from_stream(std::istream &is) const
Read jplace data from a stream into a Sample.
SampleSet from_strings(std::vector< std::string > const &jps) const
Parse a list of strings as a Jplace document into a SampleSet.
InvalidNumberBehaviour
Enum to determine the behaviour of the reader in case of invalid numbers.