75 std::shared_ptr<utils::BaseInputSource> source
87 throw std::runtime_error(
"Json value is not a Json document." );
92 for(
auto const& kv : keymap ) {
93 auto const& key = kv.first;
95 key !=
"version" && key !=
"tree" && key !=
"placements" &&
96 key !=
"fields" && key !=
"metadata"
98 LOG_WARN <<
"Jplace document contains top-level key '" << key <<
"', which is not part "
99 <<
"of the jplace standard and hence ignored. This might indicate an issue "
100 <<
"with the data or the program which generated the document.";
105 process_jplace_version_( doc );
106 process_jplace_metadata_( doc, smp );
109 process_jplace_tree_( doc, smp );
110 auto const fields = process_jplace_fields_( doc );
111 process_jplace_placements_( doc, smp, fields );
117 std::vector<std::shared_ptr<utils::BaseInputSource>> sources
120 read( sources, target );
125 std::vector<std::shared_ptr<utils::BaseInputSource>> sources,
132 auto tmp = std::vector<Sample>( sources.size() );
135 #pragma omp parallel for
136 for(
size_t i = 0; i < sources.size(); ++i ) {
137 tmp[ i ] =
read( sources[i] );
141 for(
size_t i = 0; i < sources.size(); ++i ) {
145 if( ex ==
"jplace" ) {
149 target.
add( std::move( tmp[i] ), name );
164 auto v_it = doc.
find(
"version" );
165 if( v_it == doc.
end() ) {
171 assert( v_it != doc.
end() );
172 if( v_it->is_string() ) {
174 version = std::stoi( v_it->get_string() );
178 }
else if( v_it->is_number_unsigned() ) {
179 version = v_it->get_number_unsigned();
188 void JplaceReader::process_jplace_version_( utils::JsonDocument
const& doc )
const
190 auto const version = get_jplace_version_( doc );
193 if( version == -1 ) {
194 LOG_WARN <<
"Jplace document does not contain a valid version number at key 'version'. "
195 <<
"Now continuing to parse in the hope that it still works.";
200 if( version == 0 || version >= 4 ) {
201 LOG_WARN <<
"Jplace document has version " << version <<
" specified at the 'version' key. "
202 <<
"We can process versions 1-3 of the jplace standard, "
203 <<
"but now still continue to parse in the hope that it works.";
211 void JplaceReader::process_jplace_metadata_( utils::JsonDocument
const& doc, Sample& smp )
const
214 auto meta_it = doc.find(
"metadata" );
215 if( meta_it != doc.end() && meta_it->is_object() ) {
216 for(
auto it = meta_it->begin(); it != meta_it->end(); ++it ) {
219 if( it.value().is_string() ) {
220 smp.metadata[ it.key() ] = it.value().get_string();
222 LOG_WARN <<
"Jplace document contains meta-data at key '" << it.key()
223 <<
"' that is not stored as a string, and hence ignored.";
233 void JplaceReader::process_jplace_tree_( utils::JsonDocument
const& doc, Sample& smp )
const
238 auto const version = get_jplace_version_( doc );
239 auto reader = PlacementTreeNewickReader();
241 reader.get_edge_num_from_comments(
true );
245 auto tree_it = doc.find(
"tree" );
246 if( tree_it == doc.end() || ! tree_it->is_string() ) {
247 throw std::runtime_error(
248 "Jplace document does not contain a valid Newick tree at key 'tree'."
260 std::vector<std::string> JplaceReader::process_jplace_fields_( utils::JsonDocument
const& doc )
const
263 auto fields_it = doc.find(
"fields" );
264 if( fields_it == doc.end() || ! fields_it->is_array() ) {
265 throw std::runtime_error(
"Jplace document does not contain field names at key 'fields'." );
269 std::vector<std::string> fields;
270 for(
auto const& fields_val : *fields_it ) {
271 if( ! fields_val.is_string() ) {
272 throw std::runtime_error(
273 "Jplace document contains a value of type '" + fields_val.type_name()
274 +
"' instead of a string with a field name at key 'fields'."
279 std::string
const& field = fields_val.get_string();
280 if( field ==
"edge_num" || field ==
"likelihood" || field ==
"like_weight_ratio" ||
281 field ==
"distal_length" || field ==
"pendant_length" || field ==
"proximal_length"
285 if( std::any_of( fields.begin(), fields.end(), [&]( std::string
const& fn ){
288 throw std::runtime_error(
289 "Jplace document contains field name '" + field
290 +
"' more than once at key 'fields'."
294 field ==
"parsimony" || field ==
"post_prob" || field ==
"marginal_like" ||
295 field ==
"marginal_prob" || field ==
"classification" || field ==
"map_ratio" ||
296 field ==
"map_overlap"
299 LOG_WARN <<
"Jplace document contains a field name '" << field <<
"' at key 'fields', "
300 <<
"which is part of the jplace standard, but not used by any of our functions, "
301 <<
"and hence ignored.";
304 LOG_WARN <<
"Jplace document contains a field name '" << field <<
"' at key 'fields', "
305 <<
"which is not part of the jplace standard, and hence ignored.";
307 fields.push_back(field);
312 std::vector<std::string> required_fields = {
313 "edge_num",
"likelihood",
"like_weight_ratio",
"pendant_length"
315 for(
auto const& req : required_fields ) {
317 throw std::runtime_error(
318 "Jplace document does not contain necessary field '" + req +
"' at key 'fields'."
322 auto const contains_distal =
utils::contains( fields,
"distal_length" );
323 auto const contains_proximal =
utils::contains( fields,
"proximal_length" );
324 if( ! contains_distal && ! contains_proximal ) {
325 throw std::runtime_error(
326 "Jplace document does not contain one of the necessary fields 'distal_length' "
327 "or 'proximal_length' at key 'fields'."
330 if( contains_distal && contains_proximal ) {
331 LOG_WARN <<
"Jplace document contains both fields 'distal_length', and 'proximal_length'. "
332 <<
"Currently, only one value is used internally to represent both, which might "
333 <<
"lead to inconsistency if the sum of both is not equal to the branch length.";
335 assert( contains_distal || contains_proximal );
344 void JplaceReader::process_jplace_placements_(
345 utils::JsonDocument& doc,
347 std::vector<std::string>
const& fields
352 std::unordered_map<size_t, PlacementTreeEdge*> edge_num_map;
353 for(
auto& edge : smp.tree().edges() ) {
354 auto& edge_data = edge.data<PlacementEdgeData>();
355 if (edge_num_map.count( edge_data.edge_num()) > 0) {
356 throw std::runtime_error(
357 "Jplace document contains a tree where the edge_num tag '"
358 +
std::to_string( edge_data.edge_num() ) +
"' is used more than once, "
359 "and hence cannot be used to uniquely identify edges of the placements. "
360 "This indicates a severe issue with the program that created the jplace file."
363 edge_num_map.emplace( edge_data.edge_num(), &edge );
367 auto place_it = doc.find(
"placements" );
368 if( place_it == doc.end() || ! place_it->is_array() ) {
369 throw std::runtime_error(
370 "Jplace document does not contain pqueries at key 'placements'."
373 for(
auto& pqry_obj : *place_it ) {
374 if( ! pqry_obj.is_object() ) {
375 throw std::runtime_error(
376 "Jplace document contains a value of type '" + pqry_obj.type_name()
377 +
"' instead of an object with a pquery at key 'placements'."
382 auto pquery = Pquery();
383 process_jplace_placements_p_( pqry_obj, pquery, fields, edge_num_map );
384 process_jplace_placements_nm_( pqry_obj, pquery );
396 void JplaceReader::process_jplace_placements_p_(
397 utils::JsonDocument
const& pqry_obj,
399 std::vector<std::string>
const& fields,
400 std::unordered_map<size_t, PlacementTreeEdge*>
const& edge_num_map
405 auto invalid_number_checker = [
this] (
407 std::function<bool (
double,
double)> comparator,
409 std::string error_message
412 comparator( actual, expected ) && (
420 comparator( actual, expected ) && (
428 comparator( actual, expected ) && (
431 throw std::runtime_error( error_message );
436 assert( pqry_obj.is_object() );
437 auto const pqry_p_arr = pqry_obj.find(
"p" );
438 if( pqry_p_arr == pqry_obj.end() || ! pqry_p_arr->is_array() ) {
439 throw std::runtime_error(
440 "Jplace document contains a pquery at key 'placements' that does not contain an "
441 "array of placements at key 'p'."
444 if( pqry_p_arr->size() == 0 ) {
445 throw std::runtime_error(
446 "Jplace document contains a pquery at key 'placements' that does not contain any "
447 "placements at key 'p'."
452 for(
auto const& pqry_fields : *pqry_p_arr ) {
453 if( ! pqry_fields.is_array() ) {
454 throw std::runtime_error(
455 "Jplace document contains a pquery with invalid placement at key 'p'."
459 if( pqry_fields.size() != fields.size() ) {
460 throw std::runtime_error(
461 "Jplace document contains a placement fields array with different size "
462 "than the fields name array."
467 auto pqry_place = PqueryPlacement();
468 double distal_length = -1.0;
469 pqry_place.proximal_length = 0.0;
472 for(
size_t i = 0; i < pqry_fields.size(); ++i ) {
479 double* target =
nullptr;
480 if( fields[i] ==
"edge_num" ) {
481 size_t val_int = pqry_fields.at(i).get_number<
size_t>();
483 if( edge_num_map.count( val_int ) == 0 ) {
484 throw std::runtime_error(
485 "Jplace document contains a pquery where field 'edge_num' has value '"
486 +
std::to_string( val_int ) +
"', which does not correspond to any "
487 "edge_num in the given Newick tree of the document."
490 pqry_place.reset_edge( *edge_num_map.at( val_int ) );
492 }
else if( fields[i] ==
"likelihood" ) {
493 target = &pqry_place.likelihood;
494 }
else if( fields[i] ==
"like_weight_ratio" ) {
495 target = &pqry_place.like_weight_ratio;
496 }
else if( fields[i] ==
"distal_length" ) {
497 target = &distal_length;
498 }
else if( fields[i] ==
"proximal_length" ) {
499 target = &pqry_place.proximal_length;
500 }
else if( fields[i] ==
"pendant_length" ) {
501 target = &pqry_place.pendant_length;
508 if( !pqry_fields.at(i).is_number() ) {
509 throw std::runtime_error(
510 "Jplace document contains a pquery where the field " + fields[i]
511 +
" is of type '" + pqry_fields.at(i).type_name()
512 +
"' instead of a number."
515 *target = pqry_fields.at(i).get_number<
double>();
524 if( distal_length >= 0.0 && pqry_place.proximal_length == 0.0 ) {
525 auto const& edge_data = pqry_place.edge().data<PlacementEdgeData>();
526 pqry_place.proximal_length = edge_data.branch_length - distal_length;
530 invalid_number_checker(
531 pqry_place.like_weight_ratio,
534 "Invalid placement with like_weight_ratio < 0.0."
536 invalid_number_checker(
537 pqry_place.like_weight_ratio,
538 std::greater<double>(),
540 "Invalid placement with like_weight_ratio > 1.0."
542 invalid_number_checker(
543 pqry_place.pendant_length,
546 "Invalid placement with pendant_length < 0.0."
548 invalid_number_checker(
549 pqry_place.proximal_length,
552 "Invalid placement with proximal_length < 0.0."
554 invalid_number_checker(
555 pqry_place.proximal_length,
556 std::greater<double>(),
557 pqry_place.edge().data<PlacementEdgeData>().branch_length,
558 "Invalid placement with proximal_length > branch_length."
562 pquery.add_placement( pqry_place );
570 void JplaceReader::process_jplace_placements_nm_(
571 utils::JsonDocument
const& pqry_obj,
576 assert( pqry_obj.is_object() );
577 if( pqry_obj.count(
"n") > 0 && pqry_obj.count(
"nm") > 0 ) {
578 throw std::runtime_error(
579 "Jplace document contains a pquery with both an 'n' and an 'nm' key."
582 if( pqry_obj.count(
"n") == 0 && pqry_obj.count(
"nm") == 0 ) {
583 throw std::runtime_error(
584 "Jplace document contains a pquery with neither an 'n' nor an 'nm' key."
587 if( pqry_obj.count(
"m") > 0 && pqry_obj.count(
"n") == 0 ) {
588 throw std::runtime_error(
589 "Jplace document contains a pquery with key 'm' but without 'n' key."
594 if( pqry_obj.count(
"n") > 0 ) {
595 assert( pqry_obj.count(
"nm") == 0 );
601 if( pqry_obj.count(
"m") > 0 ) {
604 if( ! pqry_obj[
"m" ].is_number() ) {
605 throw std::runtime_error(
606 "Jplace document contains a pquery where key 'm' has a "
607 "value is not a valid number for the multiplicity."
614 if( pqry_obj[
"n" ].size() != 1 ) {
615 throw std::runtime_error(
616 "Jplace document contains a pquery with key 'n' that is an array of size greater "
617 "than one, while also having key 'm' for the multiplicity. This is not allowed."
622 m = pqry_obj[
"m" ].get_number<
double>();
627 if( pqry_obj[
"n" ].is_array() ) {
630 if( pqry_obj[
"n" ].size() == 0 ) {
631 throw std::runtime_error(
632 "Jplace document contains a pquery with key 'n' that does not contain any values."
638 assert(!( pqry_obj[
"n" ].size() > 1 && pqry_obj.count(
"m") > 0 ));
641 for(
auto const& pqry_n_val : pqry_obj[
"n" ] ) {
642 if( ! pqry_n_val.is_string() ) {
643 throw std::runtime_error(
644 "Jplace document contains a pquery where key 'n' has a non-string field."
649 pquery.add_name( pqry_n_val.get_string(), m );
652 }
else if( pqry_obj[
"n" ].is_string() ) {
653 pquery.add_name( pqry_obj[
"n" ].get_string(), m );
656 throw std::runtime_error(
657 "Jplace document contains a pquery with key 'n' that "
658 "is neither an array nor a string."
664 if( pqry_obj.count(
"nm") > 0 ) {
665 assert( pqry_obj.count(
"n") == 0 );
666 assert( pqry_obj.count(
"m") == 0 );
669 if ( ! pqry_obj[
"nm" ].is_array() ) {
670 throw std::runtime_error(
671 "Jplace document contains a pquery with key 'nm' that is not array."
674 if( pqry_obj[
"nm" ].size() == 0 ) {
675 throw std::runtime_error(
676 "Jplace document contains a pquery with key 'nm' that does not contain any values."
681 for(
auto const& pqry_nm_val : pqry_obj[
"nm" ] ) {
684 if( ! pqry_nm_val.is_array() ) {
685 throw std::runtime_error(
686 "Jplace document contains a pquery where key 'nm' has a non-array field."
689 if( pqry_nm_val.size() != 2 ) {
690 throw std::runtime_error(
691 std::string(
"Jplace document contains a pquery where key 'nm' has an " )
692 +
"array field with size != 2 (one for the name, one for the multiplicity)."
695 if( ! pqry_nm_val.at(0).is_string() ) {
696 throw std::runtime_error(
697 std::string(
"Jplace document contains a pquery where key 'nm' has an " )
698 +
"array whose first value is not a string for the name."
701 if( ! pqry_nm_val.at(1).is_number() ) {
702 throw std::runtime_error(
703 std::string(
"Jplace document contains a pquery where key 'nm' has an " )
704 +
"array whose second value is not a number for the multiplicity."
709 auto pqry_name = PqueryName();
710 pqry_name.name = pqry_nm_val.at(0).get_string();
711 pqry_name.multiplicity = pqry_nm_val.at(1).get_number<
double>();
712 if (pqry_name.multiplicity < 0.0) {
713 LOG_WARN <<
"Jplace document contains pquery with negative multiplicity at "
714 <<
"name '" << pqry_name.name <<
"'.";
718 pquery.add_name( pqry_name );