37 #include <unordered_set>
40 namespace population {
// Fragment of a routine that combines a list of sample names and a list of filter names
// into a per-sample boolean mask (`result`).
// NOTE(review): the signature line, the declaration of `inverse_filter`, the return statement,
// and the closing parentheses/braces are not visible in this excerpt.
47 std::vector<std::string>
const& sample_names,
48 std::vector<std::string>
const& names_filter,
// Collect the filter names into a set, rejecting any name that is listed more than once.
53 std::unordered_set<std::string> filter_set;
54 for(
auto const& name : names_filter ) {
55 if( filter_set.count( name ) != 0 ) {
56 throw std::invalid_argument(
57 "Cannot apply sample name filter, as filter name \"" + name +
"\" appears "
58 "multiple times in the list of names used for filtering."
61 filter_set.insert( name );
// Every filter name was inserted exactly once, so the set and the list have the same size.
63 assert( filter_set.size() == names_filter.size() );
// Track the sample names seen so far, and start with an all-false mask
// that has one entry per sample name.
69 std::unordered_set<std::string> names_set;
70 auto result = std::vector<bool>( sample_names.size(),
false );
71 for(
size_t i = 0; i < sample_names.size(); ++i ) {
72 auto const& name = sample_names[i];
// A sample name that was already seen before is an error.
75 if( names_set.count( name ) > 0 ) {
76 throw std::invalid_argument(
77 "Cannot apply sample name filter, as sample name \"" + name +
"\" appears "
78 "multiple times in the sample names."
81 names_set.insert( name );
// The mask entry is whether the sample name is in the filter list,
// flipped via XOR when `inverse_filter` is set.
84 bool const found = ( filter_set.count( name ) > 0 );
85 result[i] = ( found ^ inverse_filter );
// Remove the name from the filter set, so that whatever remains after the loop
// is a filter name that never matched any sample name.
87 filter_set.erase( name );
// Left-over filter names do not occur in the sample names; report one of them as an example.
92 if( filter_set.size() > 0 ) {
93 throw std::invalid_argument(
94 "Cannot apply sample name filter, as the list of names to filter contains names that "
95 "do not appear in the sample names, such as \"" + *filter_set.begin() +
"\"."
// Fragment of a routine that builds a Variant-modifying lambda from a per-sample boolean mask.
// NOTE(review): the signature line, parts of the error message, and the closing braces
// are not visible in this excerpt.
104 std::vector<bool>
const& sample_filter
// Loop over the mask to determine `pop_count`; presumably it is incremented for each
// `true` entry (the increment itself is not visible here) - TODO confirm.
108 size_t pop_count = 0;
109 for(
size_t i = 0; i < sample_filter.size(); ++i ) {
110 if( sample_filter[i] ) {
114 assert( pop_count <= sample_filter.size() );
// The returned lambda captures the mask and the count by value,
// and receives the Variant by non-const reference.
117 return [ sample_filter, pop_count ](
Variant& variant ){
// The Variant needs to have exactly as many samples as the mask has entries.
118 if( variant.samples.size() != sample_filter.size() ) {
119 throw std::runtime_error(
120 "Invalid sample filter, which filters a list of " +
121 std::to_string( sample_filter.size() ) +
" samples, while the Variant has " +
// Build the reduced sample list by moving the selected entries out of the Variant,
// with capacity reserved up front for `pop_count` elements.
127 std::vector<SampleCounts> samples;
128 samples.reserve( pop_count );
129 for(
size_t i = 0; i < sample_filter.size(); ++i ) {
130 if( sample_filter[i] ) {
131 samples.push_back( std::move( variant.samples[i] ));
// Replace the samples of the Variant with the reduced list.
134 assert( samples.size() == pop_count );
135 variant.samples = std::move( samples );
// Fragment: three returned lambdas, each capturing `max_depth` by value and receiving
// the Variant by non-const reference. Presumably there is one per subsampling method;
// the dispatch between them and the lambda bodies are not visible in this excerpt.
150 return [ max_depth ](
Variant& variant ){
155 return [ max_depth ](
Variant& variant ){
160 return [ max_depth ](
Variant& variant ){
// Error raised with an "Invalid method" message; presumably reached when none of the
// above cases applies (the surrounding control flow is not visible) - TODO confirm.
166 throw std::invalid_argument(
167 "Invalid method provided for make_variant_input_stream_sample_subsampling_transform()"
// Fragment of a routine that returns a lambda observing Variants (by const reference) to check
// their order, and optionally their position against the chromosome lengths of a SequenceDict.
// NOTE(review): the signature line, the declaration of `current_locus`, parts of the error
// messages, and the closing braces are not visible in this excerpt.
176 std::shared_ptr<genesis::sequence::SequenceDict> sequence_dict,
177 bool check_sequence_lengths
// The lambda is `mutable` and captures `current_locus` by value, so its own copy of the
// previously seen locus can be updated from one call to the next.
182 return [ sequence_dict, check_sequence_lengths, current_locus ](
Variant const& variant )
mutable {
// Only compare against the previous locus if one has been recorded.
188 if( ! current_locus.
empty() ) {
// The new locus has to compare greater than the previous one according to locus_greater(),
// which is also given the sequence dictionary; otherwise the input is considered unsorted.
191 if( !
locus_greater( variant.chromosome, variant.position, current_locus, sequence_dict )) {
192 throw std::runtime_error(
193 "Invalid sorting order of input Variants. By default, we expect lexicographical "
194 "sorting of chromosomes, and then sorting by position within chromosomes. "
195 "Alternatively, when a sequence dictionary is specified (such as from a .dict "
196 "or .fai file, or from a reference genome .fasta file), we expect the order of "
197 "chromosomes as specified there. "
198 "Offending input going from " +
to_string( current_locus ) +
" to " +
// The length check is only done if requested and if a dictionary was provided:
// the position must not exceed the length stored for the chromosome.
205 if( check_sequence_lengths && sequence_dict ) {
206 auto const& entry = sequence_dict->
get( variant.chromosome );
207 if( variant.position > entry.length ) {
208 throw std::runtime_error(
209 "The current position " +
211 " of the input Variant is greater than the length of the chromosome "
212 "as specified by the SequenceDict, which is " +
std::to_string( entry.length )
// Remember the locus of this Variant for the comparison in the next call.
220 current_locus =
GenomeLocus( variant.chromosome, variant.position );
// NOTE(review): the condition guarding this error is not visible in this excerpt.
222 throw std::runtime_error(
223 "Invalid empty chromosome or position 0 found in input Variant."
// Fragment of a routine that returns a lambda observing Variants (by const reference) to check
// that their position does not exceed the chromosome length stored in the SequenceDict.
// NOTE(review): the signature line, parts of the error message, and the closing braces
// are not visible in this excerpt.
230 std::shared_ptr<genesis::sequence::SequenceDict> sequence_dict
// The shared pointer is captured by value.
// NOTE(review): no null check of `sequence_dict` is visible here before it is dereferenced
// below - confirm in the full file that one exists or that callers guarantee non-null.
232 return [ sequence_dict ](
Variant const& variant ) {
// Look up the dictionary entry of the chromosome, and compare the position to its length.
234 auto const& entry = sequence_dict->
get( variant.chromosome );
235 if( variant.position > entry.length ) {
236 throw std::runtime_error(
237 "The current position " +
239 " of the input Variant is greater than the length of the chromosome "
240 "as specified by the SequenceDict, which is " +
std::to_string( entry.length )