A library for working with phylogenetic data.
v0.25.0
sync_reader.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2021 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
32 
36 
37 #include <cassert>
38 #include <stdexcept>
39 
40 namespace genesis {
41 namespace population {
42 
43 // =================================================================================================
44 // Reading & Parsing
45 // =================================================================================================
46 
47 std::vector<Variant> SyncReader::read(
48  std::shared_ptr< utils::BaseInputSource > source
49 ) const {
50  std::vector<Variant> result;
51  utils::InputStream it( source );
52 
53  Variant sample_set;
54  while( parse_line_( it, sample_set, {}, false )) {
55  result.push_back( std::move( sample_set ));
56  sample_set = Variant{};
57  }
58  return result;
59 }
60 
61 std::vector<Variant> SyncReader::read(
62  std::shared_ptr< utils::BaseInputSource > source,
63  std::vector<bool> const& sample_filter
64 ) const {
65  std::vector<Variant> result;
66  utils::InputStream it( source );
67 
68  Variant sample_set;
69  while( parse_line_( it, sample_set, sample_filter, true )) {
70  result.push_back( std::move( sample_set ));
71  sample_set = Variant{};
72  }
73  return result;
74 }
75 
77  utils::InputStream& input_stream,
78  Variant& sample_set
79 ) const {
80  return parse_line_( input_stream, sample_set, {}, false );
81 }
82 
84  utils::InputStream& input_stream,
85  Variant& sample_set,
86  std::vector<bool> const& sample_filter
87 ) const {
88  return parse_line_( input_stream, sample_set, sample_filter, true );
89 }
90 
91 // =================================================================================================
92 // Internal Parsing
93 // =================================================================================================
94 
95 bool SyncReader::parse_line_(
96  utils::InputStream& input_stream,
97  Variant& sample_set,
98  std::vector<bool> const& sample_filter,
99  bool use_sample_filter
100 ) const {
101  using namespace genesis::utils;
102 
103  // Shorthand.
104  auto& it = input_stream;
105  if( !it ) {
106  return false;
107  }
108 
109  // Read fixed columns for chromosome and position.
110  sample_set.chromosome = utils::read_until( it, []( char c ){ return c == '\t' || c == '\n'; });
111  utils::read_char_or_throw( it, '\t' );
112  sample_set.position = utils::parse_unsigned_integer<size_t>( it );
113  utils::read_char_or_throw( it, '\t' );
114  if( !it || *it == '\n' ) {
115  throw std::runtime_error(
116  std::string("In ") + it.source_name() + ": Unexpected end of line at " + it.at()
117  );
118  }
119 
120  // Read and check fixed column for the refererence base.
121  auto const rb = to_upper( *it );
122  if( rb != 'A' && rb != 'C' && rb != 'G' && rb != 'T' && rb != 'N' && rb != '.' && rb != '*' ) {
123  throw std::runtime_error(
124  std::string("In ") + it.source_name() + ": Invalid reference base char " +
125  char_to_hex(rb) + " at " + it.at()
126  );
127  }
128  sample_set.reference_base = rb;
129  ++it;
130 
131  // Read the samples. We switch once for the first line, and thereafter check that we read the
132  // same number of samples each time.
133  if( sample_set.samples.empty() ) {
134  size_t src_index = 0;
135  while( it && *it != '\n' ) {
136  if( !use_sample_filter || ( src_index < sample_filter.size() && sample_filter[src_index] )) {
137  sample_set.samples.emplace_back();
138  parse_sample_( it, sample_set.samples.back() );
139  } else {
140  skip_sample_( it );
141  }
142  ++src_index;
143  }
144  } else {
145  // Here we need two indices, one over the samples in the file (source),
146  // and one for the samples that we are writing in our Variant (destination).
147  size_t src_index = 0;
148  size_t dst_index = 0;
149  while( it && *it != '\n' ) {
150  // If the numbers do not match, go straight to the error check and throw.
151  if( dst_index >= sample_set.samples.size() ) {
152  break;
153  }
154 
155  // Parse or skip, depending on filter.
156  if( !use_sample_filter || ( src_index < sample_filter.size() && sample_filter[src_index] )) {
157  assert( dst_index < sample_set.samples.size() );
158  parse_sample_( it, sample_set.samples[dst_index] );
159  ++dst_index;
160  } else {
161  skip_sample_( it );
162  }
163  ++src_index;
164  }
165 
166  // Need to have the exact size of samples in the line.
167  if( dst_index != sample_set.samples.size() ) {
168  throw std::runtime_error(
169  "Malformed sync " + it.source_name() + " at " + it.at() +
170  ": Line with different number of samples."
171  );
172  }
173  }
174 
175  assert( !it || *it == '\n' );
176  ++it;
177  return true;
178 }
179 
180 void SyncReader::parse_sample_(
181  utils::InputStream& input_stream,
182  BaseCounts& sample
183 ) const {
184  using namespace genesis::utils;
185  read_char_or_throw( input_stream, '\t' );
186 
187  // The allele frequencies are stored in the order `A:T:C:G:N:del`,
188  // see https://sourceforge.net/p/popoolation2/wiki/Tutorial/
189  sample.a_count = parse_unsigned_integer<size_t>( input_stream );
190  read_char_or_throw( input_stream, ':' );
191  sample.t_count = parse_unsigned_integer<size_t>( input_stream );
192  read_char_or_throw( input_stream, ':' );
193  sample.c_count = parse_unsigned_integer<size_t>( input_stream );
194  read_char_or_throw( input_stream, ':' );
195  sample.g_count = parse_unsigned_integer<size_t>( input_stream );
196  read_char_or_throw( input_stream, ':' );
197  sample.n_count = parse_unsigned_integer<size_t>( input_stream );
198  read_char_or_throw( input_stream, ':' );
199  sample.d_count = parse_unsigned_integer<size_t>( input_stream );
200 }
201 
202 void SyncReader::skip_sample_(
203  utils::InputStream& input_stream
204 ) const {
205  using namespace genesis::utils;
206 
207  // The skip functions are slow, because they need char by char access to the input stream.
208  // Need to fix this at some point. For now, just read into an unused dummy.
209  BaseCounts dummy;
210  parse_sample_( input_stream, dummy );
211 
212  // Simply skip everything.
213  // read_char_or_throw( input_stream, '\t' );
214  // skip_while( input_stream, is_digit );
215  // read_char_or_throw( input_stream, ':' );
216  // skip_while( input_stream, is_digit );
217  // read_char_or_throw( input_stream, ':' );
218  // skip_while( input_stream, is_digit );
219  // read_char_or_throw( input_stream, ':' );
220  // skip_while( input_stream, is_digit );
221  // read_char_or_throw( input_stream, ':' );
222  // skip_while( input_stream, is_digit );
223  // read_char_or_throw( input_stream, ':' );
224  // skip_while( input_stream, is_digit );
225 }
226 
227 } // namespace population
228 } // namespace genesis
genesis::utils::InputStream
Stream interface for reading data from an InputSource, that keeps track of line and column counters.
Definition: input_stream.hpp:80
parser.hpp
genesis::population::Variant::position
size_t position
Definition: variant.hpp:65
sync_reader.hpp
genesis::population::Variant::reference_base
char reference_base
Definition: variant.hpp:66
genesis::population::SyncReader::parse_line
bool parse_line(utils::InputStream &input_stream, Variant &sample_set) const
Definition: sync_reader.cpp:76
genesis::utils
Definition: placement/formats/edge_color.hpp:42
genesis::population::SyncReader::read
std::vector< Variant > read(std::shared_ptr< utils::BaseInputSource > source) const
Definition: sync_reader.cpp:47
genesis::utils::to_upper
constexpr char to_upper(char c) noexcept
Return the upper case version of a letter, ASCII-only.
Definition: char.hpp:227
genesis::utils::read_char_or_throw
char read_char_or_throw(InputStream &source, char criterion, SkipWhitespace skip_ws=SkipWhitespace::kNone)
Lexing function that reads a single char from the stream and checks whether it equals the provided on...
Definition: scanner.hpp:299
genesis::population::Variant::samples
std::vector< BaseCounts > samples
Definition: variant.hpp:69
genesis::utils::read_until
std::string read_until(InputStream &source, char criterion)
Lexing function that reads from the stream until its current char equals the provided one....
Definition: scanner.hpp:254
genesis::population::Variant
A single variant at a position in a chromosome, along with BaseCounts for a set of samples.
Definition: variant.hpp:62
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
char.hpp
genesis::utils::char_to_hex
std::string char_to_hex(char c, bool full)
Return the name and hex representation of a char.
Definition: char.cpp:67
scanner.hpp
genesis::population::Variant::chromosome
std::string chromosome
Definition: variant.hpp:64