A toolkit for working with phylogenetic data.
v0.24.0
utils/containers/dataframe/operators.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2020 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
36 
37 #include <algorithm>
38 #include <cassert>
39 #include <cstdint>
40 #include <functional>
41 #include <stdexcept>
42 #include <unordered_set>
43 
44 namespace genesis {
45 namespace utils {
46 
47 // ================================================================================================
48 // Check Conversion Functions
49 // ================================================================================================
50 
51 static bool is_numerical_type_( Dataframe const& df, size_t col_index )
52 {
53  // Try all supported types. Probably, some template meta-programming could help here...
54  return df[col_index].is<float>() || df[col_index].is<double>()
55  || df[col_index].is<int8_t>() || df[col_index].is<int16_t>()
56  || df[col_index].is<int32_t>() || df[col_index].is<int64_t>()
57  || df[col_index].is<uint8_t>() || df[col_index].is<uint16_t>()
58  || df[col_index].is<uint32_t>() || df[col_index].is<uint64_t>();
59 }
60 
61 bool is_convertible_to_bool( Dataframe const& df, size_t col_index )
62 {
63  // Basic checks.
64  if( col_index >= df.cols() ) {
65  throw std::invalid_argument( "Dataframe column index is out of range." );
66  }
67 
68  // Certain strings ("yes", "no" etc) can be converted to bool.
69  if( df[col_index].is<std::string>() ) {
70  auto const& df_cast = df[col_index].as<std::string>();
71  return is_convertible_to_bool( df_cast.begin(), df_cast.end() );
72  }
73 
74  // If its not a string, we can only convert numerical types.
75  return is_numerical_type_( df, col_index );
76 }
77 
78 bool is_convertible_to_bool( Dataframe const& df, std::string const& col_name )
79 {
80  // Throws if column name is not found.
81  return is_convertible_to_bool( df, df.col_index( col_name ));
82 }
83 
84 bool is_convertible_to_double( Dataframe const& df, size_t col_index )
85 {
86  if( col_index >= df.cols() ) {
87  throw std::invalid_argument( "Dataframe column index is out of range." );
88  }
89 
90  // Certain strings can be converted to double.
91  if( df[col_index].is<std::string>() ) {
92  auto const& df_cast = df[col_index].as<std::string>();
93  return is_convertible_to_double( df_cast.begin(), df_cast.end() );
94  }
95 
96  // If its not a string, we can only convert numerical types.
97  return is_numerical_type_( df, col_index );
98 }
99 
100 bool is_convertible_to_double( Dataframe const& df, std::string const& col_name )
101 {
102  // Throws if column name is not found.
103  return is_convertible_to_double( df, df.col_index( col_name ));
104 }
105 
106 // ================================================================================================
107 // Conversion Functions
108 // ================================================================================================
109 
110 template<typename S, typename T>
111 void convert_to_type_( Dataframe& df, size_t col_index )
112 {
113  auto const& df_cast = df[col_index].as<S>();
114  std::vector<T> conv_col;
115  conv_col.reserve( df_cast.size() );
116  for( size_t i = 0; i < df_cast.size(); ++i ) {
117  conv_col[i] = static_cast<T>( df_cast[i] );
118  }
119  df.replace_col<T>( col_index, conv_col );
120 }
121 
122 template<typename T>
123 void convert_to_type_( Dataframe& df, size_t col_index )
124 {
125  // Try all supported types. Probably, some template meta-programming could help here...
126  // if( df[col_index].is<bool>() ) {
127  // convert_to_type_<bool, T>( df, col_index );
128  // } else
129  if( df[col_index].is<float>() ) {
130  convert_to_type_<float, T>( df, col_index );
131  } else if( df[col_index].is<double>() ) {
132  convert_to_type_<double, T>( df, col_index );
133  } else if( df[col_index].is<int8_t>() ) {
134  convert_to_type_<int8_t, T>( df, col_index );
135  } else if( df[col_index].is<int16_t>() ) {
136  convert_to_type_<int16_t, T>( df, col_index );
137  } else if( df[col_index].is<int32_t>() ) {
138  convert_to_type_<int32_t, T>( df, col_index );
139  } else if( df[col_index].is<int64_t>() ) {
140  convert_to_type_<int64_t, T>( df, col_index );
141  } else if( df[col_index].is<uint8_t>() ) {
142  convert_to_type_<uint8_t, T>( df, col_index );
143  } else if( df[col_index].is<uint16_t>() ) {
144  convert_to_type_<uint16_t, T>( df, col_index );
145  } else if( df[col_index].is<uint32_t>() ) {
146  convert_to_type_<uint32_t, T>( df, col_index );
147  } else if( df[col_index].is<uint64_t>() ) {
148  convert_to_type_<uint64_t, T>( df, col_index );
149  } else {
150  throw std::invalid_argument(
151  "Dataframe column is not of a type that be converted to the target type."
152  );
153  }
154 }
155 
156 void convert_to_bool( Dataframe& df, size_t col_index )
157 {
158  if( col_index >= df.cols() ) {
159  throw std::invalid_argument( "Dataframe column index is out of range." );
160  }
161  if( df[col_index].is<std::string>() ) {
162 
163  // Convert the strings, resolving things like "yes" or "off"
164  auto const& df_cast = df[col_index].as<std::string>();
165  auto const bool_col = convert_to_bool( df_cast.begin(), df_cast.end(), df_cast.size() );
166 
167  // Convert to signed char, because std::vector<bool> is not a container...
168  auto char_col = std::vector<signed char>( bool_col.size() );
169  for( size_t i = 0; i < bool_col.size(); ++i ) {
170  char_col[i] = bool_col[i];
171  }
172 
173  df.replace_col<signed char>( col_index, char_col );
174  } else {
175  // Currently, we do not need number to bool conversion,
176  // and with the given helper functions, it does not work the way we want it to.
177  // So, out of lazyness, we forbid those conversions. For now.
178 
179  // convert_to_type_<char>( df, col_index );
180  throw std::invalid_argument(
181  "Dataframe column conversion to bool is only implemented for strings."
182  );
183  }
184 }
185 
186 void convert_to_bool( Dataframe& df, std::string const& col_name )
187 {
188  // Throws if column name is not found.
189  return convert_to_bool( df, df.col_index( col_name ));
190 }
191 
192 void convert_to_double( Dataframe& df, size_t col_index )
193 {
194  if( col_index >= df.cols() ) {
195  throw std::invalid_argument( "Dataframe column index is out of range." );
196  }
197  if( df[col_index].is<std::string>() ) {
198  auto const& df_cast = df[col_index].as<std::string>();
199  auto const double_col = convert_to_double( df_cast.begin(), df_cast.end(), df_cast.size() );
200  df.replace_col<double>( col_index, double_col );
201  } else {
202  convert_to_type_<double>( df, col_index );
203  }
204 }
205 
206 void convert_to_double( Dataframe& df, std::string const& col_name )
207 {
208  // Throws if column name is not found.
209  return convert_to_double( df, df.col_index( col_name ));
210 }
211 
212 // =================================================================================================
213 // Summarize Columns
214 // =================================================================================================
215 
216 std::string summarize_column_common_( Dataframe const& df, size_t col_index, std::string const& description )
217 {
218  return std::to_string( col_index ) + ": \"" + df[col_index].name() + "\" " + description + "\n";
219 }
220 
221 template<typename T>
222 std::string summarize_column_double_( Dataframe const& df, size_t col_index )
223 {
224  auto const& col_cast = df[col_index].as<T>();
225 
226  // Get the min and max, excluding nan entries.
227  // Then, count the number of valid and total entries,
228  // and use this to determine the number of unused entries.
229  auto const mm = finite_minimum_maximum( col_cast.begin(), col_cast.end() );
230  auto const ip = count_finite_elements( col_cast.begin(), col_cast.end() );
231  assert( ip.first <= ip.second );
232  assert( ip.second == df.rows() );
233  auto const iv = ip.second - ip.first;
234 
236  df, col_index,
237  "(numerical, min: " + std::to_string( mm.min ) + ", max: " + std::to_string( mm.max ) +
238  ", unused entries: " + std::to_string(iv) + ")"
239  );
240 }
241 
242 template<typename T>
243 std::string summarize_column_int_( Dataframe const& df, size_t col_index )
244 {
245  auto const& col_cast = df[col_index].as<T>();
246  auto const mm = std::minmax_element( col_cast.begin(), col_cast.end() );
248  df, col_index,
249  "(numerical, min: " + std::to_string( *mm.first ) + ", max: " +
250  std::to_string( *mm.second ) + ")"
251  );
252 }
253 
254 std::string summarize_column_string_( Dataframe const& df, size_t col_index )
255 {
256  // Make copies to get number of unique entries.
257  auto const& str_cast = df[col_index].as<std::string>();
258  std::unordered_set<std::string> uniq( str_cast.begin(), str_cast.end() );
259 
261  df, col_index,
262  "(string, unique elements: " + std::to_string( uniq.size() ) + ")"
263  );
264 }
265 
266 std::string summarize_column( Dataframe const& df, size_t col_index )
267 {
268  if( df[col_index].is<float>() ) {
269  return summarize_column_double_<float>( df, col_index );
270  } else if( df[col_index].is<double>() ) {
271  return summarize_column_double_<double>( df, col_index );
272  } else if( df[col_index].is<int8_t>() ) {
273  return summarize_column_int_<int8_t>( df, col_index );
274  } else if( df[col_index].is<int16_t>() ) {
275  return summarize_column_int_<int16_t>( df, col_index );
276  } else if( df[col_index].is<int32_t>() ) {
277  return summarize_column_int_<int32_t>( df, col_index );
278  } else if( df[col_index].is<int64_t>() ) {
279  return summarize_column_int_<int64_t>( df, col_index );
280  } else if( df[col_index].is<uint8_t>() ) {
281  return summarize_column_int_<uint8_t>( df, col_index );
282  } else if( df[col_index].is<uint16_t>() ) {
283  return summarize_column_int_<uint16_t>( df, col_index );
284  } else if( df[col_index].is<uint32_t>() ) {
285  return summarize_column_int_<uint32_t>( df, col_index );
286  } else if( df[col_index].is<uint64_t>() ) {
287  return summarize_column_int_<uint64_t>( df, col_index );
288  } else if( df[col_index].is<std::string>() ) {
289  return summarize_column_string_( df, col_index );
290  }
291 
293  df, col_index,
294  "(unknown data type)"
295  );
296 }
297 
298 std::string summarize_column( Dataframe const& df, std::string const& col_name )
299 {
300  // Throws if column name is not found.
301  return summarize_column( df, df.col_index( col_name ));
302 }
303 
304 std::string summarize_columns( Dataframe const& df )
305 {
306  std::string result = "Data contains " + std::to_string( df.rows() ) + " rows, and the following columns:\n";
307  for( size_t i = 0; i < df.cols(); ++i ) {
308  result += summarize_column( df, i );
309  }
310  return result;
311 }
312 
313 // =================================================================================================
314 // Merging and Combining
315 // =================================================================================================
316 
317 // void append(
318 // Dataframe& target,
319 // Dataframe const& appendix,
320 // size_t col_index,
321 // DataframeMergeMode mode
322 // ) {
323 // auto const& col_name = appendix.col_name(col_index);
324 // if( col_name.empty() ) {
325 // throw std::runtime_error( "Cannot append dataframe with empty column names." );
326 // }
327 // if( target.has_col_name( col_name ) ) {
328 // throw std::runtime_error( "Column '" + col_name + "' already exists in target dataframe." );
329 // }
330 // auto& new_col = target.add_col( col_name );
331 //
332 // for( size_t r = 0; r < target.rows(); ++r ) {
333 // auto const& row_name = target.row_name(r);
334 //
335 // // Throws if not present in appendix
336 // auto const ari = appendix.row_index( row_name );
337 //
338 // new_col[r] = appendix[col_index][ari];
339 // }
340 // }
341 //
342 // void append(
343 // Dataframe& target,
344 // Dataframe const& appendix,
345 // std::string const& col_name,
346 // DataframeMergeMode mode
347 // ) {
348 //
349 // }
350 //
351 // void append(
352 // Dataframe& target,
353 // Dataframe const& appendix,
354 // DataframeMergeMode mode
355 // ) {
356 // for( size_t c = 0; c < appendix.cols(); ++c ) {
357 // append( target, appendix, c, mode );
358 // }
359 // }
360 //
361 // /**
362 // * @brief Merge two Dataframe%s by appending the columns of the second one (@p appendix) to the
363 // * first one (@p target).
364 // *
365 // * The @p appendix needs to contain at least all the rows (by name) of the @p target.
366 // * This necessitates that all rows in @p target need to be named (no empty names).
367 // * Also, the column names need to be distinct.
368 // */
369 // template< typename T >
370 // void append( Dataframe<T>& target, Dataframe<T> const& appendix )
371 // {
372 // // TODO this could also simply be an overload of df.add_col() ...
373 //
374 //
375 // }
376 //
377 // template< typename T >
378 // Dataframe<T> combine( Dataframe<T> const& lhs, Dataframe<T> const& rhs )
379 // {
380 // // First copy lhs and add all columns of rhs to it.
381 // auto result = lhs;
382 // append( result, rhs );
383 //
384 // // Now add all additional rows of rhs.
385 // for( size_t r = 0; r < rhs.rows(); ++r ) {
386 // auto const& row_name = rhs.row_name(r);
387 // if( ! result.has_row_name( row_name )) {
388 //
389 // // Add the missing row and copy over all data for the columns of rhs.
390 // result.add_row( row_name );
391 // for( auto const& col : rhs ) {
392 // result[ col.name() ][ row_name ] = col[ row_name ];
393 // }
394 // }
395 // }
396 //
397 // return result;
398 // }
399 
400 // ================================================================================================
401 // Helpful Functions
402 // ================================================================================================
403 
404 bool validate( Dataframe const& df )
405 {
406  if( df.col_names_.size() != df.columns_.size() ) {
407  return false;
408  }
409 
410  for( size_t i = 0; i < df.columns_.size(); ++i ) {
411  if( df.columns_[i]->size() != df.row_names_.size() ) {
412  return false;
413  }
414  if( df.columns_[i]->index() != i ) {
415  return false;
416  }
417  if( &df.columns_[i]->dataframe() != &df ) {
418  return false;
419  }
420  }
421 
422  for( auto const& rl : df.row_lookup_ ) {
423  if( rl.second >= df.row_names_.size() ) {
424  return false;
425  }
426  if( rl.first != df.row_names_[ rl.second ] ) {
427  return false;
428  }
429  }
430 
431  for( auto const& cl : df.col_lookup_ ) {
432  if( cl.second >= df.col_names_.size() ) {
433  return false;
434  }
435  if( cl.first != df.col_names_[ cl.second ] ) {
436  return false;
437  }
438  }
439 
440  return true;
441 }
442 
443 } // namespace utils
444 } // namespace genesis
bool is_convertible_to_double(Dataframe const &df, size_t col_index)
bool is_convertible_to_bool(Dataframe const &df, size_t col_index)
void convert_to_bool(Dataframe &df, size_t col_index)
void convert_to_type_(Dataframe &df, size_t col_index)
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
void convert_to_double(Dataframe &df, size_t col_index)
std::string summarize_columns(Dataframe const &df)
Column< T > & replace_col(size_type at_index)
size_t col_index(std::string const &col_name) const
std::string summarize_column_int_(Dataframe const &df, size_t col_index)
Provides some commonly used string utility functions.
static bool is_numerical_type_(Dataframe const &df, size_t col_index)
std::string summarize_column_string_(Dataframe const &df, size_t col_index)
bool validate(Dataframe const &df)
Merge two Dataframes by appending the columns of the second one (appendix) to the first one (target)...
std::string summarize_column(Dataframe const &df, size_t col_index)
std::shared_ptr< BaseOutputTarget > to_string(std::string &target_string)
Obtain an output target for writing to a string.
std::string summarize_column_double_(Dataframe const &df, size_t col_index)
std::string summarize_column_common_(Dataframe const &df, size_t col_index, std::string const &description)
MinMaxPair< double > finite_minimum_maximum(ForwardIterator first, ForwardIterator last)
Return the minimum and the maximum of a range of double values.
Definition: statistics.hpp:239
std::pair< size_t, size_t > count_finite_elements(ForwardIterator first, ForwardIterator last)
Count the number of finite elements in a range of double values.
Definition: statistics.hpp:150