A toolkit for working with phylogenetic data.
v0.24.0
dataframe.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2020 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
38 
39 #include <algorithm>
40 #include <cassert>
41 #include <sstream>
42 #include <stdexcept>
43 
44 namespace genesis {
45 namespace utils {
46 
47 // =================================================================================================
48 // Dataframe Helper Functions
49 // =================================================================================================
50 
51 Dataframe glm_prepare_dataframe( Dataframe const& df, std::string& report )
52 {
53  // Prepare rows of the resulting dataframe.
54  Dataframe result;
55  for( size_t i = 0; i < df.rows(); ++i ) {
56  result.add_row( df.row_name(i) );
57  }
58 
59  // While iterating the dataframe, we also produce some user info.
60  std::stringstream ss;
61  ss << "Data contains " << result.rows() << " rows, and the following columns:\n";
62 
63  // Do the conversions.
64  for( size_t i = 0; i < df.cols(); ++i ) {
65  if( df[i].is<double>() ) {
66 
67  // Simple case: column is already double.
68  auto const& dbl_col = df[i].as<double>();
69  result.add_col<double>( df.col_name(i), dbl_col );
70 
71  // Get the min and max, excluding nan entries.
72  // Then, count the number of valid and total entries,
73  // and use this to determine the number of unused entries.
74  auto const mm = finite_minimum_maximum( dbl_col.begin(), dbl_col.end() );
75  auto const ip = count_finite_elements( dbl_col.begin(), dbl_col.end() );
76  assert( ip.first <= ip.second );
77  assert( ip.second == df.rows() );
78  auto const iv = ip.second - ip.first;
79 
80  // User output.
81  ss << i << ": \"" << df[i].name() << "\" (numerical, min: " << ( mm.min );
82  ss << ", max: " << ( mm.max ) << ", unused entries: " << iv << ")\n";
83 
84  } else if( df[i].is<std::string>() ) {
85  auto const& df_col = df[i].as<std::string>();
86 
87  if( is_convertible_to_bool_double( df_col.begin(), df_col.end() )) {
88 
89  // Convert to bool, but as doubles. This ensures that empty cells
90  // are converted to nan instead of false/0.
91  auto const bool_col = convert_to_bool_double(
92  df_col.begin(), df_col.end(), df_col.size()
93  );
94  result.add_col<double>( df.col_name(i), bool_col );
95 
96  // Count the number of entries for user output.
97  size_t true_cnt = 0;
98  size_t false_cnt = 0;
99  for( size_t j = 0; j < bool_col.size(); ++j ) {
100  true_cnt += ( bool_col[j] == 1.0 ? 1 : 0 );
101  false_cnt += ( bool_col[j] == 0.0 ? 1 : 0 );
102  }
103  assert( bool_col.size() >= true_cnt );
104  assert( bool_col.size() >= false_cnt );
105  auto const ip = count_finite_elements( bool_col.begin(), bool_col.end() );
106  assert( ip.first <= ip.second );
107  assert( ip.second == df.rows() );
108  auto const iv = ip.second - ip.first;
109 
110  // User output.
111  ss << i << ": \"" << df[i].name() << "\" (binary, true: " << ( true_cnt );
112  ss << ", false: " << ( false_cnt ) << ", unused entries: " << iv << ")\n";
113 
114  } else if( is_convertible_to_double( df_col.begin(), df_col.end() )) {
115 
116  auto const dbl_col = convert_to_double( df_col.begin(), df_col.end(), df_col.size() );
117  result.add_col<double>( df.col_name(i), dbl_col );
118 
119  // Get the min and max, excluding nan entries.
120  // Then, count the number of valid and total entries,
121  // and use this to determine the number of unused entries.
122  auto const mm = finite_minimum_maximum( dbl_col.begin(), dbl_col.end() );
123  auto const ip = count_finite_elements( dbl_col.begin(), dbl_col.end() );
124  assert( ip.first <= ip.second );
125  assert( ip.second == df.rows() );
126  auto const iv = ip.second - ip.first;
127 
128  // User output.
129  ss << i << ": \"" << df[i].name() << "\" (numerical, min: " << ( mm.min );
130  ss << ", max: " << ( mm.max ) << ", unused entries: " << iv << ")\n";
131 
132  } else {
133 
134  // No conversion possible. Make it a factor. We exclude empty entries,
135  // as they do not contain any valid information, and hence would add random signal.
136  auto const fact = glm_factor(
137  df_col.begin(),
138  df_col.end(),
139  std::vector<std::string>{},
140  std::vector<std::string>{""}
141  );
142  auto const fact_df = glm_indicator_variables( fact, df.row_names() );
143 
144  // Add factor cols to result. They are named using the format:
145  // <original column name>.<reference level>.<factor level>,
146  // where the level names come from the glm_indicator_variables() function.
147  for( size_t j = 0; j < fact_df.cols(); ++j ) {
148  assert( fact_df[j].Dataframe::ColumnBase::is<double>() );
149  auto const& fact_col = fact_df[j].Dataframe::ColumnBase::as<double>();
150  result.add_col<double>( df_col.name() + "." + fact_col.name(), fact_col );
151  }
152 
153  // Count number of empty entries that were excluded.
154  size_t empty_cnt = 0;
155  for( auto const& c : df_col ) {
156  if( c.empty() ) {
157  ++empty_cnt;
158  }
159  }
160  assert( empty_cnt <= df_col.size() );
161 
162  // User output.
163  ss << i << ": \"" << df[i].name() << "\" (categorical, levels: ";
164  ss << fact.levels.size() << ", unused entries: " << empty_cnt << ")\n";
165  }
166 
167  } else {
168 
169  // We might add other types in the future. For now, we throw instead.
170  throw std::invalid_argument(
171  "Can only use Dataframe Columns of types double or std::string for preparing "
172  "a GLM Matrix."
173  );
174  }
175  }
176 
177  report = ss.str();
178  return result;
179 }
180 
182 {
183  std::string report;
184  return glm_prepare_dataframe( df, report );
185 }
186 
188  Dataframe const& df,
189  std::vector<std::string> row_order
190 ) {
191  auto result = Matrix<double>( df.rows(), df.cols() );
192 
193  // Prepare row name order.
194  if( ! row_order.empty() && row_order.size() != df.rows() ) {
195  throw std::runtime_error( "Row order has to be empty or of same size as Dataframe rows." );
196  }
197  auto const row_names = ( row_order.empty() ? df.row_names() : row_order );
198  assert( row_names.size() == df.rows() );
199  assert( result.rows() == df.rows() );
200  assert( result.cols() == df.cols() );
201 
202  // Iterate columns of the dataframe.
203  for( size_t c = 0; c < df.cols(); ++c ) {
204  if( ! df[c].Dataframe::ColumnBase::is<double>() ) {
205  throw std::runtime_error( "GLM Dataframe conversion expects Columns of type double." );
206  }
207  auto const& col = df[c].Dataframe::ColumnBase::as<double>();
208 
209  // Add row content in the provided order.
210  for( size_t r = 0; r < row_names.size(); ++r ) {
211  auto const& row_name = row_names[r];
212  if( ! df.has_row_name( row_name )) {
213  throw std::runtime_error( "Invalid row name in Dataframe for GLM conversion." );
214  }
215 
216  result( r, c ) = col[ row_name ];
217  }
218  }
219 
220  return result;
221 }
222 
224  Dataframe const& df
225 ) {
226  return glm_convert_dataframe( df, std::vector<std::string>() );
227 }
228 
229 } // namespace utils
230 } // namespace genesis
std::string const & row_name(size_type row_index) const
bool is_convertible_to_double(Dataframe const &df, size_t col_index)
Column< T > & add_col(std::string const &name)
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
void convert_to_double(Dataframe &df, size_t col_index)
std::string const & col_name(size_type col_index) const
Matrix< double > glm_convert_dataframe(Dataframe const &df, std::vector< std::string > row_order)
Definition: dataframe.cpp:187
double convert_to_bool_double(std::string const &str)
Definition: convert.cpp:113
bool is_convertible_to_bool_double(std::string const &str)
Definition: convert.cpp:122
Provides some commonly used string utility functions.
self_type & add_row(std::string const &name)
std::vector< std::string > const & row_names() const
GlmFactor< typename ForwardIterator::value_type > glm_factor(ForwardIterator first, ForwardIterator last, std::vector< typename ForwardIterator::value_type > const &levels, std::vector< typename ForwardIterator::value_type > const &exclude)
Reduce a list of values in the given range to a set of unique factors.
Definition: factor.hpp:88
bool has_row_name(std::string const &row_name) const
Dataframe glm_prepare_dataframe(Dataframe const &df, std::string &report)
Definition: dataframe.cpp:51
Dataframe glm_indicator_variables(GlmFactor< T > const &factor, T const &reference_level, std::vector< std::string > const &row_names=std::vector< std::string >{})
Turn a GlmFactor into a set of (dummy) indicator variables to be used in regression.
Definition: factor.hpp:214
MinMaxPair< double > finite_minimum_maximum(ForwardIterator first, ForwardIterator last)
Return the minimum and the maximum of a range of double values.
Definition: statistics.hpp:239
std::pair< size_t, size_t > count_finite_elements(ForwardIterator first, ForwardIterator last)
Count the number of finite elements in a range of double values.
Definition: statistics.hpp:150