55 for(
size_t i = 0; i < df.
rows(); ++i ) {
61 ss <<
"Data contains " << result.
rows() <<
" rows, and the following columns:\n";
64 for(
size_t i = 0; i < df.
cols(); ++i ) {
65 if( df[i].is<double>() ) {
68 auto const& dbl_col = df[i].as<
double>();
76 assert( ip.first <= ip.second );
77 assert( ip.second == df.
rows() );
78 auto const iv = ip.second - ip.first;
81 ss << i <<
": \"" << df[i].name() <<
"\" (numerical, min: " << ( mm.min );
82 ss <<
", max: " << ( mm.max ) <<
", unused entries: " << iv <<
")\n";
84 }
else if( df[i].is<std::string>() ) {
85 auto const& df_col = df[i].as<std::string>();
92 df_col.begin(), df_col.end(), df_col.size()
99 for(
size_t j = 0; j < bool_col.size(); ++j ) {
100 true_cnt += ( bool_col[j] == 1.0 ? 1 : 0 );
101 false_cnt += ( bool_col[j] == 0.0 ? 1 : 0 );
103 assert( bool_col.size() >= true_cnt );
104 assert( bool_col.size() >= false_cnt );
106 assert( ip.first <= ip.second );
107 assert( ip.second == df.
rows() );
108 auto const iv = ip.second - ip.first;
111 ss << i <<
": \"" << df[i].name() <<
"\" (binary, true: " << ( true_cnt );
112 ss <<
", false: " << ( false_cnt ) <<
", unused entries: " << iv <<
")\n";
116 auto const dbl_col =
convert_to_double( df_col.begin(), df_col.end(), df_col.size() );
124 assert( ip.first <= ip.second );
125 assert( ip.second == df.
rows() );
126 auto const iv = ip.second - ip.first;
129 ss << i <<
": \"" << df[i].name() <<
"\" (numerical, min: " << ( mm.min );
130 ss <<
", max: " << ( mm.max ) <<
", unused entries: " << iv <<
")\n";
139 std::vector<std::string>{},
140 std::vector<std::string>{
""}
147 for(
size_t j = 0; j < fact_df.cols(); ++j ) {
148 assert( fact_df[j].Dataframe::ColumnBase::is<double>() );
149 auto const& fact_col = fact_df[j].Dataframe::ColumnBase::as<
double>();
150 result.
add_col<
double>( df_col.name() +
"." + fact_col.name(), fact_col );
154 size_t empty_cnt = 0;
155 for(
auto const& c : df_col ) {
160 assert( empty_cnt <= df_col.size() );
163 ss << i <<
": \"" << df[i].name() <<
"\" (categorical, levels: ";
164 ss << fact.levels.size() <<
", unused entries: " << empty_cnt <<
")\n";
170 throw std::invalid_argument(
171 "Can only use Dataframe Columns of types double or std::string for preparing " 189 std::vector<std::string> row_order
194 if( ! row_order.empty() && row_order.size() != df.
rows() ) {
195 throw std::runtime_error(
"Row order has to be empty or of same size as Dataframe rows." );
197 auto const row_names = ( row_order.empty() ? df.
row_names() : row_order );
198 assert( row_names.size() == df.
rows() );
199 assert( result.rows() == df.
rows() );
200 assert( result.cols() == df.
cols() );
203 for(
size_t c = 0; c < df.
cols(); ++c ) {
204 if( ! df[c].Dataframe::ColumnBase::is<double>() ) {
205 throw std::runtime_error(
"GLM Dataframe conversion expects Columns of type double." );
207 auto const& col = df[c].Dataframe::ColumnBase::as<
double>();
210 for(
size_t r = 0; r < row_names.size(); ++r ) {
211 auto const& row_name = row_names[r];
213 throw std::runtime_error(
"Invalid row name in Dataframe for GLM conversion." );
216 result( r, c ) = col[ row_name ];
std::string const & row_name(size_type row_index) const
bool is_convertible_to_double(Dataframe const &df, size_t col_index)
Column< T > & add_col(std::string const &name)
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
void convert_to_double(Dataframe &df, size_t col_index)
std::string const & col_name(size_type col_index) const
Matrix< double > glm_convert_dataframe(Dataframe const &df, std::vector< std::string > row_order)
double convert_to_bool_double(std::string const &str)
bool is_convertible_to_bool_double(std::string const &str)
Provides some commonly used string utility functions.
self_type & add_row(std::string const &name)
std::vector< std::string > const & row_names() const
GlmFactor< typename ForwardIterator::value_type > glm_factor(ForwardIterator first, ForwardIterator last, std::vector< typename ForwardIterator::value_type > const &levels, std::vector< typename ForwardIterator::value_type > const &exclude)
Reduce a list of values in the given range to a set of unique factors.
bool has_row_name(std::string const &row_name) const
Dataframe glm_prepare_dataframe(Dataframe const &df, std::string &report)
Dataframe glm_indicator_variables(GlmFactor< T > const &factor, T const &reference_level, std::vector< std::string > const &row_names=std::vector< std::string >{})
Turn a GlmFactor into a set of (dummy) indicator variables to be used in regression.
MinMaxPair< double > finite_minimum_maximum(ForwardIterator first, ForwardIterator last)
Return the minimum and the maximum of a range of double values.
std::pair< size_t, size_t > count_finite_elements(ForwardIterator first, ForwardIterator last)
Count the number of finite elements in a range of double values.