55 for(
size_t i = 0; i < df.
rows(); ++i ) {
61 ss <<
"Data contains " << result.
rows() <<
" rows, and the following columns:\n";
64 for(
size_t i = 0; i < df.
cols(); ++i ) {
65 if( df[i].is<double>() ) {
68 auto const& dbl_col = df[i].as<
double>();
76 assert( ip.first <= ip.second );
77 assert( ip.second == df.
rows() );
78 auto const iv = ip.second - ip.first;
81 ss << i <<
": \"" << df[i].name() <<
"\" (numerical, min: " << ( mm.min );
82 ss <<
", max: " << ( mm.max ) <<
", unused entries: " << iv <<
")\n";
84 }
else if( df[i].is<std::string>() ) {
85 auto const& df_col = df[i].as<std::string>();
92 df_col.begin(), df_col.end(), df_col.size()
99 for(
size_t j = 0; j < bool_col.size(); ++j ) {
100 true_cnt += ( bool_col[j] == 1.0 ? 1 : 0 );
101 false_cnt += ( bool_col[j] == 0.0 ? 1 : 0 );
103 assert( bool_col.size() >= true_cnt );
104 assert( bool_col.size() >= false_cnt );
106 assert( ip.first <= ip.second );
107 assert( ip.second == df.
rows() );
108 auto const iv = ip.second - ip.first;
111 ss << i <<
": \"" << df[i].name() <<
"\" (binary, true: " << ( true_cnt );
112 ss <<
", false: " << ( false_cnt ) <<
", unused entries: " << iv <<
")\n";
116 auto const dbl_col =
convert_to_double( df_col.begin(), df_col.end(), df_col.size() );
124 assert( ip.first <= ip.second );
125 assert( ip.second == df.
rows() );
126 auto const iv = ip.second - ip.first;
129 ss << i <<
": \"" << df[i].name() <<
"\" (numerical, min: " << ( mm.min );
130 ss <<
", max: " << ( mm.max ) <<
", unused entries: " << iv <<
")\n";
139 std::vector<std::string>{},
140 std::vector<std::string>{
""}
147 for(
size_t j = 0; j < fact_df.cols(); ++j ) {
148 assert( fact_df[j].Dataframe::ColumnBase::is<double>() );
149 auto const& fact_col = fact_df[j].Dataframe::ColumnBase::as<
double>();
150 result.
add_col<
double>( df_col.name() +
"." + fact_col.name(), fact_col );
154 size_t empty_cnt = 0;
155 for(
auto const& c : df_col ) {
160 assert( empty_cnt <= df_col.size() );
163 ss << i <<
": \"" << df[i].name() <<
"\" (categorical, levels: ";
164 ss << fact.levels.size() <<
", unused entries: " << empty_cnt <<
")\n";
170 throw std::invalid_argument(
171 "Can only use Dataframe Columns of types double or std::string for preparing "
189 std::vector<std::string> row_order
194 if( ! row_order.empty() && row_order.size() != df.
rows() ) {
195 throw std::runtime_error(
"Row order has to be empty or of same size as Dataframe rows." );
197 auto const row_names = ( row_order.empty() ? df.
row_names() : row_order );
198 assert( row_names.size() == df.
rows() );
199 assert( result.rows() == df.
rows() );
200 assert( result.cols() == df.
cols() );
203 for(
size_t c = 0; c < df.
cols(); ++c ) {
204 if( ! df[c].Dataframe::ColumnBase::is<double>() ) {
205 throw std::runtime_error(
"GLM Dataframe conversion expects Columns of type double." );
207 auto const& col = df[c].Dataframe::ColumnBase::as<
double>();
210 for(
size_t r = 0; r < row_names.size(); ++r ) {
211 auto const& row_name = row_names[r];
213 throw std::runtime_error(
"Invalid row name in Dataframe for GLM conversion." );
216 result( r, c ) = col[ row_name ];