A toolkit for working with phylogenetic data.
v0.24.0
factor.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_UTILS_MATH_REGRESSION_FACTOR_H_
2 #define GENESIS_UTILS_MATH_REGRESSION_FACTOR_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2020 Lucas Czech and HITS gGmbH
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lucas.czech@h-its.org>
23  Exelixis Lab, Heidelberg Institute for Theoretical Studies
24  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
25 */
26 
35 
36 #include <algorithm>
37 #include <cassert>
38 #include <limits>
39 #include <stdexcept>
40 #include <sstream>
41 #include <vector>
42 
43 namespace genesis {
44 namespace utils {
45 
46 // =================================================================================================
47 // Factors and Categorical Variables
48 // =================================================================================================
49 
50 template<class T>
51 struct GlmFactor
52 {
57  std::vector<T> levels;
58 
67  std::vector<size_t> values;
68 };
69 
87 template <class ForwardIterator>
89  ForwardIterator first,
90  ForwardIterator last,
91  std::vector<typename ForwardIterator::value_type> const& levels,
92  std::vector<typename ForwardIterator::value_type> const& exclude
93 ) {
94  // Prepare result.
95  using T = typename ForwardIterator::value_type;
96  GlmFactor<T> result;
97 
98  // If no levels are provided, add all first.
99  if( levels.empty() ) {
100  auto it = first;
101  while( it != last ) {
102  result.levels.push_back( *it );
103  ++it;
104  }
105 
106  // Sort and make levels uniq.
107  // We don't do this if levels are given as param, because then we just want to keep
108  // they order they are already in.
109  std::sort( result.levels.begin(), result.levels.end() );
110  auto uit = std::unique( result.levels.begin(), result.levels.end() );
111  result.levels.erase( uit, result.levels.end() );
112  } else {
113 
114  // Test if the provided levels are uniq.
115  result.levels = levels;
116  std::sort( result.levels.begin(), result.levels.end() );
117  auto uit = std::unique( result.levels.begin(), result.levels.end() );
118  if( uit != result.levels.end() ) {
119  throw std::invalid_argument( "Provided levels are not unique." );
120  }
121 
122  // Now set again to the actual level order. Ineffcient, but better we do the check above.
123  result.levels = levels;
124  }
125 
126  // Remove the excluded ones again.
127  for( auto const& e : exclude ) {
128  auto fit = std::find( result.levels.begin(), result.levels.end(), e );
129  if( fit != result.levels.end() ) {
130  result.levels.erase( fit );
131  }
132  }
133 
134  // Fill the values. This is inefficient because of the repeated lookup of levels,
135  // but for now, good enough, because the set of levels is usually small.
136  auto it = first;
137  while( it != last ) {
138  auto fit = std::find( result.levels.begin(), result.levels.end(), *it );
139  if( fit != result.levels.end() ) {
140  result.values.push_back( std::distance( result.levels.begin(), fit ));
141  } else {
142  // Value not found.
143  result.values.push_back( std::numeric_limits<std::size_t>::max() );
144  }
145  ++it;
146  }
147 
148  return result;
149 }
150 
151 template <class ForwardIterator>
153  ForwardIterator first,
154  ForwardIterator last,
155  std::vector<typename ForwardIterator::value_type> const& levels
156 ) {
157  return glm_factor(
158  first,
159  last,
160  levels,
161  std::vector<typename ForwardIterator::value_type>()
162  );
163 }
164 
165 template <class ForwardIterator>
167  ForwardIterator first,
168  ForwardIterator last
169 ) {
170  return glm_factor(
171  first,
172  last,
173  std::vector<typename ForwardIterator::value_type>(),
174  std::vector<typename ForwardIterator::value_type>()
175  );
176 }
177 
192 template<class T>
193 std::vector<size_t> glm_factor_summary( GlmFactor<T> const& factor )
194 {
195  auto result = std::vector<size_t>( factor.levels.size(), 0 );
196  for( auto val : factor.values ) {
197  if( val < factor.levels.size() ) {
198  ++result[val];
199  }
200  }
201  return result;
202 }
203 
213 template<class T>
215  GlmFactor<T> const& factor,
216  T const& reference_level,
217  std::vector<std::string> const& row_names = std::vector<std::string>{}
218 ) {
219  // Error checks.
220  if( factor.levels.empty() ) {
221  throw std::runtime_error( "Cannot create indicator variable from empty factor." );
222  }
223  if( ! row_names.empty() && row_names.size() != factor.values.size() ) {
224  throw std::runtime_error(
225  "Row names for indicator variable do not have the same size as the values of the factor."
226  );
227  }
228 
229  // We need to find the ref level in the factor levels.
230  // This is a bit wasteful, and we could instead use its index as parameter,
231  // but this way, the api is nicer. It might be possible to offer both versions,
232  // but that could get tricky if T is some integer type, because then we get identical signatures.
233  // So, for now, we opt for usability instead of efficiency here.
234  // Level lists are usually small, so that should not be a big issue.
235  auto const rit = std::find( factor.levels.begin(), factor.levels.end(), reference_level );
236  if( rit == factor.levels.end() ) {
237  throw std::runtime_error(
238  "Cannot create indicator variable. "
239  "Provided reference level is not part of the factor levels."
240  );
241  }
242  auto const ref_idx = std::distance( factor.levels.begin(), rit );
243  assert( ref_idx >= 0 );
244 
245  // Prepare result, add all needed rows.
246  Dataframe result;
247  if( row_names.empty() ) {
248  for( size_t i = 0; i < factor.values.size(); ++i ) {
249  result.add_unnamed_row();
250  }
251  } else {
252  assert( row_names.size() == factor.values.size() );
253  for( size_t i = 0; i < factor.values.size(); ++i ) {
254  result.add_row( row_names[i] );
255  }
256  }
257  assert( result.rows() == factor.values.size() );
258 
259  // Make indicator variables for each but the reference level.
260  for( size_t lvl_idx = 0; lvl_idx < factor.levels.size(); ++lvl_idx ) {
261  if( lvl_idx == static_cast<size_t>( ref_idx )) {
262  continue;
263  }
264 
265  // Make column name by concatenating the ref level and current level.
266  auto make_name = []( T const& level ){
267  std::stringstream ss;
268  ss << level;
269  auto ret = ss.str();
270  if( ret.empty() ) {
271  ret = "[empty]";
272  }
273  return ret;
274  };
275  auto col_name = make_name( factor.levels[ ref_idx ] ) + "." + make_name( factor.levels[ lvl_idx ] );
276  auto& col = result.add_col<double>( col_name );
277 
278  // Add the column content.
279  for( size_t val_idx = 0; val_idx < factor.values.size(); ++val_idx ) {
280  if( factor.values[val_idx] >= factor.levels.size() ) {
281  col[val_idx] = std::numeric_limits<double>::quiet_NaN();
282  } else if( factor.values[val_idx] == lvl_idx ) {
283  col[val_idx] = 1.0;
284  } else {
285  col[val_idx] = 0.0;
286  }
287  }
288  }
289  assert( factor.levels.size() > 0 );
290  assert( result.cols() == factor.levels.size() - 1 );
291 
292  return result;
293 }
294 
301 template<class T>
303  GlmFactor<T> const& factor,
304  std::vector<std::string> const& row_names = std::vector<std::string>{}
305 ) {
306  if( factor.levels.empty() ) {
307  throw std::runtime_error( "Cannot create indicator variable from empty factor." );
308  }
309 
310  // Find the most common value.
311  auto const smry = glm_factor_summary( factor );
312  auto const maxit = std::max_element( smry.begin(), smry.end() );
313  size_t const max_level = std::distance( smry.begin(), maxit );
314  assert( max_level < factor.levels.size() );
315 
316  return glm_indicator_variables( factor, factor.levels[max_level], row_names );
317 }
318 
319 } // namespace utils
320 } // namespace genesis
321 
322 #endif // include guard
std::vector< size_t > values
List of factor indices for the original values.
Definition: factor.hpp:67
Column< T > & add_col(std::string const &name)
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
std::vector< size_t > glm_factor_summary(GlmFactor< T > const &factor)
Get the number of occurrences of each level in a GlmFactor.
Definition: factor.hpp:193
std::vector< T > levels
Set of unique values of the factor. The indices in this vector are the indices that are used in value...
Definition: factor.hpp:57
self_type & add_row(std::string const &name)
GlmFactor< typename ForwardIterator::value_type > glm_factor(ForwardIterator first, ForwardIterator last, std::vector< typename ForwardIterator::value_type > const &levels, std::vector< typename ForwardIterator::value_type > const &exclude)
Reduce a list of values in the given range to a set of unique factors.
Definition: factor.hpp:88
Dataframe glm_indicator_variables(GlmFactor< T > const &factor, T const &reference_level, std::vector< std::string > const &row_names=std::vector< std::string >{})
Turn a GlmFactor into a set of (dummy) indicator variables to be used in regression.
Definition: factor.hpp:214