A library for working with phylogenetic and population genetic data.
v0.32.0
utils/math/regression/helper.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2019 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
31 /*
32  =================================================================
33  License
34  =================================================================
35 
36  The implementation is based on the snp.matrix and X.snp.matrix classes by
37  David Clayton <david.clayton@cimr.cam.ac.uk> and Hin-Tak Leung <htl10@users.sourceforge.net>.
38  The source is in C, but is originally intended for usage in R.
39  This file in particular is based on snpStats_1.32.0/src/mla.c
40  We massively refactored the original code, for example by using vectors and matrices instead
41  of pointers, and by using proper data structures instead of lists of in/out function parameters.
42 
43  The original code is published under the GNU General Public Licence version 3 (GPLv3).
44  We use the same license, hence see above for details.
45 
46  The snp.matrix and X.snp.matrix classes.
47  Copyright (C) 2008 David Clayton and Hin-Tak Leung
48 
49  The package does not seem to be maintained any more, and does not seem to
50  have a proper repository. For more information, try these sites:
51  https://bioconductor.org/packages/release/bioc/html/snpStats.html
52  https://www.rdocumentation.org/packages/snpStats/
53  http://www-gene.cimr.cam.ac.uk/clayton/software/
54 */
55 
57 
58 #include <cassert>
59 #include <cmath>
60 #include <cstdlib>
61 #include <limits>
62 #include <stdexcept>
63 
64 namespace genesis {
65 namespace utils {
66 
67 // =============================================================================
68 // Linear Algebra Helper Functions
69 // =============================================================================
70 
72  std::vector<double> const& y_input,
73  std::vector<double> const& weights,
74  std::vector<size_t> const& strata,
75  bool with_intercept,
76  bool centering,
77  std::vector<double>& y_output
78 ) {
79  // Prepare return value. Has reasonable defaults already.
80  GlmFreedom result;
81 
82  // Prepare result vector.
83  if( &y_output != &y_input ) {
84  y_output.resize( y_input.size() );
85  }
86  assert( y_output.size() == y_input.size() );
87 
88  // Check
89  if( ! weights.empty() && weights.size() != y_input.size() ) {
90  throw std::invalid_argument(
91  "weighted_residuals: y and weights need to have same length."
92  );
93  }
94 
95  if( strata.empty() ) {
96  if( ! with_intercept ) {
97  // Nothing to do ... if necessary copy input to output
98  if( &y_output != &y_input ) {
99  if( centering ) {
100  y_output = y_input;
101  } else {
102  y_output = std::vector<double>( y_input.size(), 0.0 );
103  }
104  }
105  return result;
106  }
107 
108  // Calcualte the (weighted) mean of the y values.
109  double swt = 0.0;
110  double swy = 0.0;
111  if( weights.empty() ) {
112  size_t swt_cnt = 0;
113  for( size_t i = 0; i < y_input.size(); ++i ) {
114  if( std::isfinite( y_input[i] )) {
115  swy += y_input[i];
116  ++swt_cnt;
117  }
118  }
119  swt = static_cast<double>( swt_cnt );
120  } else {
121  for( size_t i = 0; i < y_input.size(); ++i ) {
122  if( weights[i] < 0.0 ) {
123  throw std::runtime_error(
124  "weighted_mean_centering: weights have to be non-negative."
125  );
126  }
127  if( std::isfinite( weights[i] ) && std::isfinite( y_input[i] )) {
128  swy += weights[i] * y_input[i];
129  swt += weights[i];
130  }
131  }
132  }
133  assert( std::isfinite( swy ) );
134  assert( std::isfinite( swt ) );
135  assert( swt >= 0.0 );
136 
137  // Calculate the centring (or set to mean).
138  // Non-finite y values will simply stay non finite here - no need for extra checks.
139  if( swt > 0.0 ) {
140  swy /= swt;
141  assert( std::isfinite( swy ) );
142 
143  for( size_t i = 0; i < y_input.size(); ++i ) {
144  y_output[i] = ( centering ? y_input[i] - swy : swy );
145  }
146  } else {
147  result.empty_strata = 1;
148  }
149 
150  } else {
151  if( strata.size() != y_input.size() ) {
152  throw std::invalid_argument(
153  "weighted_centering: y and strata need to have same length."
154  );
155  }
156 
157  auto swy = std::vector<double>( strata.size(), 0.0 );
158  auto swt = std::vector<double>( strata.size(), 0.0 );
159 
160  // Error checking, and finding max stratum.
161  for( size_t s = 0; s < strata.size(); ++s ) {
162  if( strata[s] < 1 || strata[s] > strata.size() ) {
163  throw std::invalid_argument(
164  "weighted_residuals: invalid stratum value outside of [1,N] found."
165  );
166  }
167  if( strata[s] > result.max_stratum ) {
168  result.max_stratum = strata[s];
169  }
170  }
171 
172  // Calcualte the (weighted) mean of the y values per stratum.
173  if( weights.empty() ) {
174  for( size_t i = 0; i < y_input.size(); ++i ) {
175  if( std::isfinite( y_input[i] )) {
176  size_t const s = strata[i] - 1;
177  swy[s] += y_input[i];
178  swt[s] += 1.0;
179  }
180  }
181  } else {
182  for( size_t i = 0; i < y_input.size(); ++i ) {
183  if( weights[i] < 0.0 ) {
184  throw std::runtime_error(
185  "weighted_mean_centering: weights have to be non-negative."
186  );
187  }
188  if( std::isfinite( weights[i] ) && std::isfinite( y_input[i] )) {
189  size_t const s = strata[i] - 1;
190  swy[s] += weights[i] * y_input[i];
191  swt[s] += weights[i];
192  }
193  }
194  }
195  for( size_t s = 0; s < strata.size(); ++s ) {
196  assert( std::isfinite( swy[s] ) );
197  assert( std::isfinite( swt[s] ) );
198  assert( swt[s] >= 0.0 );
199 
200  if( swt[s] > 0.0 ) {
201  swy[s] /= swt[s];
202  } else {
203  ++result.empty_strata;
204  }
205  assert( std::isfinite( swy[s] ) );
206  }
207 
208  // Calculate the centring (or set to mean).
209  // Again, non-finite y values will simply stay non finite here - no need for extra checks.
210  for( size_t i = 0; i < y_input.size(); ++i ) {
211  size_t const s = strata[i] - 1;
212  if( swt[s] > 0.0 ) {
213  y_output[i] = ( centering ? y_input[i] - swy[s] : swy[s] );
214  }
215  }
216  }
217 
218  return result;
219 }
220 
222  std::vector<double> const& x_input,
223  std::vector<double> const& y_input,
224  std::vector<double> const& weights,
225  std::vector<double>& y_output
226 ) {
227  if( x_input.size() != y_input.size() ) {
228  throw std::invalid_argument(
229  "weighted_residuals: x and y need to have same length."
230  );
231  }
232 
233  double swxx = 0.0;
234  double swxy = 0.0;
235 
236  if( weights.empty() ) {
237  for( size_t i = 0; i < x_input.size(); ++i ) {
238  if( std::isfinite( x_input[i] ) && std::isfinite( y_input[i] )) {
239  swxy += x_input[i] * y_input[i];
240  swxx += x_input[i] * x_input[i];
241  }
242  }
243  } else {
244  if( weights.size() != x_input.size() ) {
245  throw std::invalid_argument(
246  "weighted_residuals: x and weights need to have same length."
247  );
248  }
249 
250  for( size_t i = 0; i < x_input.size(); ++i ) {
251  if( weights[i] < 0.0 ) {
252  throw std::runtime_error(
253  "weighted_residuals: weights have to be non-negative."
254  );
255  }
256  if( std::isfinite( x_input[i] )
257  && std::isfinite( y_input[i] )
258  && std::isfinite( weights[i] )
259  ) {
260  double const wx = weights[i] * x_input[i];
261  swxy += wx * y_input[i];
262  swxx += wx * x_input[i];
263  }
264  }
265  }
266  assert( std::isfinite( swxx ));
267  assert( std::isfinite( swxy ));
268 
269  if( &y_output != &y_input ) {
270  y_output.resize( y_input.size() );
271  }
272  assert( y_output.size() == y_input.size() );
273  if( swxx > 0.0 ) {
274  swxy /= swxx;
275  for( size_t i = 0; i < x_input.size(); ++i ) {
276  y_output[i] = y_input[i] - swxy * x_input[i];
277  }
278 
279  return swxy;
280  } else {
281  if( &y_output != &y_input ) {
282  for( size_t i = 0; i < x_input.size(); ++i ) {
283  y_output[i] = y_input[i];
284  }
285  }
286  return std::numeric_limits<double>::quiet_NaN();
287  }
288 }
289 
291  std::vector<double> const& x_input,
292  std::vector<double> const& weights
293 ) {
294  double res = 0.0;
295  if( weights.empty() ) {
296  for( size_t i = 0; i < x_input.size(); ++i ) {
297  if( std::isfinite( x_input[i] )) {
298  res += x_input[i] * x_input[i];
299  }
300  }
301  } else {
302  if( weights.size() != x_input.size() ) {
303  throw std::invalid_argument(
304  "weighted_sum: x and weights need to have same length."
305  );
306  }
307  for( size_t i = 0; i < x_input.size(); ++i ) {
308  if( weights[i] < 0.0 ) {
309  throw std::runtime_error(
310  "weighted_sum_of_squares: weights have to be non-negative."
311  );
312  }
313  if( std::isfinite( x_input[i] ) && std::isfinite( weights[i] )) {
314  res += weights[i] * x_input[i] * x_input[i];
315  }
316  }
317  }
318  assert( std::isfinite( res ));
319  return res;
320 }
321 
323  std::vector<double> const& x_input,
324  std::vector<double> const& y_input,
325  std::vector<double> const& weights
326 ) {
327  if( x_input.size() != y_input.size() ) {
328  throw std::invalid_argument(
329  "weighted_inner_product: x and y need to have same length."
330  );
331  }
332 
333  double res = 0.0;
334  if( weights.empty() ) {
335  for( size_t i = 0; i < y_input.size(); ++i ) {
336  if( std::isfinite( y_input[i] ) && std::isfinite( x_input[i] )) {
337  res += y_input[i] * x_input[i];
338  }
339  }
340  } else {
341  if( weights.size() != y_input.size() ) {
342  throw std::invalid_argument(
343  "weighted_inner_product: x and weights need to have same length."
344  );
345  }
346  for( size_t i = 0; i < y_input.size(); ++i ) {
347  if( weights[i] < 0.0 ) {
348  throw std::runtime_error(
349  "weighted_inner_product: weights have to be non-negative."
350  );
351  }
352  if( std::isfinite( weights[i] )
353  && std::isfinite( y_input[i] )
354  && std::isfinite( x_input[i] )
355  ) {
356  res += weights[i] * y_input[i] * x_input[i];
357  }
358  }
359  }
360  assert( std::isfinite( res ));
361  return res;
362 }
363 
365  std::vector<double> const& x_input,
366  std::vector<double> const& weights
367 ) {
368  double res = 0.0;
369  if( weights.empty() ) {
370  for( size_t i = 0; i < x_input.size(); ++i ) {
371  if( std::isfinite( x_input[i] )) {
372  res += x_input[i];
373  }
374  }
375  } else {
376  if( weights.size() != x_input.size() ) {
377  throw std::invalid_argument(
378  "weighted_sum: x and weights need to have same length."
379  );
380  }
381  for( size_t i = 0; i < x_input.size(); ++i ) {
382  if( weights[i] < 0.0 ) {
383  throw std::runtime_error(
384  "weighted_sum: weights have to be non-negative."
385  );
386  }
387  if( std::isfinite( weights[i] ) && std::isfinite( x_input[i] )) {
388  res += weights[i] * x_input[i];
389  }
390  }
391  }
392  assert( std::isfinite( res ));
393  return res;
394 }
395 
396 } // namespace utils
397 } // namespace genesis
genesis::utils::weighted_inner_product
double weighted_inner_product(std::vector< double > const &x_input, std::vector< double > const &y_input, std::vector< double > const &weights)
(Weighted) inner product of two vectors.
Definition: utils/math/regression/helper.cpp:322
genesis::utils::weighted_mean_centering
GlmFreedom weighted_mean_centering(std::vector< double > const &y_input, std::vector< double > const &weights, std::vector< size_t > const &strata, bool with_intercept, bool centering, std::vector< double > &y_output)
(Weighted) mean and centering.
Definition: utils/math/regression/helper.cpp:71
genesis::utils::weighted_residuals
double weighted_residuals(std::vector< double > const &x_input, std::vector< double > const &y_input, std::vector< double > const &weights, std::vector< double > &y_output)
Calculate the residuals from (weighted) regression through the origin.
Definition: utils/math/regression/helper.cpp:221
genesis::utils::GlmFreedom
Internal helper structure for GLMs to calcualte the residual degrees of freedom.
Definition: utils/math/regression/helper.hpp:47
helper.hpp
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::utils::GlmFreedom::max_stratum
size_t max_stratum
Maximum stratum found (S).
Definition: utils/math/regression/helper.hpp:62
genesis::utils::weighted_sum_of_squares
double weighted_sum_of_squares(std::vector< double > const &x_input, std::vector< double > const &weights)
(Weighted) sum of squares.
Definition: utils/math/regression/helper.cpp:290
genesis::utils::GlmFreedom::empty_strata
size_t empty_strata
Number of empty strata.
Definition: utils/math/regression/helper.hpp:57
genesis::utils::weighted_sum
double weighted_sum(std::vector< double > const &x_input, std::vector< double > const &weights)
(Weighted) sum of a vector of values.
Definition: utils/math/regression/helper.cpp:364