A toolkit for working with phylogenetic data.
v0.24.0
utils/math/regression/helper.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2019 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
31 /*
32  =================================================================
33  License
34  =================================================================
35 
36  The implementation is based on the snp.matrix and X.snp.matrix classes by
37  David Clayton <david.clayton@cimr.cam.ac.uk> and Hin-Tak Leung <htl10@users.sourceforge.net>.
38  The source is in C, but is originally intended for usage in R.
39  This file in particular is based on snpStats_1.32.0/src/mla.c
40  We massively refactored the original code, for example by using vectors and matrices instead
41  of pointers, and by using proper data structures instead of lists of in/out function parameters.
42 
43  The original code is published under the GNU General Public Licence version 3 (GPLv3).
44  We use the same license, hence see above for details.
45 
46  The snp.matrix and X.snp.matrix classes.
47  Copyright (C) 2008 David Clayton and Hin-Tak Leung
48 
49  The package does not seem to be maintained any more, and does not seem to
50  have a proper repository. For more information, try these sites:
51  https://bioconductor.org/packages/release/bioc/html/snpStats.html
52  https://www.rdocumentation.org/packages/snpStats/
53  http://www-gene.cimr.cam.ac.uk/clayton/software/
54 */
55 
57 
58 #include <cassert>
59 #include <cmath>
60 #include <cstdlib>
61 #include <limits>
62 #include <stdexcept>
63 
64 namespace genesis {
65 namespace utils {
66 
67 // =============================================================================
68 // Linear Algebra Helper Functions
69 // =============================================================================
70 
72  std::vector<double> const& y_input,
73  std::vector<double> const& weights,
74  std::vector<size_t> const& strata,
75  bool with_intercept,
76  bool centering,
77  std::vector<double>& y_output
78 ) {
79  // Prepare return value. Has reasonable defaults already.
80  GlmFreedom result;
81 
82  // Prepare result vector.
83  if( &y_output != &y_input ) {
84  y_output.resize( y_input.size() );
85  }
86  assert( y_output.size() == y_input.size() );
87 
88  // Check
89  if( ! weights.empty() && weights.size() != y_input.size() ) {
90  throw std::invalid_argument(
91  "weighted_residuals: y and weights need to have same length."
92  );
93  }
94 
95  if( strata.empty() ) {
96  if( ! with_intercept ) {
97  // Nothing to do ... if necessary copy input to output
98  if( &y_output != &y_input ) {
99  if( centering ) {
100  y_output = y_input;
101  } else {
102  y_output = std::vector<double>( y_input.size(), 0.0 );
103  }
104  }
105  return result;
106  }
107 
108  // Calcualte the (weighted) mean of the y values.
109  double swt = 0.0;
110  double swy = 0.0;
111  if( weights.empty() ) {
112  size_t swt_cnt = 0;
113  for( size_t i = 0; i < y_input.size(); ++i ) {
114  if( std::isfinite( y_input[i] )) {
115  swy += y_input[i];
116  ++swt_cnt;
117  }
118  }
119  swt = static_cast<double>( swt_cnt );
120  } else {
121  for( size_t i = 0; i < y_input.size(); ++i ) {
122  if( weights[i] < 0.0 ) {
123  throw std::runtime_error(
124  "weighted_mean_centering: weights have to be non-negative."
125  );
126  }
127  if( std::isfinite( weights[i] ) && std::isfinite( y_input[i] )) {
128  swy += weights[i] * y_input[i];
129  swt += weights[i];
130  }
131  }
132  }
133  assert( std::isfinite( swy ) );
134  assert( std::isfinite( swt ) );
135  assert( swt >= 0.0 );
136 
137  // Calculate the centring (or set to mean).
138  // Non-finite y values will simply stay non finite here - no need for extra checks.
139  if( swt > 0.0 ) {
140  swy /= swt;
141  assert( std::isfinite( swy ) );
142 
143  for( size_t i = 0; i < y_input.size(); ++i ) {
144  y_output[i] = ( centering ? y_input[i] - swy : swy );
145  }
146  } else {
147  result.empty_strata = 1;
148  }
149 
150  } else {
151  if( strata.size() != y_input.size() ) {
152  throw std::invalid_argument(
153  "weighted_centering: y and strata need to have same length."
154  );
155  }
156 
157  auto swy = std::vector<double>( strata.size(), 0.0 );
158  auto swt = std::vector<double>( strata.size(), 0.0 );
159 
160  // Error checking, and finding max stratum.
161  for( size_t s = 0; s < strata.size(); ++s ) {
162  if( strata[s] < 1 || strata[s] > strata.size() ) {
163  throw std::invalid_argument(
164  "weighted_residuals: invalid stratum value outside of [1,N] found."
165  );
166  }
167  if( strata[s] > result.max_stratum ) {
168  result.max_stratum = strata[s];
169  }
170  }
171 
172  // Calcualte the (weighted) mean of the y values per stratum.
173  if( weights.empty() ) {
174  for( size_t i = 0; i < y_input.size(); ++i ) {
175  if( std::isfinite( y_input[i] )) {
176  size_t const s = strata[i] - 1;
177  swy[s] += y_input[i];
178  swt[s] += 1.0;
179  }
180  }
181  } else {
182  for( size_t i = 0; i < y_input.size(); ++i ) {
183  if( weights[i] < 0.0 ) {
184  throw std::runtime_error(
185  "weighted_mean_centering: weights have to be non-negative."
186  );
187  }
188  if( std::isfinite( weights[i] ) && std::isfinite( y_input[i] )) {
189  size_t const s = strata[i] - 1;
190  swy[s] += weights[i] * y_input[i];
191  swt[s] += weights[i];
192  }
193  }
194  }
195  for( size_t s = 0; s < strata.size(); ++s ) {
196  assert( std::isfinite( swy[s] ) );
197  assert( std::isfinite( swt[s] ) );
198  assert( swt[s] >= 0.0 );
199 
200  if( swt[s] > 0.0 ) {
201  swy[s] /= swt[s];
202  } else {
203  ++result.empty_strata;
204  }
205  assert( std::isfinite( swy[s] ) );
206  }
207 
208  // Calculate the centring (or set to mean).
209  // Again, non-finite y values will simply stay non finite here - no need for extra checks.
210  for( size_t i = 0; i < y_input.size(); ++i ) {
211  size_t const s = strata[i] - 1;
212  if( swt[s] > 0.0 ) {
213  y_output[i] = ( centering ? y_input[i] - swy[s] : swy[s] );
214  }
215  }
216  }
217 
218  return result;
219 }
220 
222  std::vector<double> const& x_input,
223  std::vector<double> const& y_input,
224  std::vector<double> const& weights,
225  std::vector<double>& y_output
226 ) {
227  if( x_input.size() != y_input.size() ) {
228  throw std::invalid_argument(
229  "weighted_residuals: x and y need to have same length."
230  );
231  }
232 
233  double swxx = 0.0;
234  double swxy = 0.0;
235 
236  if( weights.empty() ) {
237  for( size_t i = 0; i < x_input.size(); ++i ) {
238  if( std::isfinite( x_input[i] ) && std::isfinite( y_input[i] )) {
239  swxy += x_input[i] * y_input[i];
240  swxx += x_input[i] * x_input[i];
241  }
242  }
243  } else {
244  if( weights.size() != x_input.size() ) {
245  throw std::invalid_argument(
246  "weighted_residuals: x and weights need to have same length."
247  );
248  }
249 
250  for( size_t i = 0; i < x_input.size(); ++i ) {
251  if( weights[i] < 0.0 ) {
252  throw std::runtime_error(
253  "weighted_residuals: weights have to be non-negative."
254  );
255  }
256  if( std::isfinite( x_input[i] )
257  && std::isfinite( y_input[i] )
258  && std::isfinite( weights[i] )
259  ) {
260  double const wx = weights[i] * x_input[i];
261  swxy += wx * y_input[i];
262  swxx += wx * x_input[i];
263  }
264  }
265  }
266  assert( std::isfinite( swxx ));
267  assert( std::isfinite( swxy ));
268 
269  if( &y_output != &y_input ) {
270  y_output.resize( y_input.size() );
271  }
272  assert( y_output.size() == y_input.size() );
273  if( swxx > 0.0 ) {
274  swxy /= swxx;
275  for( size_t i = 0; i < x_input.size(); ++i ) {
276  y_output[i] = y_input[i] - swxy * x_input[i];
277  }
278 
279  return swxy;
280  } else {
281  if( &y_output != &y_input ) {
282  for( size_t i = 0; i < x_input.size(); ++i ) {
283  y_output[i] = y_input[i];
284  }
285  }
286  return std::numeric_limits<double>::quiet_NaN();
287  }
288 }
289 
291  std::vector<double> const& x_input,
292  std::vector<double> const& weights
293 ) {
294  double res = 0.0;
295  if( weights.empty() ) {
296  for( size_t i = 0; i < x_input.size(); ++i ) {
297  if( std::isfinite( x_input[i] )) {
298  res += x_input[i] * x_input[i];
299  }
300  }
301  } else {
302  if( weights.size() != x_input.size() ) {
303  throw std::invalid_argument(
304  "weighted_sum: x and weights need to have same length."
305  );
306  }
307  for( size_t i = 0; i < x_input.size(); ++i ) {
308  if( weights[i] < 0.0 ) {
309  throw std::runtime_error(
310  "weighted_sum_of_squares: weights have to be non-negative."
311  );
312  }
313  if( std::isfinite( x_input[i] ) && std::isfinite( weights[i] )) {
314  res += weights[i] * x_input[i] * x_input[i];
315  }
316  }
317  }
318  assert( std::isfinite( res ));
319  return res;
320 }
321 
323  std::vector<double> const& x_input,
324  std::vector<double> const& y_input,
325  std::vector<double> const& weights
326 ) {
327  if( x_input.size() != y_input.size() ) {
328  throw std::invalid_argument(
329  "weighted_inner_product: x and y need to have same length."
330  );
331  }
332 
333  double res = 0.0;
334  if( weights.empty() ) {
335  for( size_t i = 0; i < y_input.size(); ++i ) {
336  if( std::isfinite( y_input[i] ) && std::isfinite( x_input[i] )) {
337  res += y_input[i] * x_input[i];
338  }
339  }
340  } else {
341  if( weights.size() != y_input.size() ) {
342  throw std::invalid_argument(
343  "weighted_inner_product: x and weights need to have same length."
344  );
345  }
346  for( size_t i = 0; i < y_input.size(); ++i ) {
347  if( weights[i] < 0.0 ) {
348  throw std::runtime_error(
349  "weighted_inner_product: weights have to be non-negative."
350  );
351  }
352  if( std::isfinite( weights[i] )
353  && std::isfinite( y_input[i] )
354  && std::isfinite( x_input[i] )
355  ) {
356  res += weights[i] * y_input[i] * x_input[i];
357  }
358  }
359  }
360  assert( std::isfinite( res ));
361  return res;
362 }
363 
365  std::vector<double> const& x_input,
366  std::vector<double> const& weights
367 ) {
368  double res = 0.0;
369  if( weights.empty() ) {
370  for( size_t i = 0; i < x_input.size(); ++i ) {
371  if( std::isfinite( x_input[i] )) {
372  res += x_input[i];
373  }
374  }
375  } else {
376  if( weights.size() != x_input.size() ) {
377  throw std::invalid_argument(
378  "weighted_sum: x and weights need to have same length."
379  );
380  }
381  for( size_t i = 0; i < x_input.size(); ++i ) {
382  if( weights[i] < 0.0 ) {
383  throw std::runtime_error(
384  "weighted_sum: weights have to be non-negative."
385  );
386  }
387  if( std::isfinite( weights[i] ) && std::isfinite( x_input[i] )) {
388  res += weights[i] * x_input[i];
389  }
390  }
391  }
392  assert( std::isfinite( res ));
393  return res;
394 }
395 
396 } // namespace utils
397 } // namespace genesis
double weighted_sum_of_squares(std::vector< double > const &x_input, std::vector< double > const &weights)
(Weighted) sum of squares.
size_t max_stratum
Maximum stratum found (S).
double weighted_residuals(std::vector< double > const &x_input, std::vector< double > const &y_input, std::vector< double > const &weights, std::vector< double > &y_output)
Calculate the residuals from (weighted) regression through the origin.
Internal helper structure for GLMs to calcualte the residual degrees of freedom.
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
size_t empty_strata
Number of empty strata.
GlmFreedom weighted_mean_centering(std::vector< double > const &y_input, std::vector< double > const &weights, std::vector< size_t > const &strata, bool with_intercept, bool centering, std::vector< double > &y_output)
(Weighted) mean and centering.
double weighted_sum(std::vector< double > const &x_input, std::vector< double > const &weights)
(Weighted) sum of a vector of values.
double weighted_inner_product(std::vector< double > const &x_input, std::vector< double > const &y_input, std::vector< double > const &weights)
(Weighted) inner product of two vectors.