Mantid
Loading...
Searching...
No Matches
Statistics.cpp
Go to the documentation of this file.
1// Mantid Repository : https://github.com/mantidproject/mantid
2//
3// Copyright © 2018 ISIS Rutherford Appleton Laboratory UKRI,
4// NScD Oak Ridge National Laboratory, European Spallation Source,
5// Institut Laue - Langevin & CSNS, Institute of High Energy Physics, CAS
6// SPDX - License - Identifier: GPL - 3.0 +
7// Includes
10
11#include <boost/accumulators/accumulators.hpp>
12#include <boost/accumulators/statistics/max.hpp>
13#include <boost/accumulators/statistics/min.hpp>
14#include <boost/accumulators/statistics/stats.hpp>
15#include <boost/accumulators/statistics/variance.hpp>
16
17#include <algorithm>
18#include <cfloat>
19#include <cmath>
20#include <limits>
21#include <sstream>
22
23namespace Mantid::Kernel {
24namespace {
25Logger logger("Statistics");
26
27void assertMomentIsValid(const int maxMoment) {
28 if (maxMoment < 0) {
29 std::stringstream msg;
30 msg << "moment = " << maxMoment << " is statistically meaningless";
31 throw std::runtime_error(msg.str());
32 }
33}
34} // namespace
35
36using std::string;
37using std::vector;
38
40 constexpr double nan = std::numeric_limits<double>::quiet_NaN();
41 minimum = nan;
42 maximum = nan;
43 mean = nan;
44 median = nan;
46}
47
52template <typename TYPE> double getMedian(const vector<TYPE> &data) {
53 const size_t size = data.size();
54 if (size == 1)
55 return static_cast<double>(data[0]);
56
57 const bool isSorted = std::is_sorted(data.cbegin(), data.cend());
58 const bool is_even = (size % 2 == 0);
59 if (isSorted) {
60 if (is_even)
61 return (static_cast<double>(data[size / 2 - 1]) + static_cast<double>(data[size / 2])) / 2;
62 else
63 return static_cast<double>(data[size / 2]);
64 } else {
65 std::vector<TYPE> tmpSortedData(data);
66 std::sort(tmpSortedData.begin(), tmpSortedData.end());
67 if (is_even)
68 return (static_cast<double>(tmpSortedData[size / 2 - 1]) + static_cast<double>(tmpSortedData[size / 2])) / 2;
69 else
70 return static_cast<double>(tmpSortedData[size / 2]);
71 }
72}
73
78template <typename TYPE> std::vector<double> getZscore(const vector<TYPE> &data) {
79 std::vector<double> Zscore;
80 if (data.size() < 3) {
81 Zscore.resize(data.size(), 0.);
82 return Zscore;
83 }
84 Statistics stats = getStatistics(data);
85 if (stats.standard_deviation == 0.) {
86 Zscore.resize(data.size(), 0.);
87 return Zscore;
88 }
89 for (auto it = data.cbegin(); it != data.cend(); ++it) {
90 auto tmp = static_cast<double>(*it);
91 // unclear why Zscore is non-negative, was first implemented in #5316
92 Zscore.emplace_back(fabs((stats.mean - tmp) / stats.standard_deviation));
93 }
94 return Zscore;
95}
100template <typename TYPE> std::vector<double> getWeightedZscore(const vector<TYPE> &data, const vector<TYPE> &weights) {
101 std::vector<double> Zscore;
102 if (data.size() < 3) {
103 Zscore.resize(data.size(), 0.);
104 return Zscore;
105 }
106 Statistics stats = getStatistics(data);
107 if (stats.standard_deviation == 0.) {
108 Zscore.resize(data.size(), 0.);
109 return Zscore;
110 }
111 double sumWeights = 0.0;
112 double sumWeightedData = 0.0;
113 double weightedVariance = 0.0;
114 for (size_t it = 0; it != data.size(); ++it) {
115 sumWeights += static_cast<double>(weights[it]);
116 sumWeightedData += static_cast<double>(weights[it] * data[it]);
117 }
118 double weightedMean = sumWeightedData / sumWeights;
119 for (size_t it = 0; it != data.size(); ++it) {
120 weightedVariance += std::pow(static_cast<double>(data[it]) - weightedMean, 2) *
121 std::pow(static_cast<double>(weights[it]) / sumWeights, 2);
122 }
123 for (auto it = data.cbegin(); it != data.cend(); ++it) {
124 Zscore.emplace_back(fabs((static_cast<double>(*it) - weightedMean) / std::sqrt(weightedVariance)));
125 }
126 return Zscore;
127}
133template <typename TYPE> std::vector<double> getModifiedZscore(const vector<TYPE> &data) {
134 if (data.size() < 3) {
135 std::vector<double> Zscore(data.size(), 0.);
136 return Zscore;
137 }
138 std::vector<double> MADvec;
139 double tmp;
140 double median = getMedian(data);
141 for (auto it = data.cbegin(); it != data.cend(); ++it) {
142 tmp = static_cast<double>(*it);
143 MADvec.emplace_back(fabs(tmp - median));
144 }
145 double MAD = getMedian(MADvec);
146 if (MAD == 0.) {
147 std::vector<double> Zscore(data.size(), 0.);
148 return Zscore;
149 }
150 MADvec.clear();
151 std::vector<double> Zscore;
152 for (auto it = data.begin(); it != data.end(); ++it) {
153 tmp = static_cast<double>(*it);
154 Zscore.emplace_back(0.6745 * fabs((tmp - median) / MAD));
155 }
156 return Zscore;
157}
158
165template <typename TYPE> Statistics getStatistics(const vector<TYPE> &data, const unsigned int flags) {
166 Statistics statistics;
167 if (data.empty()) { // don't do anything
168 return statistics;
169 }
170 // calculate the mean if this or the stddev is requested
171 const bool stddev = ((flags & StatOptions::UncorrectedStdDev) || (flags & StatOptions::CorrectedStdDev));
172 if (stddev) {
173 using namespace boost::accumulators;
174 accumulator_set<double, stats<tag::min, tag::max, tag::variance>> acc;
175 for (auto &value : data) {
176 acc(static_cast<double>(value));
177 }
178 statistics.minimum = min(acc);
179 statistics.maximum = max(acc);
180 statistics.mean = mean(acc);
181 double var = variance(acc);
182
183 if (flags & StatOptions::CorrectedStdDev) {
184 auto ndofs = static_cast<double>(data.size());
185 var *= ndofs / (ndofs - 1.0);
186 }
187 statistics.standard_deviation = std::sqrt(var);
188
189 } else if (flags & StatOptions::Mean) {
190 using namespace boost::accumulators;
191 accumulator_set<double, stats<tag::mean>> acc;
192 for (auto &value : data) {
193 acc(static_cast<double>(value));
194 }
195 statistics.mean = mean(acc);
196 }
197
198 // calculate the median if requested
199 if (flags & StatOptions::Median) {
200 statistics.median = getMedian(data);
201 }
202
203 return statistics;
204}
205
207template <> DLLExport Statistics getStatistics<string>(const vector<string> &data, const unsigned int flags) {
208 UNUSED_ARG(flags);
209 UNUSED_ARG(data);
210 return Statistics(); // default is all nan
211}
212
214template <> DLLExport Statistics getStatistics<bool>(const vector<bool> &data, const unsigned int flags) {
215 UNUSED_ARG(flags);
216 UNUSED_ARG(data);
217 return Statistics(); // default is all nan
218}
219
227Rfactor getRFactor(const std::vector<double> &obsI, const std::vector<double> &calI, const std::vector<double> &obsE) {
228 // 1. Check
229 if (obsI.size() != calI.size() || obsI.size() != obsE.size()) {
230 std::stringstream errss;
231 errss << "GetRFactor() Input Error! Observed Intensity (" << obsI.size() << "), Calculated Intensity ("
232 << calI.size() << ") and Observed Error (" << obsE.size() << ") have different number of elements.";
233 throw std::runtime_error(errss.str());
234 }
235 if (obsI.empty()) {
236 throw std::runtime_error("getRFactor(): the input arrays are empty.");
237 }
238
239 double sumnom = 0;
240 double sumdenom = 0;
241 double sumrpnom = 0;
242 double sumrpdenom = 0;
243
244 size_t numpts = obsI.size();
245 for (size_t i = 0; i < numpts; ++i) {
246 double cal_i = calI[i];
247 double obs_i = obsI[i];
248 double sigma = obsE[i];
249 double weight = 1.0 / (sigma * sigma);
250 double diff = obs_i - cal_i;
251
252 if (weight == weight && weight <= DBL_MAX) {
253 // If weight is not NaN.
254 sumrpnom += fabs(diff);
255 sumrpdenom += fabs(obs_i);
256
257 double tempnom = weight * diff * diff;
258 double tempden = weight * obs_i * obs_i;
259
260 sumnom += tempnom;
261 sumdenom += tempden;
262
263 if (tempnom != tempnom || tempden != tempden) {
264 logger.error() << "***** Error! ****** Data indexed " << i << " is NaN. "
265 << "i = " << i << ": cal = " << calI[i] << ", obs = " << obs_i << ", weight = " << weight
266 << ". \n";
267 }
268 }
269 }
270
271 Rfactor rfactor(0., 0.);
272 rfactor.Rp = (sumrpnom / sumrpdenom);
273 rfactor.Rwp = std::sqrt(sumnom / sumdenom);
274
275 if (rfactor.Rwp != rfactor.Rwp)
276 logger.debug() << "Rwp is NaN. Denominator = " << sumnom << "; Nominator = " << sumdenom << ". \n";
277
278 return rfactor;
279}
280
292template <typename TYPE>
293std::vector<double> getMomentsAboutOrigin(const std::vector<TYPE> &x, const std::vector<TYPE> &y, const int maxMoment) {
294 assertMomentIsValid(maxMoment);
295
296 // densities have the same number of x and y
297 bool isDensity(x.size() == y.size());
298
299 // if it isn't a density then check for histogram
300 if ((!isDensity) && (x.size() != y.size() + 1)) {
301 std::stringstream msg;
302 msg << "length of x (" << x.size() << ") and y (" << y.size() << ")do not match";
303 throw std::out_of_range(msg.str());
304 }
305
306 // initialize a result vector with all zeros
307 std::vector<double> result(std::size_t(maxMoment + 1), 0.);
308
309 // cache the maximum index
310 size_t numPoints = y.size();
311 if (isDensity)
312 numPoints = x.size() - 1;
313
314 // densities are calculated using Newton's method for numerical integration
315 // as backwards as it sounds, the outer loop should be the points rather
316 // than
317 // the moments
318 for (size_t j = 0; j < numPoints; ++j) {
319 // reduce item lookup - and central x for histogram
320 const double xVal = .5 * static_cast<double>(x[j] + x[j + 1]);
321 // this variable will be (x^n)*y
322 auto temp = static_cast<double>(y[j]); // correct for histogram
323 if (isDensity) {
324 const auto xDelta = static_cast<double>(x[j + 1] - x[j]);
325 temp = .5 * (temp + static_cast<double>(y[j + 1])) * xDelta;
326 }
327
328 // accumulate the moments
329 result[0] += temp;
330 for (size_t i = 1; i < result.size(); ++i) {
331 temp *= xVal;
332 result[i] += temp;
333 }
334 }
335
336 return result;
337}
338
350template <typename TYPE>
351std::vector<double> getMomentsAboutMean(const std::vector<TYPE> &x, const std::vector<TYPE> &y, const int maxMoment) {
352 assertMomentIsValid(maxMoment);
353
354 // get the zeroth (integrated value) and first moment (mean)
355 std::vector<double> momentsAboutOrigin = getMomentsAboutOrigin(x, y, 1);
356 const double mean = momentsAboutOrigin[1];
357
358 // initialize a result vector with all zeros
359 std::vector<double> result(std::size_t(maxMoment + 1), 0.);
360 result[0] = momentsAboutOrigin[0];
361
362 // escape early if we need to
363 if (maxMoment == 0)
364 return result;
365
366 // densities have the same number of x and y
367 bool isDensity(x.size() == y.size());
368
369 // cache the maximum index
370 size_t numPoints = y.size();
371 if (isDensity)
372 numPoints = x.size() - 1;
373
374 // densities are calculated using Newton's method for numerical integration
375 // as backwards as it sounds, the outer loop should be the points rather
376 // than
377 // the moments
378 for (size_t j = 0; j < numPoints; ++j) {
379 // central x in histogram with a change of variables - and just change for
380 // density
381 const double xVal = .5 * static_cast<double>(x[j] + x[j + 1]) - mean; // change of variables
382
383 // this variable will be (x^n)*y
384 double temp;
385 if (isDensity) {
386 const auto xDelta = static_cast<double>(x[j + 1] - x[j]);
387 temp = xVal * .5 * static_cast<double>(y[j] + y[j + 1]) * xDelta;
388 } else {
389 temp = xVal * static_cast<double>(y[j]);
390 }
391
392 // accumulate the moment
393 result[1] += temp;
394 for (size_t i = 2; i < result.size(); ++i) {
395 temp *= xVal;
396 result[i] += temp;
397 }
398 }
399
400 return result;
401}
402
403// -------------------------- Macro to instantiation concrete types
404// --------------------------------
405#define INSTANTIATE(TYPE) \
406 template MANTID_KERNEL_DLL Statistics getStatistics<TYPE>(const vector<TYPE> &, const unsigned int); \
407 template MANTID_KERNEL_DLL std::vector<double> getZscore<TYPE>(const vector<TYPE> &); \
408 template MANTID_KERNEL_DLL std::vector<double> getWeightedZscore<TYPE>(const vector<TYPE> &, const vector<TYPE> &); \
409 template MANTID_KERNEL_DLL std::vector<double> getModifiedZscore<TYPE>(const vector<TYPE> &); \
410 template MANTID_KERNEL_DLL std::vector<double> getMomentsAboutOrigin<TYPE>( \
411 const std::vector<TYPE> &x, const std::vector<TYPE> &y, const int maxMoment); \
412 template MANTID_KERNEL_DLL std::vector<double> getMomentsAboutMean<TYPE>( \
413 const std::vector<TYPE> &x, const std::vector<TYPE> &y, const int maxMoment);
414
415// --------------------------- Concrete instantiations
416// ---------------------------------------------
421INSTANTIATE(long long)
422INSTANTIATE(unsigned int)
423INSTANTIATE(unsigned long)
424INSTANTIATE(unsigned long long)
425
426} // namespace Mantid::Kernel
gsl_vector * tmp
double value
The value of the point.
Definition FitMW.cpp:51
double sigma
Definition GetAllEi.cpp:156
#define INSTANTIATE(TYPE)
#define fabs(x)
Definition Matrix.cpp:22
#define DLLExport
Definitions of the DLLImport compiler directives for MSVC.
Definition System.h:33
#define UNUSED_ARG(x)
Function arguments are sometimes unused in certain implmentations but are required for documentation ...
Definition System.h:44
std::vector< double > getModifiedZscore(const std::vector< TYPE > &data)
Return the modified Z score values for a dataset.
DLLExport Statistics getStatistics< string >(const vector< string > &data, const unsigned int flags)
Getting statistics of a string array should just give a bunch of NaNs.
Statistics getStatistics(const std::vector< TYPE > &data, const unsigned int flags=StatOptions::AllStats)
Return a statistics object for the given data set.
std::vector< double > getZscore(const std::vector< TYPE > &data)
Return the Z score values for a dataset.
std::vector< double > getMomentsAboutMean(const std::vector< TYPE > &x, const std::vector< TYPE > &y, const int maxMoment=3)
Return the first n-moments of the supplied data.
std::vector< double > getWeightedZscore(const std::vector< TYPE > &data, const std::vector< TYPE > &weights)
There are enough special cases in determining the Z score where it useful to put it in a single funct...
double getMedian(const vector< TYPE > &data)
There are enough special cases in determining the median where it useful to put it in a single functi...
DLLExport Statistics getStatistics< bool >(const vector< bool > &data, const unsigned int flags)
Getting statistics of a boolean array should just give a bunch of NaNs.
std::vector< double > getMomentsAboutOrigin(const std::vector< TYPE > &x, const std::vector< TYPE > &y, const int maxMoment=3)
Return the first n-moments of the supplied data.
Rfactor MANTID_KERNEL_DLL getRFactor(const std::vector< double > &obsI, const std::vector< double > &calI, const std::vector< double > &obsE)
Return the R-factors (Rwp) of a diffraction pattern data.
R factor for powder data analysis.
Definition Statistics.h:65
Simple struct to store statistics.
Definition Statistics.h:35
double mean
Mean value.
Definition Statistics.h:41
double median
Median value.
Definition Statistics.h:43
double minimum
Minimum value.
Definition Statistics.h:37
double maximum
Maximum value.
Definition Statistics.h:39
double standard_deviation
standard_deviation of the values
Definition Statistics.h:45
Statistics()
Default value for everything is nan.