Mantid
Loading...
Searching...
No Matches
Statistics.cpp
Go to the documentation of this file.
1// Mantid Repository : https://github.com/mantidproject/mantid
2//
3// Copyright © 2018 ISIS Rutherford Appleton Laboratory UKRI,
4// NScD Oak Ridge National Laboratory, European Spallation Source,
5// Institut Laue - Langevin & CSNS, Institute of High Energy Physics, CAS
6// SPDX - License - Identifier: GPL - 3.0 +
7// Includes
10
11#include <boost/accumulators/accumulators.hpp>
12#include <boost/accumulators/statistics/max.hpp>
13#include <boost/accumulators/statistics/min.hpp>
14#include <boost/accumulators/statistics/stats.hpp>
15#include <boost/accumulators/statistics/variance.hpp>
16
17#include <algorithm>
18#include <cfloat>
19#include <cmath>
20#include <limits>
21#include <sstream>
22
23namespace Mantid::Kernel {
24namespace {
25Logger logger("Statistics");
26
27void assertMomentIsValid(const int maxMoment) {
28 if (maxMoment < 0) {
29 std::stringstream msg;
30 msg << "moment = " << maxMoment << " is statistically meaningless";
31 throw std::runtime_error(msg.str());
32 }
33}
34} // namespace
35
36using std::string;
37using std::vector;
38
40 constexpr double nan = std::numeric_limits<double>::quiet_NaN();
41 minimum = nan;
42 maximum = nan;
43 mean = nan;
44 median = nan;
46}
47
52template <typename TYPE> double getMedian(const vector<TYPE> &data) {
53 const size_t size = data.size();
54 if (size == 1)
55 return static_cast<double>(data[0]);
56
57 const bool isSorted = std::is_sorted(data.cbegin(), data.cend());
58 std::vector<TYPE> tmpSortedData;
59 auto sortedDataRef = std::ref(data);
60 if (!isSorted) {
61 tmpSortedData = data;
62 std::sort(tmpSortedData.begin(), tmpSortedData.end());
63 sortedDataRef = std::ref(std::as_const(tmpSortedData));
64 }
65
66 const bool is_even = (size % 2 == 0);
67 double retVal = 0;
68 if (is_even) {
69 const auto left = static_cast<double>(sortedDataRef.get()[size / 2 - 1]);
70 const auto right = static_cast<double>(sortedDataRef.get()[size / 2]);
71 retVal = (left + right) / 2;
72 } else {
73 retVal = static_cast<double>(sortedDataRef.get()[size / 2]);
74 }
75 return retVal;
76}
77
82template <typename TYPE> std::vector<double> getZscore(const vector<TYPE> &data) {
83 std::vector<double> Zscore;
84 if (data.size() < 3) {
85 Zscore.resize(data.size(), 0.);
86 return Zscore;
87 }
88 Statistics stats = getStatistics(data);
89 if (stats.standard_deviation == 0.) {
90 Zscore.resize(data.size(), 0.);
91 return Zscore;
92 }
93 for (auto it = data.cbegin(); it != data.cend(); ++it) {
94 auto tmp = static_cast<double>(*it);
95 // unclear why Zscore is non-negative, was first implemented in #5316
96 Zscore.emplace_back(fabs((stats.mean - tmp) / stats.standard_deviation));
97 }
98 return Zscore;
99}
104template <typename TYPE> std::vector<double> getWeightedZscore(const vector<TYPE> &data, const vector<TYPE> &weights) {
105 std::vector<double> Zscore;
106 if (data.size() < 3) {
107 Zscore.resize(data.size(), 0.);
108 return Zscore;
109 }
110 Statistics stats = getStatistics(data);
111 if (stats.standard_deviation == 0.) {
112 Zscore.resize(data.size(), 0.);
113 return Zscore;
114 }
115 double sumWeights = 0.0;
116 double sumWeightedData = 0.0;
117 double weightedVariance = 0.0;
118 for (size_t it = 0; it != data.size(); ++it) {
119 sumWeights += static_cast<double>(weights[it]);
120 sumWeightedData += static_cast<double>(weights[it] * data[it]);
121 }
122 double weightedMean = sumWeightedData / sumWeights;
123 for (size_t it = 0; it != data.size(); ++it) {
124 weightedVariance += std::pow(static_cast<double>(data[it]) - weightedMean, 2) *
125 std::pow(static_cast<double>(weights[it]) / sumWeights, 2);
126 }
127 for (auto it = data.cbegin(); it != data.cend(); ++it) {
128 Zscore.emplace_back(fabs((static_cast<double>(*it) - weightedMean) / std::sqrt(weightedVariance)));
129 }
130 return Zscore;
131}
137template <typename TYPE> std::vector<double> getModifiedZscore(const vector<TYPE> &data) {
138 if (data.size() < 3) {
139 std::vector<double> Zscore(data.size(), 0.);
140 return Zscore;
141 }
142 std::vector<double> MADvec;
143 double tmp;
144 double median = getMedian(data);
145 for (auto it = data.cbegin(); it != data.cend(); ++it) {
146 tmp = static_cast<double>(*it);
147 MADvec.emplace_back(fabs(tmp - median));
148 }
149 double MAD = getMedian(MADvec);
150 if (MAD == 0.) {
151 std::vector<double> Zscore(data.size(), 0.);
152 return Zscore;
153 }
154 MADvec.clear();
155 std::vector<double> Zscore;
156 for (auto it = data.begin(); it != data.end(); ++it) {
157 tmp = static_cast<double>(*it);
158 Zscore.emplace_back(0.6745 * fabs((tmp - median) / MAD));
159 }
160 return Zscore;
161}
162
169template <typename TYPE> Statistics getStatistics(const vector<TYPE> &data, const unsigned int flags) {
170 Statistics statistics;
171 if (data.empty()) { // don't do anything
172 return statistics;
173 }
174 // calculate the mean if this or the stddev is requested
175 const bool stddev = ((flags & StatOptions::UncorrectedStdDev) || (flags & StatOptions::CorrectedStdDev));
176 if (stddev) {
177 using namespace boost::accumulators;
178 accumulator_set<double, stats<tag::min, tag::max, tag::variance>> acc;
179 for (auto &value : data) {
180 acc(static_cast<double>(value));
181 }
182 statistics.minimum = min(acc);
183 statistics.maximum = max(acc);
184 statistics.mean = mean(acc);
185 double var = variance(acc);
186
187 if (flags & StatOptions::CorrectedStdDev) {
188 auto ndofs = static_cast<double>(data.size());
189 var *= ndofs / (ndofs - 1.0);
190 }
191 statistics.standard_deviation = std::sqrt(var);
192
193 } else if (flags & StatOptions::Mean) {
194 using namespace boost::accumulators;
195 accumulator_set<double, stats<tag::mean>> acc;
196 for (auto &value : data) {
197 acc(static_cast<double>(value));
198 }
199 statistics.mean = mean(acc);
200 }
201
202 // calculate the median if requested
203 if (flags & StatOptions::Median) {
204 statistics.median = getMedian(data);
205 }
206
207 return statistics;
208}
209
211template <> DLLExport Statistics getStatistics<string>(const vector<string> &data, const unsigned int flags) {
212 UNUSED_ARG(flags);
213 UNUSED_ARG(data);
214 return Statistics(); // default is all nan
215}
216
218template <> DLLExport Statistics getStatistics<bool>(const vector<bool> &data, const unsigned int flags) {
219 UNUSED_ARG(flags);
220 UNUSED_ARG(data);
221 return Statistics(); // default is all nan
222}
223
231Rfactor getRFactor(const std::vector<double> &obsI, const std::vector<double> &calI, const std::vector<double> &obsE) {
232 // 1. Check
233 if (obsI.size() != calI.size() || obsI.size() != obsE.size()) {
234 std::stringstream errss;
235 errss << "GetRFactor() Input Error! Observed Intensity (" << obsI.size() << "), Calculated Intensity ("
236 << calI.size() << ") and Observed Error (" << obsE.size() << ") have different number of elements.";
237 throw std::runtime_error(errss.str());
238 }
239 if (obsI.empty()) {
240 throw std::runtime_error("getRFactor(): the input arrays are empty.");
241 }
242
243 double sumnom = 0;
244 double sumdenom = 0;
245 double sumrpnom = 0;
246 double sumrpdenom = 0;
247
248 size_t numpts = obsI.size();
249 for (size_t i = 0; i < numpts; ++i) {
250 double cal_i = calI[i];
251 double obs_i = obsI[i];
252 double sigma = obsE[i];
253 double weight = 1.0 / (sigma * sigma);
254 double diff = obs_i - cal_i;
255
256 if (weight == weight && weight <= DBL_MAX) {
257 // If weight is not NaN.
258 sumrpnom += fabs(diff);
259 sumrpdenom += fabs(obs_i);
260
261 double tempnom = weight * diff * diff;
262 double tempden = weight * obs_i * obs_i;
263
264 sumnom += tempnom;
265 sumdenom += tempden;
266
267 if (tempnom != tempnom || tempden != tempden) {
268 logger.error() << "***** Error! ****** Data indexed " << i << " is NaN. "
269 << "i = " << i << ": cal = " << calI[i] << ", obs = " << obs_i << ", weight = " << weight
270 << ". \n";
271 }
272 }
273 }
274
275 Rfactor rfactor(0., 0.);
276 rfactor.Rp = (sumrpnom / sumrpdenom);
277 rfactor.Rwp = std::sqrt(sumnom / sumdenom);
278
279 if (rfactor.Rwp != rfactor.Rwp)
280 logger.debug() << "Rwp is NaN. Denominator = " << sumnom << "; Nominator = " << sumdenom << ". \n";
281
282 return rfactor;
283}
284
296template <typename TYPE>
297std::vector<double> getMomentsAboutOrigin(const std::vector<TYPE> &x, const std::vector<TYPE> &y, const int maxMoment) {
298 assertMomentIsValid(maxMoment);
299
300 // densities have the same number of x and y
301 bool isDensity(x.size() == y.size());
302
303 // if it isn't a density then check for histogram
304 if ((!isDensity) && (x.size() != y.size() + 1)) {
305 std::stringstream msg;
306 msg << "length of x (" << x.size() << ") and y (" << y.size() << ")do not match";
307 throw std::out_of_range(msg.str());
308 }
309
310 // initialize a result vector with all zeros
311 std::vector<double> result(std::size_t(maxMoment + 1), 0.);
312
313 // cache the maximum index
314 size_t numPoints = y.size();
315 if (isDensity)
316 numPoints = x.size() - 1;
317
318 // densities are calculated using Newton's method for numerical integration
319 // as backwards as it sounds, the outer loop should be the points rather
320 // than
321 // the moments
322 for (size_t j = 0; j < numPoints; ++j) {
323 // reduce item lookup - and central x for histogram
324 const double xVal = .5 * static_cast<double>(x[j] + x[j + 1]);
325 // this variable will be (x^n)*y
326 auto temp = static_cast<double>(y[j]); // correct for histogram
327 if (isDensity) {
328 const auto xDelta = static_cast<double>(x[j + 1] - x[j]);
329 temp = .5 * (temp + static_cast<double>(y[j + 1])) * xDelta;
330 }
331
332 // accumulate the moments
333 result[0] += temp;
334 for (size_t i = 1; i < result.size(); ++i) {
335 temp *= xVal;
336 result[i] += temp;
337 }
338 }
339
340 return result;
341}
342
354template <typename TYPE>
355std::vector<double> getMomentsAboutMean(const std::vector<TYPE> &x, const std::vector<TYPE> &y, const int maxMoment) {
356 assertMomentIsValid(maxMoment);
357
358 // get the zeroth (integrated value) and first moment (mean)
359 std::vector<double> momentsAboutOrigin = getMomentsAboutOrigin(x, y, 1);
360 const double mean = momentsAboutOrigin[1];
361
362 // initialize a result vector with all zeros
363 std::vector<double> result(std::size_t(maxMoment + 1), 0.);
364 result[0] = momentsAboutOrigin[0];
365
366 // escape early if we need to
367 if (maxMoment == 0)
368 return result;
369
370 // densities have the same number of x and y
371 bool isDensity(x.size() == y.size());
372
373 // cache the maximum index
374 size_t numPoints = y.size();
375 if (isDensity)
376 numPoints = x.size() - 1;
377
378 // densities are calculated using Newton's method for numerical integration
379 // as backwards as it sounds, the outer loop should be the points rather
380 // than
381 // the moments
382 for (size_t j = 0; j < numPoints; ++j) {
383 // central x in histogram with a change of variables - and just change for
384 // density
385 const double xVal = .5 * static_cast<double>(x[j] + x[j + 1]) - mean; // change of variables
386
387 // this variable will be (x^n)*y
388 double temp;
389 if (isDensity) {
390 const auto xDelta = static_cast<double>(x[j + 1] - x[j]);
391 temp = xVal * .5 * static_cast<double>(y[j] + y[j + 1]) * xDelta;
392 } else {
393 temp = xVal * static_cast<double>(y[j]);
394 }
395
396 // accumulate the moment
397 result[1] += temp;
398 for (size_t i = 2; i < result.size(); ++i) {
399 temp *= xVal;
400 result[i] += temp;
401 }
402 }
403
404 return result;
405}
406
407// -------------------------- Macro to instantiation concrete types
408// --------------------------------
409#define INSTANTIATE(TYPE) \
410 template MANTID_KERNEL_DLL Statistics getStatistics<TYPE>(const vector<TYPE> &, const unsigned int); \
411 template MANTID_KERNEL_DLL std::vector<double> getZscore<TYPE>(const vector<TYPE> &); \
412 template MANTID_KERNEL_DLL std::vector<double> getWeightedZscore<TYPE>(const vector<TYPE> &, const vector<TYPE> &); \
413 template MANTID_KERNEL_DLL std::vector<double> getModifiedZscore<TYPE>(const vector<TYPE> &); \
414 template MANTID_KERNEL_DLL std::vector<double> getMomentsAboutOrigin<TYPE>( \
415 const std::vector<TYPE> &x, const std::vector<TYPE> &y, const int maxMoment); \
416 template MANTID_KERNEL_DLL std::vector<double> getMomentsAboutMean<TYPE>( \
417 const std::vector<TYPE> &x, const std::vector<TYPE> &y, const int maxMoment);
418
419// --------------------------- Concrete instantiations
420// ---------------------------------------------
425INSTANTIATE(long long)
426INSTANTIATE(unsigned int)
427INSTANTIATE(unsigned long)
428INSTANTIATE(unsigned long long)
429
430} // namespace Mantid::Kernel
gsl_vector * tmp
double value
The value of the point.
Definition FitMW.cpp:51
double sigma
Definition GetAllEi.cpp:156
#define INSTANTIATE(TYPE)
double left
double right
#define fabs(x)
Definition Matrix.cpp:22
#define DLLExport
Definitions of the DLLImport compiler directives for MSVC.
Definition System.h:37
#define UNUSED_ARG(x)
Function arguments are sometimes unused in certain implmentations but are required for documentation ...
Definition System.h:48
std::vector< double > getModifiedZscore(const std::vector< TYPE > &data)
Return the modified Z score values for a dataset.
DLLExport Statistics getStatistics< string >(const vector< string > &data, const unsigned int flags)
Getting statistics of a string array should just give a bunch of NaNs.
Statistics getStatistics(const std::vector< TYPE > &data, const unsigned int flags=StatOptions::AllStats)
Return a statistics object for the given data set.
std::vector< double > getZscore(const std::vector< TYPE > &data)
Return the Z score values for a dataset.
std::vector< double > getMomentsAboutMean(const std::vector< TYPE > &x, const std::vector< TYPE > &y, const int maxMoment=3)
Return the first n-moments of the supplied data.
std::vector< double > getWeightedZscore(const std::vector< TYPE > &data, const std::vector< TYPE > &weights)
There are enough special cases in determining the Z score where it useful to put it in a single funct...
double getMedian(const vector< TYPE > &data)
There are enough special cases in determining the median where it useful to put it in a single functi...
DLLExport Statistics getStatistics< bool >(const vector< bool > &data, const unsigned int flags)
Getting statistics of a boolean array should just give a bunch of NaNs.
std::vector< double > getMomentsAboutOrigin(const std::vector< TYPE > &x, const std::vector< TYPE > &y, const int maxMoment=3)
Return the first n-moments of the supplied data.
Rfactor MANTID_KERNEL_DLL getRFactor(const std::vector< double > &obsI, const std::vector< double > &calI, const std::vector< double > &obsE)
Return the R-factors (Rwp) of a diffraction pattern data.
R factor for powder data analysis.
Definition Statistics.h:65
Simple struct to store statistics.
Definition Statistics.h:35
double mean
Mean value.
Definition Statistics.h:41
double median
Median value.
Definition Statistics.h:43
double minimum
Minimum value.
Definition Statistics.h:37
double maximum
Maximum value.
Definition Statistics.h:39
double standard_deviation
standard_deviation of the values
Definition Statistics.h:45
Statistics()
Default value for everything is nan.