Mantid
Loading...
Searching...
No Matches
Statistics.cpp
Go to the documentation of this file.
1// Mantid Repository : https://github.com/mantidproject/mantid
2//
3// Copyright © 2018 ISIS Rutherford Appleton Laboratory UKRI,
4// NScD Oak Ridge National Laboratory, European Spallation Source,
5// Institut Laue - Langevin & CSNS, Institute of High Energy Physics, CAS
6// SPDX - License - Identifier: GPL - 3.0 +
7// Includes
10
11#include <boost/accumulators/accumulators.hpp>
12#include <boost/accumulators/statistics/max.hpp>
13#include <boost/accumulators/statistics/min.hpp>
14#include <boost/accumulators/statistics/stats.hpp>
15#include <boost/accumulators/statistics/variance.hpp>
16
17#include <algorithm>
18#include <cfloat>
19#include <cmath>
20#include <sstream>
21
22namespace Mantid::Kernel {
23namespace {
24Logger logger("Statistics");
25}
26
27using std::string;
28using std::vector;
29
35 double nan = std::numeric_limits<double>::quiet_NaN();
36
37 Statistics stats;
38 stats.minimum = nan;
39 stats.maximum = nan;
40 stats.mean = nan;
41 stats.median = nan;
42 stats.standard_deviation = nan;
43
44 return stats;
45}
46
51template <typename TYPE> double getMedian(const vector<TYPE> &data) {
52 const size_t size = data.size();
53 if (size == 1)
54 return static_cast<double>(data[0]);
55
56 const bool isSorted = std::is_sorted(data.cbegin(), data.cend());
57 std::vector<TYPE> tmpSortedData;
58 auto sortedDataRef = std::ref(data);
59 if (!isSorted) {
60 tmpSortedData = data;
61 std::sort(tmpSortedData.begin(), tmpSortedData.end());
62 sortedDataRef = std::ref(std::as_const(tmpSortedData));
63 }
64
65 const bool is_even = (size % 2 == 0);
66 double retVal = 0;
67 if (is_even) {
68 const auto left = static_cast<double>(sortedDataRef.get()[size / 2 - 1]);
69 const auto right = static_cast<double>(sortedDataRef.get()[size / 2]);
70 retVal = (left + right) / 2;
71 } else {
72 retVal = static_cast<double>(sortedDataRef.get()[size / 2]);
73 }
74 return retVal;
75}
76
81template <typename TYPE> std::vector<double> getZscore(const vector<TYPE> &data) {
82 std::vector<double> Zscore;
83 if (data.size() < 3) {
84 Zscore.resize(data.size(), 0.);
85 return Zscore;
86 }
87 Statistics stats = getStatistics(data);
88 if (stats.standard_deviation == 0.) {
89 Zscore.resize(data.size(), 0.);
90 return Zscore;
91 }
92 for (auto it = data.cbegin(); it != data.cend(); ++it) {
93 auto tmp = static_cast<double>(*it);
94 Zscore.emplace_back(fabs((stats.mean - tmp) / stats.standard_deviation));
95 }
96 return Zscore;
97}
102template <typename TYPE> std::vector<double> getWeightedZscore(const vector<TYPE> &data, const vector<TYPE> &weights) {
103 std::vector<double> Zscore;
104 if (data.size() < 3) {
105 Zscore.resize(data.size(), 0.);
106 return Zscore;
107 }
108 Statistics stats = getStatistics(data);
109 if (stats.standard_deviation == 0.) {
110 Zscore.resize(data.size(), 0.);
111 return Zscore;
112 }
113 double sumWeights = 0.0;
114 double sumWeightedData = 0.0;
115 double weightedVariance = 0.0;
116 for (size_t it = 0; it != data.size(); ++it) {
117 sumWeights += static_cast<double>(weights[it]);
118 sumWeightedData += static_cast<double>(weights[it] * data[it]);
119 }
120 double weightedMean = sumWeightedData / sumWeights;
121 for (size_t it = 0; it != data.size(); ++it) {
122 weightedVariance += std::pow(static_cast<double>(data[it]) - weightedMean, 2) *
123 std::pow(static_cast<double>(weights[it]) / sumWeights, 2);
124 }
125 for (auto it = data.cbegin(); it != data.cend(); ++it) {
126 Zscore.emplace_back(fabs((static_cast<double>(*it) - weightedMean) / std::sqrt(weightedVariance)));
127 }
128 return Zscore;
129}
135template <typename TYPE> std::vector<double> getModifiedZscore(const vector<TYPE> &data) {
136 if (data.size() < 3) {
137 std::vector<double> Zscore(data.size(), 0.);
138 return Zscore;
139 }
140 std::vector<double> MADvec;
141 double tmp;
142 double median = getMedian(data);
143 for (auto it = data.cbegin(); it != data.cend(); ++it) {
144 tmp = static_cast<double>(*it);
145 MADvec.emplace_back(fabs(tmp - median));
146 }
147 double MAD = getMedian(MADvec);
148 if (MAD == 0.) {
149 std::vector<double> Zscore(data.size(), 0.);
150 return Zscore;
151 }
152 MADvec.clear();
153 std::vector<double> Zscore;
154 for (auto it = data.begin(); it != data.end(); ++it) {
155 tmp = static_cast<double>(*it);
156 Zscore.emplace_back(0.6745 * fabs((tmp - median) / MAD));
157 }
158 return Zscore;
159}
160
167template <typename TYPE> Statistics getStatistics(const vector<TYPE> &data, const unsigned int flags) {
168 Statistics statistics = getNanStatistics();
169 if (data.empty()) { // don't do anything
170 return statistics;
171 }
172 // calculate the mean if this or the stddev is requested
173 const bool stddev = ((flags & StatOptions::UncorrectedStdDev) || (flags & StatOptions::CorrectedStdDev));
174 if (stddev) {
175 using namespace boost::accumulators;
176 accumulator_set<double, stats<tag::min, tag::max, tag::variance>> acc;
177 for (auto &value : data) {
178 acc(static_cast<double>(value));
179 }
180 statistics.minimum = min(acc);
181 statistics.maximum = max(acc);
182 statistics.mean = mean(acc);
183 double var = variance(acc);
184
185 if (flags & StatOptions::CorrectedStdDev) {
186 auto ndofs = static_cast<double>(data.size());
187 var *= ndofs / (ndofs - 1.0);
188 }
189 statistics.standard_deviation = std::sqrt(var);
190
191 } else if (flags & StatOptions::Mean) {
192 using namespace boost::accumulators;
193 accumulator_set<double, stats<tag::mean>> acc;
194 for (auto &value : data) {
195 acc(static_cast<double>(value));
196 }
197 statistics.mean = mean(acc);
198 }
199
200 // calculate the median if requested
201 if (flags & StatOptions::Median) {
202 statistics.median = getMedian(data);
203 }
204
205 return statistics;
206}
207
209template <> DLLExport Statistics getStatistics<string>(const vector<string> &data, const unsigned int flags) {
210 UNUSED_ARG(flags);
211 UNUSED_ARG(data);
212 return getNanStatistics();
213}
214
216template <> DLLExport Statistics getStatistics<bool>(const vector<bool> &data, const unsigned int flags) {
217 UNUSED_ARG(flags);
218 UNUSED_ARG(data);
219 return getNanStatistics();
220}
221
229Rfactor getRFactor(const std::vector<double> &obsI, const std::vector<double> &calI, const std::vector<double> &obsE) {
230 // 1. Check
231 if (obsI.size() != calI.size() || obsI.size() != obsE.size()) {
232 std::stringstream errss;
233 errss << "GetRFactor() Input Error! Observed Intensity (" << obsI.size() << "), Calculated Intensity ("
234 << calI.size() << ") and Observed Error (" << obsE.size() << ") have different number of elements.";
235 throw std::runtime_error(errss.str());
236 }
237 if (obsI.empty()) {
238 throw std::runtime_error("getRFactor(): the input arrays are empty.");
239 }
240
241 double sumnom = 0;
242 double sumdenom = 0;
243 double sumrpnom = 0;
244 double sumrpdenom = 0;
245
246 size_t numpts = obsI.size();
247 for (size_t i = 0; i < numpts; ++i) {
248 double cal_i = calI[i];
249 double obs_i = obsI[i];
250 double sigma = obsE[i];
251 double weight = 1.0 / (sigma * sigma);
252 double diff = obs_i - cal_i;
253
254 if (weight == weight && weight <= DBL_MAX) {
255 // If weight is not NaN.
256 sumrpnom += fabs(diff);
257 sumrpdenom += fabs(obs_i);
258
259 double tempnom = weight * diff * diff;
260 double tempden = weight * obs_i * obs_i;
261
262 sumnom += tempnom;
263 sumdenom += tempden;
264
265 if (tempnom != tempnom || tempden != tempden) {
266 logger.error() << "***** Error! ****** Data indexed " << i << " is NaN. "
267 << "i = " << i << ": cal = " << calI[i] << ", obs = " << obs_i << ", weight = " << weight
268 << ". \n";
269 }
270 }
271 }
272
273 Rfactor rfactor(0., 0.);
274 rfactor.Rp = (sumrpnom / sumrpdenom);
275 rfactor.Rwp = std::sqrt(sumnom / sumdenom);
276
277 if (rfactor.Rwp != rfactor.Rwp)
278 logger.debug() << "Rwp is NaN. Denominator = " << sumnom << "; Nominator = " << sumdenom << ". \n";
279
280 return rfactor;
281}
282
294template <typename TYPE>
295std::vector<double> getMomentsAboutOrigin(const std::vector<TYPE> &x, const std::vector<TYPE> &y, const int maxMoment) {
296 // densities have the same number of x and y
297 bool isDensity(x.size() == y.size());
298
299 // if it isn't a density then check for histogram
300 if ((!isDensity) && (x.size() != y.size() + 1)) {
301 std::stringstream msg;
302 msg << "length of x (" << x.size() << ") and y (" << y.size() << ")do not match";
303 throw std::out_of_range(msg.str());
304 }
305
306 // initialize a result vector with all zeros
307 std::vector<double> result(maxMoment + 1, 0.);
308
309 // cache the maximum index
310 size_t numPoints = y.size();
311 if (isDensity)
312 numPoints = x.size() - 1;
313
314 // densities are calculated using Newton's method for numerical integration
315 // as backwards as it sounds, the outer loop should be the points rather
316 // than
317 // the moments
318 for (size_t j = 0; j < numPoints; ++j) {
319 // reduce item lookup - and central x for histogram
320 const double xVal = .5 * static_cast<double>(x[j] + x[j + 1]);
321 // this variable will be (x^n)*y
322 auto temp = static_cast<double>(y[j]); // correct for histogram
323 if (isDensity) {
324 const auto xDelta = static_cast<double>(x[j + 1] - x[j]);
325 temp = .5 * (temp + static_cast<double>(y[j + 1])) * xDelta;
326 }
327
328 // accumulate the moments
329 result[0] += temp;
330 for (size_t i = 1; i < result.size(); ++i) {
331 temp *= xVal;
332 result[i] += temp;
333 }
334 }
335
336 return result;
337}
338
350template <typename TYPE>
351std::vector<double> getMomentsAboutMean(const std::vector<TYPE> &x, const std::vector<TYPE> &y, const int maxMoment) {
352 // get the zeroth (integrated value) and first moment (mean)
353 std::vector<double> momentsAboutOrigin = getMomentsAboutOrigin(x, y, 1);
354 const double mean = momentsAboutOrigin[1];
355
356 // initialize a result vector with all zeros
357 std::vector<double> result(maxMoment + 1, 0.);
358 result[0] = momentsAboutOrigin[0];
359
360 // escape early if we need to
361 if (maxMoment == 0)
362 return result;
363
364 // densities have the same number of x and y
365 bool isDensity(x.size() == y.size());
366
367 // cache the maximum index
368 size_t numPoints = y.size();
369 if (isDensity)
370 numPoints = x.size() - 1;
371
372 // densities are calculated using Newton's method for numerical integration
373 // as backwards as it sounds, the outer loop should be the points rather
374 // than
375 // the moments
376 for (size_t j = 0; j < numPoints; ++j) {
377 // central x in histogram with a change of variables - and just change for
378 // density
379 const double xVal = .5 * static_cast<double>(x[j] + x[j + 1]) - mean; // change of variables
380
381 // this variable will be (x^n)*y
382 double temp;
383 if (isDensity) {
384 const auto xDelta = static_cast<double>(x[j + 1] - x[j]);
385 temp = xVal * .5 * static_cast<double>(y[j] + y[j + 1]) * xDelta;
386 } else {
387 temp = xVal * static_cast<double>(y[j]);
388 }
389
390 // accumulate the moment
391 result[1] += temp;
392 for (size_t i = 2; i < result.size(); ++i) {
393 temp *= xVal;
394 result[i] += temp;
395 }
396 }
397
398 return result;
399}
400
401// -------------------------- Macro to instantiation concrete types
402// --------------------------------
403#define INSTANTIATE(TYPE) \
404 template MANTID_KERNEL_DLL Statistics getStatistics<TYPE>(const vector<TYPE> &, const unsigned int); \
405 template MANTID_KERNEL_DLL std::vector<double> getZscore<TYPE>(const vector<TYPE> &); \
406 template MANTID_KERNEL_DLL std::vector<double> getWeightedZscore<TYPE>(const vector<TYPE> &, const vector<TYPE> &); \
407 template MANTID_KERNEL_DLL std::vector<double> getModifiedZscore<TYPE>(const vector<TYPE> &); \
408 template MANTID_KERNEL_DLL std::vector<double> getMomentsAboutOrigin<TYPE>( \
409 const std::vector<TYPE> &x, const std::vector<TYPE> &y, const int maxMoment); \
410 template MANTID_KERNEL_DLL std::vector<double> getMomentsAboutMean<TYPE>( \
411 const std::vector<TYPE> &x, const std::vector<TYPE> &y, const int maxMoment);
412
413// --------------------------- Concrete instantiations
414// ---------------------------------------------
419INSTANTIATE(long long)
420INSTANTIATE(unsigned int)
421INSTANTIATE(unsigned long)
422INSTANTIATE(unsigned long long)
423
424} // namespace Mantid::Kernel
gsl_vector * tmp
double value
The value of the point.
Definition: FitMW.cpp:51
double sigma
Definition: GetAllEi.cpp:156
#define INSTANTIATE(TYPE)
Definition: Statistics.cpp:403
double left
Definition: LineProfile.cpp:80
double right
Definition: LineProfile.cpp:81
#define fabs(x)
Definition: Matrix.cpp:22
#define DLLExport
Definitions of the DLLImport compiler directives for MSVC.
Definition: System.h:53
#define UNUSED_ARG(x)
Function arguments are sometimes unused in certain implmentations but are required for documentation ...
Definition: System.h:64
std::vector< double > getModifiedZscore(const std::vector< TYPE > &data)
Return the modified Z score values for a dataset.
Definition: Statistics.cpp:135
DLLExport Statistics getStatistics< string >(const vector< string > &data, const unsigned int flags)
Getting statistics of a string array should just give a bunch of NaNs.
Definition: Statistics.cpp:209
Statistics getStatistics(const std::vector< TYPE > &data, const unsigned int flags=StatOptions::AllStats)
Return a statistics object for the given data set.
Definition: Statistics.cpp:167
std::vector< double > getZscore(const std::vector< TYPE > &data)
Return the Z score values for a dataset.
Definition: Statistics.cpp:81
std::vector< double > getMomentsAboutMean(const std::vector< TYPE > &x, const std::vector< TYPE > &y, const int maxMoment=3)
Return the first n-moments of the supplied data.
Definition: Statistics.cpp:351
std::vector< double > getWeightedZscore(const std::vector< TYPE > &data, const std::vector< TYPE > &weights)
There are enough special cases in determining the Z score where it useful to put it in a single funct...
Definition: Statistics.cpp:102
double getMedian(const vector< TYPE > &data)
There are enough special cases in determining the median where it useful to put it in a single functi...
Definition: Statistics.cpp:51
DLLExport Statistics getStatistics< bool >(const vector< bool > &data, const unsigned int flags)
Getting statistics of a boolean array should just give a bunch of NaNs.
Definition: Statistics.cpp:216
std::vector< double > getMomentsAboutOrigin(const std::vector< TYPE > &x, const std::vector< TYPE > &y, const int maxMoment=3)
Return the first n-moments of the supplied data.
Definition: Statistics.cpp:295
Statistics getNanStatistics()
Generate a Statistics object where all of the values are NaN.
Definition: Statistics.cpp:34
Rfactor MANTID_KERNEL_DLL getRFactor(const std::vector< double > &obsI, const std::vector< double > &calI, const std::vector< double > &obsE)
Return the R-factors (Rwp) of a diffraction pattern data.
Definition: Statistics.cpp:229
R factor for powder data analysis.
Definition: Statistics.h:52
Simple struct to store statistics.
Definition: Statistics.h:25
double mean
Mean value.
Definition: Statistics.h:31
double median
Median value.
Definition: Statistics.h:33
double minimum
Minimum value.
Definition: Statistics.h:27
double maximum
Maximum value.
Definition: Statistics.h:29
double standard_deviation
standard_deviation of the values
Definition: Statistics.h:35