CompressEventAccumulator.cpp
// Mantid Repository : https://github.com/mantidproject/mantid
//
// Copyright © 2024 ISIS Rutherford Appleton Laboratory UKRI,
//   NScD Oak Ridge National Laboratory, European Spallation Source,
//   Institut Laue - Langevin & CSNS, Institute of High Energy Physics, CAS
// SPDX - License - Identifier: GPL - 3.0 +
#include "MantidDataHandling/CompressEventAccumulator.h"

#include <algorithm>
#include <cmath>
#include <numeric>
#include <tbb/parallel_sort.h>

#include "MantidDataObjects/EventList.h"
namespace Mantid {
namespace DataHandling {
CompressEventAccumulator::CompressEventAccumulator(std::shared_ptr<std::vector<double>> histogram_bin_edges,
                                                   const double divisor, CompressBinningMode bin_mode)
    : m_histogram_edges(std::move(histogram_bin_edges)), m_initialized(false) {
  const auto tof_min = static_cast<double>(m_histogram_edges->front());

  // setup function pointer and parameters for finding bins
  // look in EventList for why
  if (bin_mode == CompressBinningMode::LINEAR) {
    m_findBin = DataObjects::EventList::findLinearBin;
    m_divisor = 1. / std::abs(divisor);
    m_offset = tof_min * m_divisor;
  } else if (bin_mode == CompressBinningMode::LOGARITHMIC) {
    m_findBin = DataObjects::EventList::findLogBin;
    m_divisor = 1. / std::log1p(std::abs(divisor)); // use this to do change of base
    m_offset = std::log(tof_min) * m_divisor;
  } else {
    throw std::runtime_error("Haven't implemented this compression binning strategy");
  }
}
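
// For orientation, the bin lookup these parameters feed reduces to a single
// multiply and floor (a sketch of the intended arithmetic, not the actual
// EventList implementation):
//
//   LINEAR:      bin ~ floor(tof * m_divisor - m_offset)      == floor((tof - tof_min) / |divisor|)
//   LOGARITHMIC: bin ~ floor(log(tof) * m_divisor - m_offset) == floor(log(tof / tof_min) / log1p(|divisor|))
//
// i.e. linear mode uses constant-width bins of |divisor| and log mode uses bins
// whose widths grow by a factor of (1 + |divisor|) from one bin to the next.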

std::size_t CompressEventAccumulator::numberHistBins() const { return m_histogram_edges->size(); }

template <typename INT_TYPE> double CompressEventAccumulator::getBinCenter(const INT_TYPE bin) const {
  // work with iterators to reduce access time
  const auto &binIter = m_histogram_edges->cbegin() + static_cast<std::vector<double>::difference_type>(bin);
  return 0.5 * ((*binIter) + *(std::next(binIter)));
}

std::optional<size_t> CompressEventAccumulator::findBin(const float tof) const {
  // the last parameter being false means don't find the exact bin for raw events coming out of the file
  return m_findBin(*m_histogram_edges, static_cast<double>(tof), m_divisor, m_offset, false);
}

// ------------------------------------------------------------------------
namespace { // anonymous

// minimum size of a vector to bother doing parallel_sort
// the grain size shouldn't get too small or the overhead of threading/sync overwhelms everything else
// TODO this can be tuned
constexpr size_t MIN_VEC_LENGTH_PARALLEL_SORT{5000};

// blindly assume 10 raw events are compressed to a single weighted event on average
constexpr size_t EXP_COMPRESS_RATIO{10};

/// Accumulator that stores each raw event's time-of-flight as a float, sorting
/// and histogramming the values only when the weighted events are requested.
class CompressSparseFloat : public CompressEventAccumulator {
public:
  // pass all arguments to the parent
  CompressSparseFloat(std::shared_ptr<std::vector<double>> histogram_bin_edges, const size_t numEvents,
                      const double divisor, CompressBinningMode bin_mode)
      : CompressEventAccumulator(histogram_bin_edges, divisor, bin_mode), m_is_sorted(false) {
    m_tof.reserve(numEvents);
    m_initialized = true;
  }

  double totalWeight() const override { return static_cast<double>(m_tof.size()); }

  /// Add the event to the end of the list; compression is deferred until createWeightedEvents
  void addEvent(const float tof) override {
    // add events to the end of the list
    m_tof.push_back(tof);
  }

private:
  // this is marked const because it is mostly used by createWeightedEvents and m_tof is mutable
  void sort() const {
    if (m_is_sorted)
      return;

    // empty events and singles are already sorted
    if (m_tof.size() < 2) {
      m_is_sorted = true;
      return;
    }

    // the inverse of the smallest bin width, used as the sort tolerance
    const auto &iter = m_histogram_edges->cbegin();
    const auto delta = 1. / static_cast<float>((*(iter + 1)) - (*iter));

    // parallel sort only if the vector is big enough to be worth the threading overhead
    if (m_tof.size() < MIN_VEC_LENGTH_PARALLEL_SORT)
      std::sort(m_tof.begin(), m_tof.end(), [delta](const auto &left, const auto &right) {
        return std::floor(left * delta) < std::floor(right * delta);
      });
    else
      tbb::parallel_sort(m_tof.begin(), m_tof.end(), [delta](const auto &left, const auto &right) {
        return std::floor(left * delta) < std::floor(right * delta);
      });
    m_is_sorted = true;
  }
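
  // A note on the comparator above: two time-of-flight values whose products
  // with delta floor to the same integer compare "equal", so the sort only
  // orders values at the resolution of the smallest histogram bin and their
  // relative order within such a stripe is unspecified. That is all that the
  // histogramming in createWeightedEvents below requires, since values that
  // share a bin are collapsed into a single weighted event anyway.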

public:
  void createWeightedEvents(std::vector<Mantid::DataObjects::WeightedEventNoTime> *raw_events) const override {
    raw_events->clear(); // clean out previous version

    if (m_tof.empty()) {
      m_is_sorted = true;
      return;
    } else if (m_tof.size() == 1) {
      // don't bother with temporary objects and such if there is only one event
      m_is_sorted = true;
      raw_events->emplace_back(m_tof.front(), 1., 1.);
      return;
    } else {
      // code below assumes the tofs are sorted
      this->sort();

      // blindly assume a fixed ratio of raw events compressed to a single weighted event on average
      raw_events->reserve(m_tof.size() / EXP_COMPRESS_RATIO);

      // this finds the bin for the current event, then moves the iterator until an event that is out of range is
      // encountered; at that point, a weighted event is added to the output and the iterators are advanced
      const auto optional_bin_first = this->findBin(m_tof.front());
      size_t lastBin = optional_bin_first.value();
      double nextTof = m_histogram_edges->at(lastBin + 1);
      uint32_t counts = 0;
      for (const auto &tof : m_tof) {
        if (lastBin + 1 >= m_histogram_edges->size()) {
          break; // this should never happen
        }

        if (tof < nextTof) {
          // accumulate information
          counts++;
        } else {
          // add weighted event
          const auto evtof = this->getBinCenter(lastBin);
          const auto counts_f = static_cast<float>(counts);
          raw_events->emplace_back(evtof, counts_f, counts_f);
          // reset accumulators to include this new value
          counts = 1;

          // advance the last bin value; guard the index so the peek at the next
          // edge cannot read past the end of the edges vector
          if (lastBin + 2 < m_histogram_edges->size() &&
              static_cast<double>(tof) < m_histogram_edges->operator[](lastBin + 2)) {
            // next bin contains the tof
            lastBin += 1;
          } else {
            // find the bin to use
            const auto optional_bin = this->findBin(tof);
            lastBin = optional_bin.value();
          }
          nextTof = m_histogram_edges->operator[](lastBin + 1);
        }
      }

      // add in the last one if it is there
      if (counts > 0) {
        const auto evtof = this->getBinCenter(lastBin);
        const auto counts_f = static_cast<float>(counts);
        raw_events->emplace_back(evtof, counts_f, counts_f);
      }
    }
  }

  DataObjects::EventSortType getSortType() const override {
    if (m_is_sorted)
      return DataObjects::TOF_SORT;
    else
      return DataObjects::UNSORTED;
  }

private:
  /// the time-of-flight of each raw event; mutable so it can be sorted lazily
  mutable std::vector<float> m_tof;
  /// keep track if m_tof is already sorted
  mutable bool m_is_sorted;
};
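
// A minimal usage sketch for the accumulator above (hypothetical values; the
// real caller is the event-loading code):
//
//   auto edges = std::make_shared<std::vector<double>>(std::vector<double>{0., 10., 20., 30.});
//   CompressSparseFloat acc(edges, /*numEvents=*/5, /*divisor=*/10., CompressBinningMode::LINEAR);
//   for (const float tof : {1.f, 2.f, 12.f, 25.f, 26.f})
//     acc.addEvent(tof);
//   std::vector<Mantid::DataObjects::WeightedEventNoTime> events;
//   acc.createWeightedEvents(&events);
//
// events now holds three weighted events at the bin centers 5, 15, and 25 with
// weights (and squared errors) 2, 1, and 2 respectively.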

/// Accumulator that converts each time-of-flight to a histogram bin number as
/// it arrives and stores only the bin numbers.
class CompressSparseInt : public CompressEventAccumulator {
public:
  // pass all arguments to the parent
  CompressSparseInt(std::shared_ptr<std::vector<double>> histogram_bin_edges, const size_t numEvents,
                    const double divisor, CompressBinningMode bin_mode)
      : CompressEventAccumulator(histogram_bin_edges, divisor, bin_mode), m_is_sorted(false) {
    m_tof_bin.reserve(numEvents); // TODO should be based on number of predicted events
    m_initialized = true;
  }

  double totalWeight() const override { return static_cast<double>(m_tof_bin.size()); }

  /// Convert the event to its bin number and store that; events without a valid bin are skipped
  void addEvent(const float tof) override {
    const auto &bin_optional = this->findBin(tof);
    if (bin_optional) {
      m_tof_bin.push_back(static_cast<uint32_t>(bin_optional.value()));
    }
  }

  // this is marked const because it is mostly used by createWeightedEvents and m_tof_bin is mutable
  void sort() const {
    if (m_is_sorted)
      return;
    if (m_tof_bin.size() < 2) {
      m_is_sorted = true;
      return;
    }

    // parallel sort only if the vector is big enough to be worth the threading overhead
    if (m_tof_bin.size() < MIN_VEC_LENGTH_PARALLEL_SORT)
      std::sort(m_tof_bin.begin(), m_tof_bin.end());
    else
      tbb::parallel_sort(m_tof_bin.begin(), m_tof_bin.end());
    m_is_sorted = true;
  }

  void createWeightedEvents(std::vector<Mantid::DataObjects::WeightedEventNoTime> *raw_events) const override {
    // clean out previous version
    raw_events->clear();

    if (m_tof_bin.empty()) {
      m_is_sorted = true; // nothing to do
    } else if (m_tof_bin.size() == 1) {
      m_is_sorted = true;
      // don't bother with temporary objects and such if there is only one event
      const auto tof = this->getBinCenter(m_tof_bin.front());
      raw_events->emplace_back(tof, 1., 1.);
    } else if (m_tof_bin.size() < (MAX_EVENTS / 10)) { // 10 was found to give good performance
      // this branch removes events as it accumulates them. It does not assume that the bin numbers are sorted
      // before starting, so the output will not be sorted either
      auto last_end = m_tof_bin.end();
      while (std::distance(m_tof_bin.begin(), last_end) > 0) {
        // start with the last value so there is less movement, since std::remove moves everything to the end of the
        // range
        const auto bin = *(last_end - 1); // one before the last end

        // move everything with this value to the end
        // vector::erase would actually remove the elements, but that takes more time
        auto new_end = std::remove(m_tof_bin.begin(), last_end, bin);

        // the number of values moved is the number of counts - which are stored as float
        const auto counts = static_cast<float>(std::distance(new_end, last_end));

        // move the end to the new location
        last_end = new_end;

        // add the event
        const auto tof = this->getBinCenter(bin);
        raw_events->emplace_back(tof, counts, counts);
      }
    } else {
      // blindly assume a fixed ratio of raw events compressed to a single weighted event on average
      raw_events->reserve(m_tof_bin.size() / EXP_COMPRESS_RATIO);

      // accumulation method is different for a "large" number of events
      this->sort(); // sort big lists

      // code below assumes the bin numbers are sorted
      auto iter = m_tof_bin.cbegin();
      const auto iter_end = m_tof_bin.cend();
      while (iter != iter_end) {
        auto range = std::equal_range(iter, iter_end, *iter);
        // the distance is the number of equal bin values - counts are stored as float
        const auto counts = static_cast<float>(std::distance(range.first, range.second));
        if (counts > 0.) {
          const auto tof = this->getBinCenter(*iter);
          raw_events->emplace_back(tof, counts, counts);
        }
        iter = range.second;
      }
    }
  }

  DataObjects::EventSortType getSortType() const override {
    if (m_is_sorted)
      return DataObjects::TOF_SORT;
    else
      return DataObjects::UNSORTED;
  }

  static constexpr size_t MAX_EVENTS{20000};

private:
  /// keep track if m_tof_bin is already sorted
  mutable bool m_is_sorted;

  /// time-of-flight bin this data would go into
  mutable std::vector<uint32_t> m_tof_bin;
};
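
// Worked example of the remove-from-the-back branch above (toy numbers, not
// from the source): given bin numbers {2, 0, 2, 1, 2}, the first pass picks the
// last value (2), std::remove compacts the kept values to the front leaving
// {0, 1, ?, ?, ?}, and the distance between the new and old ends (3) becomes
// the weight of the bin-2 event. The following passes emit weights 1 and 1 for
// bins 1 and 0. Each pass is O(n), so the loop is O(n * k) for k distinct bins,
// which beats sort-then-scan only while the list is small - hence the
// MAX_EVENTS / 10 gate.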

/// Accumulator that fills a fine histogram of counts; when there are many more
/// events than bin edges this fixed-size array of counts is cheaper than
/// storing the individual events.
class CompressDense : public CompressEventAccumulator {
public:
  // pass all arguments to the parent
  CompressDense(std::shared_ptr<std::vector<double>> histogram_bin_edges, const double divisor,
                CompressBinningMode bin_mode)
      : CompressEventAccumulator(histogram_bin_edges, divisor, bin_mode) {}

  double totalWeight() const override { return std::accumulate(m_count.cbegin(), m_count.cend(), 0.); }

  /// Increment the count of the bin this event falls in; the histogram is allocated lazily on the first event
  void addEvent(const float tof) override {
    if (!m_initialized) {
      this->allocateFineHistogram();
      m_initialized = true;
    }

    // add events
    const auto &bin_optional = this->findBin(tof);
    if (bin_optional) {
      m_count[bin_optional.value()]++;
    }
  }

  void createWeightedEvents(std::vector<Mantid::DataObjects::WeightedEventNoTime> *raw_events) const override {
    if (m_count.empty())
      return;

    const auto NUM_BINS = m_count.size();
    for (size_t i = 0; i < NUM_BINS; ++i) {
      // underlying weight is stored as float
      const auto counts = static_cast<float>(m_count[i]);
      if (counts > 0.) {
        const auto tof = this->getBinCenter(i);
        raw_events->emplace_back(tof, counts, counts);
      }
    }
  }

  // the output is generated in bin order, so it is always sorted by time-of-flight
  DataObjects::EventSortType getSortType() const override { return DataObjects::TOF_SORT; }

private:
  void allocateFineHistogram() {
    const auto NUM_BINS = static_cast<size_t>(m_histogram_edges->size() - 1);
    m_count.resize(NUM_BINS, 0);
  }

  /// the number of events seen in each individual bin
  std::vector<uint32_t> m_count;
};

} // namespace

// ------------------------------------------------------------------------

CompressEventAccumulatorFactory::CompressEventAccumulatorFactory(
    std::shared_ptr<std::vector<double>> histogram_bin_edges, const double divisor, CompressBinningMode bin_mode)
    : m_divisor(divisor), m_bin_mode(bin_mode), m_histogram_edges(std::move(histogram_bin_edges)) {}

std::unique_ptr<CompressEventAccumulator> CompressEventAccumulatorFactory::create(const std::size_t num_events) {
  const auto NUM_EDGES = m_histogram_edges->size();

  /*
   * The logic for which accumulator to use is a little selective, a little tuned, and a little arbitrary.
   *
   * When there are "lots" more events than bin edges, the dense case should use less memory. The threshold for this
   * could probably be lowered, but the current one is a sensible mark.
   *
   * When there aren't many events, the cost of converting each tof to a bin number is fairly cheap and the
   * algorithm for creating a histogram from this is fast.
   *
   * For the rest of the cases, converting each tof to an int, especially for log-compression, is relatively
   * expensive, so accumulate the tof as float, then sort and histogram the sorted list.
   *
   * The balance is in comparing convert-to-int + sort + histogram versus sort-float + convert-some-to-int +
   * histogram.
   */

  if (num_events > NUM_EDGES) {
    // this is a dense array
    return std::make_unique<CompressDense>(m_histogram_edges, m_divisor, m_bin_mode);
  } else if (num_events < CompressSparseInt::MAX_EVENTS) { // somewhat arbitrary value
    return std::make_unique<CompressSparseInt>(m_histogram_edges, num_events, m_divisor, m_bin_mode);
  } else {
    return std::make_unique<CompressSparseFloat>(m_histogram_edges, num_events, m_divisor, m_bin_mode);
  }
}
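
// A minimal sketch of how the factory is expected to be driven (a hypothetical
// caller; the real callers are the event-loading routines that know the event
// count per spectrum):
//
//   CompressEventAccumulatorFactory factory(edges, /*divisor=*/1., CompressBinningMode::LINEAR);
//   auto accumulator = factory.create(num_events);
//
// The concrete type is CompressDense, CompressSparseInt, or CompressSparseFloat
// depending on how num_events compares to the number of bin edges and to
// CompressSparseInt::MAX_EVENTS.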

} // namespace DataHandling
} // namespace Mantid