Mantid
Loading...
Searching...
No Matches
MergeMDFiles.cpp
Go to the documentation of this file.
1// Mantid Repository : https://github.com/mantidproject/mantid
2//
3// Copyright © 2018 ISIS Rutherford Appleton Laboratory UKRI,
4// NScD Oak Ridge National Laboratory, European Spallation Source,
5// Institut Laue - Langevin & CSNS, Institute of High Energy Physics, CAS
6// SPDX - License - Identifier: GPL - 3.0 +
15#include "MantidKernel/System.h"
17
18#include <Poco/File.h>
19#include <boost/scoped_ptr.hpp>
20
21using namespace Mantid::Kernel;
22using namespace Mantid::API;
23using namespace Mantid::DataObjects;
24
25namespace Mantid::MDAlgorithms {
26
27// Register the algorithm into the AlgorithmFactory
28DECLARE_ALGORITHM(MergeMDFiles)
29
30//----------------------------------------------------------------------------------------------
34 : m_nDims(0), m_MDEventType(), m_fileBasedTargetWS(false), m_Filenames(), m_EventLoader(), m_OutIWS(),
35 m_totalEvents(0), m_totalLoaded(0), m_fileMutex(), m_statsMutex() {}
36
37//----------------------------------------------------------------------------------------------
41
42//----------------------------------------------------------------------------------------------
43
44//----------------------------------------------------------------------------------------------
48 std::vector<std::string> exts(1, ".nxs");
49 declareProperty(std::make_unique<MultipleFileProperty>("Filenames", exts),
50 "Select several MDEventWorkspace NXS files to merge "
51 "together. Files must have common box structure.");
52
53 declareProperty(std::make_unique<FileProperty>("OutputFilename", "", FileProperty::OptionalSave, exts),
54 "Choose a file to which to save the output workspace. \n"
55 "Optional: if specified, the workspace created will be file-backed. \n"
56 "If not, it will be created in memory.");
57
58 declareProperty("Parallel", false,
59 "Run the loading tasks in parallel.\n"
60 "This can be faster but might use more memory.");
61
62 declareProperty(std::make_unique<WorkspaceProperty<IMDEventWorkspace>>("OutputWorkspace", "", Direction::Output),
63 "An output MDEventWorkspace.");
64}
65
66//----------------------------------------------------------------------------------------------
71 this->progress(0.05, "Loading File Info");
72 // Get plain box structure and box tree
73 std::vector<API::IMDNode *> &Boxes = m_BoxStruct.getBoxes();
74 std::vector<uint64_t> &targetEventIndexes = m_BoxStruct.getEventIndex();
75 // clear the averages for target event indexes;
76 targetEventIndexes.assign(targetEventIndexes.size(), 0);
77
78 // Total number of events in ALL files.
79 m_totalEvents = 0;
80
82 m_EventLoader.assign(m_Filenames.size(), nullptr);
83
84 try {
85 for (size_t i = 0; i < m_Filenames.size(); i++) {
86 // load box structure and the experimental info from each target
87 // workspace.
88 m_fileComponentsStructure[i].loadBoxStructure(m_Filenames[i], m_nDims, m_MDEventType, true, true);
89 // export just loaded experiment info to the target workspace
90 m_fileComponentsStructure[i].exportExperiment(m_OutIWS);
91
92 // Check for consistency
93 if (i > 0) {
94 if (m_fileComponentsStructure[i].getEventIndex().size() != targetEventIndexes.size())
95 throw std::runtime_error("Inconsistent number of boxes found in file " + m_Filenames[i] +
96 ". Cannot merge these files. Did you generate them all with "
97 "exactly the same box structure?");
98 }
99
100 // calculate total number of events per target cell, summed over all input files
101 size_t nBoxes = Boxes.size();
102 for (size_t j = 0; j < nBoxes; j++) {
103 size_t ID = Boxes[j]->getID();
104 targetEventIndexes[2 * ID + 1] += m_fileComponentsStructure[i].getEventIndex()[2 * ID + 1];
105 m_totalEvents += m_fileComponentsStructure[i].getEventIndex()[2 * ID + 1];
106 }
107
108 // Open the event data, track the total number of events
109 auto bc = std::shared_ptr<API::BoxController>(new API::BoxController(static_cast<size_t>(m_nDims)));
110 bc->fromXMLString(m_fileComponentsStructure[i].getBCXMLdescr());
111
112 m_EventLoader[i] = new BoxControllerNeXusIO(bc.get());
113 m_EventLoader[i]->setDataType(sizeof(coord_t), m_MDEventType);
114 m_EventLoader[i]->openFile(m_Filenames[i], "r");
115 }
116 } catch (...) {
117 // Close all open files in case of error
119 throw;
120 }
121
122 const std::vector<int> &boxType = m_BoxStruct.getBoxType();
123 // calculate event positions in the target file.
124 uint64_t eventsStart = 0;
125 for (auto mdBox : Boxes) {
126 mdBox->clear();
127 size_t ID = mdBox->getID();
128
129 // avoid grid boxes;
130 if (boxType[ID] == 2)
131 continue;
132
133 uint64_t nEvents = targetEventIndexes[2 * ID + 1];
134 targetEventIndexes[ID * 2] = eventsStart;
136 mdBox->setFileBacked(eventsStart, nEvents, false);
137
138 eventsStart += nEvents;
139 }
140
141 g_log.notice() << m_totalEvents << " events in " << m_Filenames.size() << " files.\n";
142}
143
151 TargetBox->clear();
152
153 uint64_t nBoxEvents(0);
154 std::vector<size_t> numFileEvents(m_EventLoader.size());
155
156 for (size_t iw = 0; iw < this->m_EventLoader.size(); iw++) {
157 size_t ID = TargetBox->getID();
158 numFileEvents[iw] = static_cast<size_t>(m_fileComponentsStructure[iw].getEventIndex()[2 * ID + 1]);
159 nBoxEvents += numFileEvents[iw];
160 }
161
162 // At this point memory required is known, so it is reserved all in one go
163 TargetBox->reserveMemoryForLoad(nBoxEvents);
164
165 for (size_t iw = 0; iw < this->m_EventLoader.size(); iw++) {
166 size_t ID = TargetBox->getID();
167 uint64_t fileLocation = m_fileComponentsStructure[iw].getEventIndex()[2 * ID + 0];
168 if (numFileEvents[iw] == 0)
169 continue;
170 TargetBox->loadAndAddFrom(m_EventLoader[iw], fileLocation, numFileEvents[iw]);
171 }
172
173 return nBoxEvents;
174}
175
176//----------------------------------------------------------------------------------------------
/**
 * Merge the input files into the given workspace, re-using its box structure.
 *
 * Stores `ws` as the output workspace, adjusts its box controller, creates a
 * NeXus box-controller IO object, builds the flat box structure and calls
 * loadBoxData(). It then walks every box: events are gathered through
 * loadEventsFromSubBoxes() and, when a disk buffer is in use, the box is saved
 * and its data dropped from memory. Ends by calling finalizeOutput().
 *
 * NOTE(review): this listing skips source lines 203, 236, 282 and 287, so the
 * openers of the closing braces at 208 and 238 are not visible here;
 * presumably they test m_fileBasedTargetWS -- confirm against the real file.
 *
 * @param ws :: workspace that becomes m_OutIWS and supplies the box controller
 * @param outputFile :: file name handed to setFileBacked(), initFlatStructure()
 *                      and finalizeOutput()
 */
185void MergeMDFiles::doExecByCloning(const Mantid::API::IMDEventWorkspace_sptr &ws, const std::string &outputFile) {
186 m_OutIWS = ws;
187 m_MDEventType = ws->getEventTypeName();
188
189 // Run the tasks in parallel? TODO: enable
190 // bool Parallel = this->getProperty("Parallel");
191
192 // Fix the box controller settings in the output workspace so that it splits
193 // normally
194 BoxController_sptr bc = ws->getBoxController();
195 // set up internal variables characterizing the workspace.
196 m_nDims = static_cast<int>(bc->getNDims());
197
198 // Fix the max depth to something bigger.
199 bc->setMaxDepth(20);
200 bc->setSplitThreshold(5000);
    // IO object for the box controller; it is given the coordinate size and the
    // event type name recorded above.
201 auto saver = std::shared_ptr<API::IBoxControllerIO>(new DataObjects::BoxControllerNeXusIO(bc.get()));
202 saver->setDataType(sizeof(coord_t), m_MDEventType);
    // NOTE(review): source line 203 is not shown in this listing; the brace
    // closed at 208 is presumably opened there -- confirm.
204 bc->setFileBacked(saver, outputFile);
205 // Complete the file-back-end creation.
206 g_log.notice() << "Setting cache to 400 MB write.\n";
    // Buffer size is expressed in events: 400000000 divided by the size of one event.
207 bc->getFileIO()->setWriteBufferSize(400000000 / m_OutIWS->sizeofEvent());
208 }
209
210 /* else
211 {
212 saver->openFile(outputFile,"w");
213 }*/
214 // Init box structure used for memory/file space calculations
215 m_BoxStruct.initFlatStructure(ws, outputFile);
216
217 // First, load all the box data and experiment info and calculate file
218 // positions of the target workspace
219 this->loadBoxData();
220
221 size_t numBoxes = m_BoxStruct.getNBoxes();
222 // Progress report with one step per box, covering the 0.1 - 0.9 range.
223 m_progress = std::make_unique<Progress>(this, 0.1, 0.9, size_t(numBoxes));
224 m_progress->setNotifyStep(0.1);
225
226 // For tracking progress
227 // uint64_t m_totalEventsInTasks = 0;
228
229 // Prepare thread pool
230 CPUTimer overallTime;
231
    // NOTE(review): nothing is pushed to this scheduler and joinAll() is commented
    // out below, so the pool appears unused here; ownership of `ts` is presumably
    // taken by the pool -- confirm.
232 auto ts = new ThreadSchedulerFIFO();
233 ThreadPool tp(ts);
234
    // DiskBuf stays null unless the (not shown) condition at source line 236 holds;
    // a non-null value enables the save/flush paths below.
235 Kernel::DiskBuffer *DiskBuf(nullptr);
237 DiskBuf = bc->getFileIO();
238 }
239
240 this->m_totalLoaded = 0;
241 const std::vector<API::IMDNode *> &boxes = m_BoxStruct.getBoxes();
242
243 for (size_t ib = 0; ib < numBoxes; ib++) {
244 auto box = boxes[ib];
    // Only nodes reporting isBox() receive events; all others are skipped.
245 if (!box->isBox())
246 continue;
247 // load all contributed events into current box;
248 this->loadEventsFromSubBoxes(box);
249
250 if (DiskBuf) {
251 if (box->getDataInMemorySize() > 0) { // data position has been already pre-calculated
    // Write the box out immediately and release its in-memory data.
252 box->getISaveable()->save();
253 box->clearDataFromMemory();
254 // Kernel::ISaveable *Saver = box->getISaveable();
255 // DiskBuf->toWrite(Saver);
256 }
257 }
258 // else
259 //{ size_t ID = box->getID();
260 // uint64_t filePosition = targetEventIndexes[2*ID];
261 // box->saveAt(saver.get(), filePosition);
262 //}
263 //
264 // if (!Parallel)
265 //{
266 // // Run the task serially only
267 // task->run();
268 // delete task;
269 //}
270 // else
271 //{
272 // // Enqueue to run in parallel (at the joinAll() call below).
273 // ts->push(task);
274 //}
275
    // NOTE(review): the loop index is passed as the increment; confirm this is
    // intended rather than an increment of one step per box.
276 m_progress->reportIncrement(ib, "Loading and merging box data");
277 }
278 if (DiskBuf) {
279 DiskBuf->flushCache();
280 bc->getFileIO()->flushData();
281 }
283 // tp.joinAll();
284 g_log.information() << overallTime << " to do all the adding.\n";
285
286 // Close any open file handle
    // NOTE(review): the statement that followed this comment (source line 287) is
    // not shown in this listing.
288
289 // Finish things up
290 this->finalizeOutput(outputFile);
291}
292
293//----------------------------------------------------------------------------------------------
/**
 * Refresh the output workspace cache and, if a file name was given, write the
 * workspace group information and the flat box structure to that file.
 *
 * Nothing is written when `outputFile` is empty; only refreshCache() runs.
 *
 * NOTE(review): this listing skips source lines 313 and 316; presumably they
 * hold the calls that save the generic workspace info and the experiment
 * infos announced by the neighbouring progress messages -- confirm.
 *
 * @param outputFile :: name of the file to update; an empty string skips the
 *                      file-writing part
 */
295void MergeMDFiles::finalizeOutput(const std::string &outputFile) {
296 CPUTimer overallTime;
297
298 this->progress(0.90, "Refreshing Cache");
299 m_OutIWS->refreshCache();
300
301 g_log.information() << overallTime << " to run refreshCache().\n";
302
303 if (!outputFile.empty()) {
304 g_log.notice() << "Starting SaveMD to update the file back-end.\n";
305 // create or open WS group and put there additional information about WS and
306 // its dimensions
    // Passed by reference to createOrOpenMDWSgroup(); it is not initialised here
    // and is not read afterwards in this function.
307 bool old_data_there;
308 // clang-format off
309 boost::scoped_ptr< ::NeXus::File> file(MDBoxFlatTree::createOrOpenMDWSgroup(
310 outputFile, m_nDims, m_MDEventType, false, old_data_there));
311 // clang-format on
312 this->progress(0.94, "Saving ws history and dimensions");
314 // Save each ExperimentInfo to a spot in the file
315 this->progress(0.98, "Saving experiment infos");
317
    // The file handle is closed before saveBoxStructure() is given the same file name.
318 file->closeGroup();
319 file->close();
320 // -------------- Save Box Structure -------------------------------------
321 // OK, we've filled these big arrays of data representing flat box
322 // structure. Save them.
    // NOTE(review): 0.91 is lower than the 0.94 and 0.98 reported just above --
    // confirm the intended ordering of the progress fractions.
323 progress(0.91, "Writing Box Data");
324 m_progress->resetNumSteps(8, 0.92, 1.00);
325
326 // Save box structure;
327 m_BoxStruct.saveBoxStructure(outputFile);
328
329 g_log.information() << overallTime << " to run SaveMD structure\n";
330 }
331}
332
333//----------------------------------------------------------------------------------------------
337 // clear disk buffer which can remain from previous runs
338 // the existence/ usage of the buffer indicates if the algorithm works with
339 // file based or memory based target workspaces;
340 // pDiskBuffer = NULL;
341 MultipleFileProperty *multiFileProp = dynamic_cast<MultipleFileProperty *>(getPointerToProperty("Filenames"));
342 if (!multiFileProp) {
343 throw std::logic_error("Filenames property must have MultipleFileProperty type.");
344 }
345 m_Filenames = VectorHelper::flattenVector(multiFileProp->operator()());
346 if (m_Filenames.empty())
347 throw std::invalid_argument("Must specify at least one filename.");
348 std::string firstFile = m_Filenames[0];
349
350 std::string outputFile = getProperty("OutputFilename");
351 m_fileBasedTargetWS = false;
352 if (!outputFile.empty()) {
353 m_fileBasedTargetWS = true;
354 if (Poco::File(outputFile).exists())
355 throw std::invalid_argument(" File " + outputFile +
356 " already exists. Can not use existing file "
357 "as the target to MergeMD files.\n" +
358 " Use it as one of source files if you want to add MD data to it");
359 }
360
361 // Start by loading the first file but just the box structure, no events, and
362 // not file-backed
363 // m_BoxStruct.loadBoxStructure(firstFile,
364 auto loader = createChildAlgorithm("LoadMD", 0.0, 0.05, false);
365 loader->setPropertyValue("Filename", firstFile);
366 loader->setProperty("MetadataOnly", false);
367 loader->setProperty("BoxStructureOnly", true);
368 loader->setProperty("FileBackEnd", false);
369 loader->executeAsChildAlg();
370 IMDWorkspace_sptr result = (loader->getProperty("OutputWorkspace"));
371
372 auto firstWS = std::dynamic_pointer_cast<API::IMDEventWorkspace>(result);
373 if (!firstWS)
374 throw std::runtime_error("Can not load MDEventWorkspace from initial file " + firstFile);
375
376 // do the job
377 this->doExecByCloning(firstWS, outputFile);
378
379 m_OutIWS->setFileNeedsUpdating(false);
380
381 setProperty("OutputWorkspace", m_OutIWS);
382}
385 for (auto &loader : m_EventLoader) {
386 delete loader;
387 loader = nullptr;
388 }
389}
390
391} // namespace Mantid::MDAlgorithms
#define DECLARE_ALGORITHM(classname)
Definition: Algorithm.h:576
static std::unique_ptr< QThreadPool > tp
void declareProperty(std::unique_ptr< Kernel::Property > p, const std::string &doc="") override
Add a property to the list of managed properties.
Definition: Algorithm.cpp:1913
Kernel::Property * getPointerToProperty(const std::string &name) const override
Get a property by name.
Definition: Algorithm.cpp:2033
TypedValue getProperty(const std::string &name) const override
Get the value of a property.
Definition: Algorithm.cpp:2076
virtual std::shared_ptr< Algorithm > createChildAlgorithm(const std::string &name, const double startProgress=-1., const double endProgress=-1., const bool enableLogging=true, const int &version=-1)
Create a Child Algorithm.
Definition: Algorithm.cpp:842
Kernel::Logger & g_log
Definition: Algorithm.h:451
void progress(double p, const std::string &msg="", double estimatedTime=0.0, int progressPrecision=0)
Sends ProgressNotification.
Definition: Algorithm.cpp:231
This class is used by MDBox and MDGridBox in order to intelligently determine optimal behavior.
Definition: BoxController.h:33
@ OptionalSave
to specify a file to write to but an empty string is
Definition: FileProperty.h:50
virtual void loadAndAddFrom(API::IBoxControllerIO *const, uint64_t, size_t, std::vector< coord_t > &)=0
Load the additional box data of specified size from the disk location provided using the class,...
virtual void clear()=0
Clear all contained data including precalculated averages.
virtual size_t getID() const =0
virtual void reserveMemoryForLoad(uint64_t)=0
A property to allow a user to specify multiple files to load.
A property class for workspaces.
The class responsible for saving events into nexus file using generic box controller interface Expect...
std::vector< API::IMDNode * > & getBoxes()
Definition: MDBoxFlatTree.h:38
void initFlatStructure(const API::IMDEventWorkspace_sptr &pws, const std::string &fileName)
convert MDWS box structure into flat structure used for saving/loading on hdd
static void saveWSGenericInfo(::NeXus::File *const file, const API::IMDWorkspace_const_sptr &ws)
Save workspace generic info like dimension structure, history, title dimensions etc.
std::vector< uint64_t > & getEventIndex()
Definition: MDBoxFlatTree.h:45
const std::vector< int > & getBoxType() const
Definition: MDBoxFlatTree.h:46
static void saveExperimentInfos(::NeXus::File *const file, const API::IMDEventWorkspace_const_sptr &ws)
Save each NEW ExperimentInfo to a spot in the file.
void saveBoxStructure(const std::string &fileName)
Save flat box structure into a file, defined by the file name.
::NeXus::File * createOrOpenMDWSgroup(const std::string &fileName, int &nDims, const std::string &WSEventType, bool readOnly, bool &alreadyExists)
The function to create a NeXus MD workspace group with specified events type and number of dimensions...
CPUTimer : Timer that uses the CPU time, rather than wall-clock time to measure execution time.
Definition: CPUTimer.h:24
Buffer objects that need to be written out to disk so as to optimize writing operations.
Definition: DiskBuffer.h:42
void flushCache()
Flush out all the data in the memory; and writes out everything in the to-write cache.
Definition: DiskBuffer.cpp:192
IPropertyManager * setProperty(const std::string &name, const T &value)
Templated method to set the value of a PropertyWithValue.
void notice(const std::string &msg)
Logs at notice level.
Definition: Logger.cpp:95
void information(const std::string &msg)
Logs at information level.
Definition: Logger.cpp:105
A Thread Pool implementation that keeps a certain number of threads running (normally,...
Definition: ThreadPool.h:36
A First-In-First-Out Thread Scheduler.
Algorithm to merge multiple MDEventWorkspaces from files that obey a common box format.
Definition: MergeMDFiles.h:26
std::string m_MDEventType
string describes type of the event, stored in the workspaces.
Definition: MergeMDFiles.h:72
void doExecByCloning(const Mantid::API::IMDEventWorkspace_sptr &ws, const std::string &outputFile)
Perform the merging, but clone the initial workspace and use the same splitting as its structure is e...
std::vector< std::string > m_Filenames
Files to load.
Definition: MergeMDFiles.h:77
std::vector< API::IBoxControllerIO * > m_EventLoader
Vector of file handles to each input file //TODO unique?
Definition: MergeMDFiles.h:80
uint64_t loadEventsFromSubBoxes(API::IMDNode *TargetBox)
Task that loads all of the events from the corresponding boxes of all files that are being merged into a pa...
void finalizeOutput(const std::string &outputFile)
Now re-save the MDEventWorkspace to update the file back end.
std::unique_ptr< Mantid::API::Progress > m_progress
Progress reporter.
Definition: MergeMDFiles.h:98
int m_nDims
number of workspace dimensions
Definition: MergeMDFiles.h:70
~MergeMDFiles() override
Destructor.
bool m_fileBasedTargetWS
if the workspace is indeed file-based
Definition: MergeMDFiles.h:75
DataObjects::MDBoxFlatTree m_BoxStruct
Definition: MergeMDFiles.h:60
std::vector< DataObjects::MDBoxFlatTree > m_fileComponentsStructure
Definition: MergeMDFiles.h:62
void exec() override
Run the algorithm.
Mantid::API::IMDEventWorkspace_sptr m_OutIWS
Output IMDEventWorkspace.
Definition: MergeMDFiles.h:83
void clearEventLoaders()
Set to true if the output is cloned of the first one.
void init() override
Initialise the properties.
void loadBoxData()
Loads all of the box data required (no events) for later use.
std::shared_ptr< IMDEventWorkspace > IMDEventWorkspace_sptr
Shared pointer to Mantid::API::IMDEventWorkspace.
std::shared_ptr< IMDWorkspace > IMDWorkspace_sptr
Shared pointer to the IMDWorkspace base class.
Definition: IMDWorkspace.h:146
std::shared_ptr< BoxController > BoxController_sptr
Shared ptr to BoxController.
bool exists(::NeXus::File &file, const std::string &name)
Based on the current group in the file, does the named sub-entry exist?
std::vector< T > flattenVector(const std::vector< std::vector< T > > &v)
A convenience function to "flatten" the given vector of vectors into a single vector.
Definition: VectorHelper.h:69
float coord_t
Typedef for the data type to use for coordinate axes in MD objects such as MDBox, MDEventWorkspace,...
Definition: MDTypes.h:27
@ Output
An output workspace.
Definition: Property.h:54