2
* APTClasses.h - Generic APT components code
3
* Copyright (C) 2013 D Haley
5
* This program is free software: you can redistribute it and/or modify
6
* it under the terms of the GNU General Public License as published by
7
* the Free Software Foundation, either version 3 of the License, or
8
* (at your option) any later version.
10
* This program is distributed in the hope that it will be useful,
11
* but WITHOUT ANY WARRANTY; without even the implied warranty of
12
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
* GNU General Public License for more details.
15
* You should have received a copy of the GNU General Public License
16
* along with this program. If not, see <http://www.gnu.org/licenses/>.
19
#include "APTClasses.h"
20
#include "../../common/stringFuncs.h"
22
#include "../../common/translation.h"
31
const char *POS_ERR_STRINGS[] = { "",
32
NTRANS("Memory allocation failure on POS load"),
33
NTRANS("Error opening pos file"),
34
NTRANS("Pos file empty"),
35
NTRANS("Pos file size appears to have non-integer number of entries"),
36
NTRANS("Error reading from pos file (after open)"),
37
NTRANS("Error - Found NaN in pos file"),
38
NTRANS("Pos load aborted by interrupt.")
46
TEXT_ERR_READ_CONTENTS,
50
TEXT_ERR_ENUM_END //not an error, just end of enum
53
const char *ION_TEXT_ERR_STRINGS[] = { "",
54
NTRANS("Error opening file"),
55
NTRANS("No numerical data found"),
56
NTRANS("Error re-opening file, after first scan"),
57
NTRANS("Unable to read file contents after open"),
58
NTRANS("Error interpreting field in file"),
59
NTRANS("Incorrect number of fields in file"),
60
NTRANS("Unable to allocate memory to store data"),
62
//!Create an pos file from a vector of IonHits
63
unsigned int IonVectorToPos(const vector<IonHit> &ionVec, const string &filename)
65
std::ofstream CFile(filename.c_str(),std::ios::binary);
71
for (unsigned int ui=0; ui<ionVec.size(); ui++)
73
ionVec[ui].makePosData(floatBuffer);
74
CFile.write((char *)floatBuffer,4*sizeof(float));
81
void appendPos(const vector<IonHit> &points, const char *name)
83
std::ofstream posFile(name,std::ios::binary|std::ios::app);
87
for(unsigned int ui=0; ui< points.size(); ui++)
89
points[ui].makePosData(data);
90
posFile.write((char *)data, 4*sizeof(float));
94
void getPointsFromIons(const vector<IonHit> &ions, vector<Point3D> &p)
96
p.resize(ions.size());
97
#pragma omp parallel for
98
for(size_t ui=0;ui<ions.size();ui++)
99
p[ui] = ions[ui].getPosRef();
102
unsigned int LimitLoadPosFile(unsigned int inputnumcols, unsigned int outputnumcols, unsigned int index[], vector<IonHit> &posIons,const char *posFile, size_t limitCount,
103
unsigned int &progress, bool (*callback)(bool),bool strongSampling)
106
//Function is only defined for 4 columns here.
107
ASSERT(outputnumcols == 4);
108
//buffersize must be a power of two and at least outputnumcols*sizeof(float)
109
const unsigned int NUMROWS=1;
110
const unsigned int BUFFERSIZE=inputnumcols * sizeof(float) * NUMROWS;
111
const unsigned int BUFFERSIZE2=outputnumcols * sizeof(float) * NUMROWS;
112
char *buffer=new char[BUFFERSIZE];
116
return POS_ALLOC_FAIL;
118
char *buffer2=new char[BUFFERSIZE2];
122
return POS_ALLOC_FAIL;
126
std::ifstream CFile(posFile,std::ios::binary);
132
return POS_OPEN_FAIL;
135
CFile.seekg(0,std::ios::end);
136
size_t fileSize=CFile.tellg();
142
return POS_EMPTY_FAIL;
145
CFile.seekg(0,std::ios::beg);
147
//calculate the number of points stored in the POS file
150
size_t maxCols = inputnumcols * sizeof(float);
153
if(fileSize % maxCols)
157
return POS_SIZE_MODULUS_ERR;
160
maxIons =fileSize/maxCols;
161
limitCount=std::min(limitCount,maxIons);
163
//If we are going to load the whole file, don't use a sampling method to do it.
164
if(limitCount == maxIons)
170
//Try opening it using the normal functions
171
return GenericLoadFloatFile(inputnumcols, outputnumcols, index, posIons,posFile,progress, callback);
174
//Use a sampling method to load the pos file
175
std::vector<size_t> ionsToLoad;
178
posIons.resize(limitCount);
183
randomDigitSelection(ionsToLoad,maxIons,rng,
184
limitCount,dummy,callback,strongSampling);
186
catch(std::bad_alloc)
190
return POS_ALLOC_FAIL;
195
GreaterWithCallback<size_t> g(callback,PROGRESS_REDUCE);
196
std::sort(ionsToLoad.begin(),ionsToLoad.end(),g);
198
unsigned int curProg = PROGRESS_REDUCE;
200
//TODO: probably not very nice to the disk drive. would be better to
201
//scan ahead for contiguous data regions, and load that where possible.
202
//Or switch between different algorithms based upon ionsToLoad.size()/
203
std::ios::pos_type nextIonPos;
204
for(size_t ui=0;ui<ionsToLoad.size(); ui++)
206
nextIonPos = ionsToLoad[ui]*maxCols;
208
if(CFile.tellg() !=nextIonPos )
209
CFile.seekg(nextIonPos);
211
CFile.read(buffer,BUFFERSIZE);
213
for (size_t i = 0; i < outputnumcols; i++) // iterate through floats
214
memcpy(&(buffer2[i * sizeof(float)]), &(buffer[index[i] * sizeof(float)]), sizeof(float));
217
return POS_READ_FAIL;
218
posIons[ui].setHit((float*)buffer2);
219
//Data bytes stored in pos files are big
220
//endian. flip as required
221
#ifdef __LITTLE_ENDIAN__
222
posIons[ui].switchEndian();
225
if(posIons[ui].hasNaN())
229
return POS_NAN_LOAD_ERROR;
236
progress= (unsigned int)((float)(CFile.tellg())/((float)fileSize)*100.0f);
237
curProg=PROGRESS_REDUCE;
238
if(!(*callback)(false))
242
return POS_ABORT_FAIL;
254
unsigned int GenericLoadFloatFile(unsigned int inputnumcols, unsigned int outputnumcols,
255
unsigned int index[], vector<IonHit> &posIons,const char *posFile,
256
unsigned int &progress, bool (*callback)(bool))
258
ASSERT(outputnumcols==4); //Due to ionHit.setHit
259
//buffersize must be a power of two and at least sizeof(float)*outputnumCols
260
const unsigned int NUMROWS=512;
261
const unsigned int BUFFERSIZE=inputnumcols * sizeof(float) * NUMROWS;
262
const unsigned int BUFFERSIZE2=outputnumcols * sizeof(float) * NUMROWS;
264
char *buffer=new char[BUFFERSIZE];
267
return POS_ALLOC_FAIL;
269
char *buffer2=new char[BUFFERSIZE2];
273
return POS_ALLOC_FAIL;
276
std::ifstream CFile(posFile,std::ios::binary);
282
return POS_OPEN_FAIL;
285
CFile.seekg(0,std::ios::end);
286
size_t fileSize=CFile.tellg();
292
return POS_EMPTY_FAIL;
295
CFile.seekg(0,std::ios::beg);
297
//calculate the number of points stored in the POS file
299
typedef struct IONHIT
306
size_t curBufferSize=BUFFERSIZE;
307
size_t curBufferSize2=BUFFERSIZE2;
309
if(fileSize % (inputnumcols * sizeof(float)))
313
return POS_SIZE_MODULUS_ERR;
318
posIons.resize(fileSize/(inputnumcols*sizeof(float)));
320
catch(std::bad_alloc)
324
return POS_ALLOC_FAIL;
328
while(fileSize < curBufferSize) {
329
curBufferSize = curBufferSize >> 1;
330
curBufferSize2 = curBufferSize2 >> 1;
333
//Technically this is dependent upon the buffer size.
334
unsigned int curProg = 10000;
336
int maxCols = inputnumcols * sizeof(float);
337
int maxPosCols = outputnumcols * sizeof(float);
340
//Taking curBufferSize chunks at a time, read the input file
341
while((size_t)CFile.tellg() <= fileSize-curBufferSize)
343
CFile.read(buffer,curBufferSize);
348
return POS_READ_FAIL;
351
for (unsigned int j = 0; j < NUMROWS; j++) // iterate through rows
353
for (unsigned int i = 0; i < outputnumcols; i++) // iterate through floats
355
memcpy(&(buffer2[j * maxPosCols + i * sizeof(float)]),
356
&(buffer[j * maxCols + index[i] * sizeof(float)]), sizeof(float));
361
for(ui=0; ui<curBufferSize2; ui+=(sizeof(IONHIT)))
363
hit.setHit((float*)(buffer2+ui));
364
//Data bytes stored in pos files are big
365
//endian. flip as required
366
#ifdef __LITTLE_ENDIAN__
374
return POS_NAN_LOAD_ERROR;
384
progress= (unsigned int)((float)(CFile.tellg())/((float)fileSize)*100.0f);
385
curProg=PROGRESS_REDUCE;
386
if(!(*callback)(false))
391
return POS_ABORT_FAIL;
398
curBufferSize = curBufferSize >> 1 ;
399
curBufferSize2 = curBufferSize2 >> 1 ;
400
}while(curBufferSize2 >= sizeof(IONHIT));
402
ASSERT((unsigned int)CFile.tellg() == fileSize);
411
unsigned int limitLoadTextFile(unsigned int numColsTotal, unsigned int selectedCols[],
412
vector<IonHit> &posIons,const char *textFile, const char *delim, const size_t limitCount,
413
unsigned int &progress, bool (*callback)(bool),bool strongRandom)
416
ASSERT(numColsTotal);
419
vector<size_t> newLinePositions;
420
std::vector<std::string> subStrs;
422
//Do a brute force scan through the dataset
423
//to locate newlines.
425
const int BUFFER_SIZE=16384; //This is totally a guess. I don't know what is best.
428
//sort the selected columns into increasing order
429
vector<int> sortedCols;
430
for(unsigned int ui=0;ui<numColsTotal;ui++)
431
sortedCols.push_back(selectedCols[ui]);
433
std::sort(sortedCols.begin(),sortedCols.end());
434
//the last entry in the sorted list tells us how many
435
// entries we need in each line at a minimum
436
unsigned int maxCol=sortedCols[sortedCols.size()-1];
438
ifstream CFile(textFile,std::ios::binary);
441
return TEXT_ERR_OPEN;
444
//seek to the end of the file
445
//to get the filesize
446
size_t maxPos,curPos;
447
CFile.seekg(0,std::ios::end);
448
maxPos=CFile.tellg();
452
CFile.open(textFile);
457
//Scan through file for end of header.
458
//we define this as the split value being able to generate
459
//1) Enough data to make interpretable columns
460
//2) Enough columns that can be interpreted.
461
while(!CFile.eof() && curPos < maxPos)
464
curPos = CFile.tellg();
468
return TEXT_ERR_READ_CONTENTS;
470
splitStrsRef(s.c_str(),delim,subStrs);
471
stripZeroEntries(subStrs);
473
//Not enough entries in this line
474
//to be interpretable
475
if(subStrs.size() <maxCol)
480
for(unsigned int ui=0;ui<numColsTotal; ui++)
482
if(selectedCols[ui] >=subStrs.size())
495
for(unsigned int ui=0; ui<numColsTotal; ui++)
498
if(stream_cast(f,subStrs[selectedCols[ui]]))
506
//well, we can stream this, so it assume it is not the header.
511
//could not find any data.. only header.
512
if(CFile.eof() || curPos >=maxPos)
513
return TEXT_ERR_ONLY_HEADER;
518
//Re-open the file in binary mode to find the newlines
519
CFile.open(textFile,std::ios::binary);
522
return TEXT_ERR_REOPEN;
525
//Jump to beyond the header
529
//keep a beginning of file marker
530
newLinePositions.push_back(curPos);
531
bool seenNumeric=false;
532
buffer = new char[BUFFER_SIZE];
533
while(!CFile.eof() && curPos < maxPos)
540
return TEXT_ERR_READ_CONTENTS;
542
//read up to BUFFER_SIZE bytes from the file
543
//but only if they are available
544
bytesToRead = std::min(maxPos-curPos,(size_t)BUFFER_SIZE);
546
CFile.read(buffer,bytesToRead);
548
//check that this buffer contains numeric info
549
for(unsigned int ui=0;ui<bytesToRead; ui++)
551
//Check for a unix-style endline
552
//or the latter part of a windows endline
553
//either or, whatever.
554
if( buffer[ui] == '\n')
556
//Check that we have not hit a run of non-numeric data
558
newLinePositions.push_back(ui+curPos);
560
else if(buffer[ui] >= '0' && buffer[ui] <='9')
568
//Don't keep any newline that hits the end of the file, but do keep a zero position
569
if(newLinePositions.size())
570
newLinePositions.pop_back();
574
//OK, so now we know where those pesky endlines are. This gives us jump positions
575
//to new lines in the file. Each component must have some form of numeric data
576
//preceding it. That numeric data may not be fully parseable, but we assume it is until we know better.
578
//If it is *not* parseable, just throw an error when we find that out.
580
//If we are going to load the whole file, don't use a sampling method to do it.
581
if(limitCount >=newLinePositions.size())
585
vector<vector<float> > data;
586
vector<string> header;
588
//Just use the non-sampling method to load.
589
if(loadTextData(textFile,data,header,delim))
590
return TEXT_ERR_FORMAT;
593
return TEXT_ERR_NUM_FIELDS;
595
posIons.resize(data[0].size());
596
for(size_t ui=0;ui<data[0].size();ui++)
599
ASSERT(numColsTotal==4);
600
//This is output specific
601
//and assumes that we have exactly 4 input cols
602
//to match ot our output vector of pos ions
603
posIons[ui].setPos(Point3D(data[selectedCols[0]][ui],
604
data[selectedCols[1]][ui],
605
data[selectedCols[2]][ui]));
606
posIons[ui].setMassToCharge(data[selectedCols[3]][ui]);
612
//Generate some random positions to load
613
std::vector<size_t> dataToLoad;
616
posIons.resize(limitCount);
621
randomDigitSelection(dataToLoad,newLinePositions.size(),rng,
622
limitCount,dummy,callback,strongRandom);
624
catch(std::bad_alloc)
627
return TEXT_ERR_ALLOC_FAIL;
631
//Sort the data such that we are going to
632
//always jump forwards in the file; better disk access and whatnot.
633
GreaterWithCallback<size_t> g(callback,PROGRESS_REDUCE);
634
std::sort(dataToLoad.begin(),dataToLoad.end(),g);
636
//OK, so we have a list of newlines
637
//that we can use as entry points for random seek.
638
//We also have some random entry points (sorted).
639
// Now re-open the file in text mode and try to load the
640
// data specified at the offsets
643
//Open file in text mode
644
CFile.open(textFile);
649
return TEXT_ERR_REOPEN;
653
//OK, now jump to each of the random positions,
654
//as dictated by the endline position
655
//and attempt a parsing there.
658
for(size_t ui=0;ui<dataToLoad.size();ui++)
661
std::ios::pos_type nextIonPos;
662
//Jump to position immediately after the newline
663
nextIonPos = (newLinePositions[dataToLoad[ui]]+1);
665
if(CFile.tellg() !=nextIonPos )
666
CFile.seekg(nextIonPos);
672
//Now attempt to scan the line for the selected columns.
673
//split around whatever delimiter we can find
674
splitStrsRef(s.c_str(),delim,subStrs);
676
if(subStrs.size() < numColsTotal)
678
//FIXME: Allow skipping of bad lines
680
return TEXT_ERR_NUM_FIELDS;
685
for(size_t uj=0;uj<sortedCols.size();uj++)
688
if(stream_cast(f[uj],subStrs[sortedCols[uj]]))
690
//FIXME: Allow skipping bad lines
691
//Can't parse line.. Abort.
693
return TEXT_ERR_FORMAT;
697
ASSERT(ui<posIons.size());
698
posIons[ui].setHit(f);
708
//At this point I deliberately don't initialise the point class
709
//as in DEBUG mode, the point class will catch failure to init
712
IonHit::IonHit(const IonHit &obj2) : massToCharge(obj2.massToCharge), pos(obj2.pos)
716
IonHit::IonHit(const Point3D &p, float newMass) : massToCharge(newMass), pos(p)
720
void IonHit::setMassToCharge(float newMass)
722
massToCharge=newMass;
725
float IonHit::getMassToCharge() const
731
void IonHit::setPos(const Point3D &p)
736
#ifdef __LITTLE_ENDIAN__
737
void IonHit::switchEndian()
741
floatSwapBytes(&(massToCharge));
745
const IonHit &IonHit::operator=(const IonHit &obj)
747
massToCharge=obj.massToCharge;
753
IonHit IonHit::operator+(const Point3D &obj)
755
//FIXME: I think this is wrong???
761
void IonHit::makePosData(float *floatArr) const
764
//copy positional information
765
pos.copyValueArr(floatArr);
767
//copy mass to charge data
768
*(floatArr+3) = massToCharge;
770
#ifdef __LITTLE_ENDIAN__
771
floatSwapBytes(floatArr);
772
floatSwapBytes((floatArr+1));
773
floatSwapBytes((floatArr+2));
774
floatSwapBytes((floatArr+3));
778
Point3D IonHit::getPos() const
783
bool IonHit::hasNaN()
785
return (std::isnan(massToCharge) || std::isnan(pos[0]) ||
786
std::isnan(pos[1]) || std::isnan(pos[2]));
793
void getPointSum(const std::vector<IonHit> &points,Point3D ¢roid)
795
//TODO: Paralellise me
796
centroid=Point3D(0,0,0);
797
for(unsigned int ui=0;ui<points.size();ui++)
798
centroid+=points[ui].getPos();
801
BoundCube getIonDataLimits(const std::vector<IonHit> &points)
803
ASSERT(points.size());
806
b.setInverseLimits();
809
for(unsigned int ui=0;ui<3;ui++)
811
bounds[ui][0]=std::numeric_limits<float>::max();
812
bounds[ui][1]=-std::numeric_limits<float>::max();
815
for(unsigned int ui=0; ui<points.size(); ui++)
818
for(unsigned int uj=0; uj<3; uj++)
821
p=points[ui].getPos();
822
if(p.getValue(uj) < bounds[uj][0])
823
bounds[uj][0] = p.getValue(uj);
825
if(p.getValue(uj) > bounds[uj][1])
826
bounds[uj][1] = p.getValue(uj);
830
b.setBounds(bounds[0][0],bounds[1][0],
831
bounds[2][0],bounds[0][1],
832
bounds[1][1],bounds[2][1]);
835
vector<BoundCube> cubes;
837
unsigned int nT=omp_get_max_threads();
839
for(unsigned int ui=0;ui<cubes.size();ui++)
840
cube[ui].setInverseLimits();
842
unsigned int tCount=1;
843
#pragma omp parallel for reduction(tCount|+)
844
for(unsigned int ui=0;ui<points.size();ui++)
847
p=points[ui].getPos();
848
for(unsigned int uj=0;uj<3;uj++)
850
b.setBounds(uj,0,std::min(b.getBound(uj,0),p[uj]));
851
b.setBounds(uj,1,std::min(b.getBound(uj,0),p[uj]));
855
for(unsigned int ui=0;ui<std::min(tCount,nT);ui++)