1
/*M///////////////////////////////////////////////////////////////////////////////////////
3
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5
// By downloading, copying, installing or using the software you agree to this license.
6
// If you do not agree to this license, do not download, install,
7
// copy or use the software.
10
// Intel License Agreement
12
// Copyright (C) 2000, Intel Corporation, all rights reserved.
13
// Third party copyrights are property of their respective owners.
15
// Redistribution and use in source and binary forms, with or without modification,
16
// are permitted provided that the following conditions are met:
18
// * Redistribution's of source code must retain the above copyright notice,
19
// this list of conditions and the following disclaimer.
21
// * Redistribution's in binary form must reproduce the above copyright notice,
22
// this list of conditions and the following disclaimer in the documentation
23
// and/or other materials provided with the distribution.
25
// * The name of Intel Corporation may not be used to endorse or promote products
26
// derived from this software without specific prior written permission.
28
// This software is provided by the copyright holders and contributors "as is" and
29
// any express or implied warranties, including, but not limited to, the implied
30
// warranties of merchantability and fitness for a particular purpose are disclaimed.
31
// In no event shall the Intel Corporation or contributors be liable for any direct,
32
// indirect, incidental, special, exemplary, or consequential damages
33
// (including, but not limited to, procurement of substitute goods or services;
34
// loss of use, data, or profits; or business interruption) however caused
35
// and on any theory of liability, whether in contract, strict liability,
36
// or tort (including negligence or otherwise) arising in any way out of
37
// the use of this software, even if advised of the possibility of such damage.
41
#include "precomp.hpp"
46
namespace cv { namespace ml {
48
// Sentinel that marks an absent sample value; taken from TrainData::missingValue()
// and compared against sample entries elsewhere in this file.
static const float MISSED_VAL = TrainData::missingValue();
49
// Type tag tested against decoded CSV elements in loadCSV().
// NOTE(review): it has the same value as VAR_ORDERED, so the two cannot be told apart by value.
static const int VAR_MISSED = VAR_ORDERED;
51
// Out-of-line definition of the TrainData destructor; it has nothing to release.
TrainData::~TrainData()
{
}
53
// Gathers the entries of `vec` selected by the integer index vector `idx`
// into a matrix `subvec` created here with the same element type as `vec`.
//
// vec - source data; must be CV_32S, CV_32F or CV_64F (asserted below).
// idx - index vector, validated with checkVector(1, CV_32S); each index k is
//       asserted to satisfy 0 <= k < m.
//
// The same gather loop is written three times, once per supported element type.
Mat TrainData::getSubVector(const Mat& vec, const Mat& idx)
57
int i, j, n = idx.checkVector(1, CV_32S);
58
int type = vec.type();
59
CV_Assert( type == CV_32S || type == CV_32F || type == CV_64F );
62
// single-row or single-column input
if( vec.cols == 1 || vec.rows == 1 )
65
m = vec.cols + vec.rows - 1;
76
subvec.create(dims, n, type);
78
subvec.create(n, dims, type);
80
// CV_32S gather
for( i = 0; i < n; i++ )
82
int k = idx.at<int>(i);
83
CV_Assert( 0 <= k && k < m );
85
subvec.at<int>(i) = vec.at<int>(k);
87
for( j = 0; j < dims; j++ )
88
subvec.at<int>(i, j) = vec.at<int>(k, j);
90
// CV_32F gather
else if( type == CV_32F )
91
for( i = 0; i < n; i++ )
93
int k = idx.at<int>(i);
94
CV_Assert( 0 <= k && k < m );
96
subvec.at<float>(i) = vec.at<float>(k);
98
for( j = 0; j < dims; j++ )
99
subvec.at<float>(i, j) = vec.at<float>(k, j);
102
// remaining case handled with double accessors (CV_64F, given the type assertion above)
for( i = 0; i < n; i++ )
104
int k = idx.at<int>(i);
105
CV_Assert( 0 <= k && k < m );
107
subvec.at<double>(i) = vec.at<double>(k);
109
for( j = 0; j < dims; j++ )
110
subvec.at<double>(i, j) = vec.at<double>(k, j);
115
class TrainDataImpl : public TrainData
118
// Maps a category name (a non-numeric CSV token) to the integer id assigned to it.
typedef std::map<String, int> MapType;
126
// Closes the CSV file handle, if one is still open, when the object is destroyed.
virtual ~TrainDataImpl()
{
    closeFile();
}
128
// Returns the stored sample layout value.
int getLayout() const
{
    return layout;
}
129
int getNSamples() const
131
return !sampleIdx.empty() ? (int)sampleIdx.total() :
132
layout == ROW_SAMPLE ? samples.rows : samples.cols;
134
int getNTrainSamples() const
136
return !trainSampleIdx.empty() ? (int)trainSampleIdx.total() : getNSamples();
138
int getNTestSamples() const
140
return !testSampleIdx.empty() ? (int)testSampleIdx.total() : 0;
144
// NOTE(review): the signature that owns this statement is absent from this extract;
// presumably it is getNVars(), which other methods of this class call - confirm.
// Yields the length of varIdx when it is set, otherwise getNAllVars().
return !varIdx.empty() ? (int)varIdx.total() : getNAllVars();
146
int getNAllVars() const
148
return layout == ROW_SAMPLE ? samples.cols : samples.rows;
151
// Returns the stored sample matrix.
Mat getSamples() const
{
    return samples;
}
152
// Returns the stored response matrix.
Mat getResponses() const
{
    return responses;
}
153
// Returns the stored missing-value mask.
Mat getMissing() const
{
    return missing;
}
154
// Returns the stored variable index vector (empty when none was set).
Mat getVarIdx() const
{
    return varIdx;
}
155
// Returns the stored per-variable type vector.
Mat getVarType() const
{
    return varType;
}
156
int getResponseType() const
158
return classLabels.empty() ? VAR_ORDERED : VAR_CATEGORICAL;
160
// Indices of the training samples; falls back to sampleIdx when no
// explicit training subset has been stored.
Mat getTrainSampleIdx() const
{
    if( trainSampleIdx.empty() )
        return sampleIdx;
    return trainSampleIdx;
}
161
// Returns the stored test-sample index vector (empty when there is no test subset).
Mat getTestSampleIdx() const
{
    return testSampleIdx;
}
162
// Returns the stored per-sample weight vector.
Mat getSampleWeights() const
{
    return sampleWeights;
}
166
// Weights of the training samples, gathered from sampleWeights through
// the training index vector.
Mat getTrainSampleWeights() const
{
    const Mat trainIdx = getTrainSampleIdx();
    return getSubVector(sampleWeights, trainIdx);
}
170
// Weights of the test samples; an empty Mat when there is no test subset.
Mat getTestSampleWeights() const
{
    const Mat testIdx = getTestSampleIdx();
    if( testIdx.empty() )
        return Mat();
    return getSubVector(sampleWeights, testIdx);
}
175
// Responses of the training samples, gathered through the training index vector.
Mat getTrainResponses() const
{
    const Mat trainIdx = getTrainSampleIdx();
    return getSubVector(responses, trainIdx);
}
179
// Normalized categorical responses of the training samples, gathered
// through the training index vector.
Mat getTrainNormCatResponses() const
{
    const Mat trainIdx = getTrainSampleIdx();
    return getSubVector(normCatResponses, trainIdx);
}
183
// Responses of the test samples; an empty Mat when there is no test subset.
Mat getTestResponses() const
{
    const Mat testIdx = getTestSampleIdx();
    if( testIdx.empty() )
        return Mat();
    return getSubVector(responses, testIdx);
}
188
// Normalized categorical responses of the test samples; an empty Mat when
// there is no test subset.
Mat getTestNormCatResponses() const
{
    const Mat testIdx = getTestSampleIdx();
    if( testIdx.empty() )
        return Mat();
    return getSubVector(normCatResponses, testIdx);
}
193
// Returns the stored normalized categorical responses.
Mat getNormCatResponses() const
{
    return normCatResponses;
}
194
// Returns the stored class labels.
Mat getClassLabels() const
{
    return classLabels;
}
195
// Returns the stored per-class counters.
Mat getClassCounters() const
{
    return classCounters;
}
196
int getCatCount(int vi) const
198
int n = (int)catOfs.total();
199
CV_Assert( 0 <= vi && vi < n );
200
Vec2i ofs = catOfs.at<Vec2i>(vi);
201
return ofs[1] - ofs[0];
204
// Returns the stored per-variable category offsets.
Mat getCatOfs() const
{
    return catOfs;
}
205
// Returns the stored category map.
Mat getCatMap() const
{
    return catMap;
}
207
// Returns the per-variable values used to substitute missing entries.
Mat getDefaultSubstValues() const
{
    return missingSubst;
}
209
// Closes the CSV file handle if one is open, then resets it to null so a
// repeated call is harmless.
void closeFile()
{
    if( file )
        fclose(file);
    file = 0;
}
218
// NOTE(review): the signature and the first statements of this routine are absent
// from this extract; presumably this is the tail of a reset/clear method - confirm.
// The visible part drops the train/test split and the derived categorical-response data.
trainSampleIdx.release();
219
testSampleIdx.release();
220
normCatResponses.release();
221
classLabels.release();
222
classCounters.release();
229
// Maps a category-map hash value to a variable index; setData() uses it to look up
// previously built category maps that might be reused.
typedef std::map<int, int> CatMapHash;
231
// Installs a complete data set into this object and derives the auxiliary tables.
//
// _samples       - sample matrix; must be CV_32F or CV_32S.
// _layout        - ROW_SAMPLE or COL_SAMPLE (asserted).
// _responses     - optional responses; CV_32F or CV_32S when present.
// _varIdx        - optional active-variable selection: CV_32S indices or a CV_8U mask.
// _sampleIdx     - optional sample selection: CV_32S indices or a CV_8U mask.
// _sampleWeights - optional CV_32F weights, one per sample; defaults to all ones.
// _varType       - optional CV_8U type per variable (inputs followed by outputs);
//                  defaults to ordered, except a single integer-typed response,
//                  which is marked categorical.
// _missing       - optional CV_8U mask with the same size as the samples.
//
// After validation the routine walks the input variables, builds a label map for
// each categorical one (sharing identical maps), and finally normalizes categorical
// responses into normCatResponses / classLabels / classCounters.
void setData(InputArray _samples, int _layout, InputArray _responses,
232
InputArray _varIdx, InputArray _sampleIdx, InputArray _sampleWeights,
233
InputArray _varType, InputArray _missing)
237
CV_Assert(_layout == ROW_SAMPLE || _layout == COL_SAMPLE );
238
samples = _samples.getMat();
240
responses = _responses.getMat();
241
varIdx = _varIdx.getMat();
242
sampleIdx = _sampleIdx.getMat();
243
sampleWeights = _sampleWeights.getMat();
244
varType = _varType.getMat();
245
missing = _missing.getMat();
247
int nsamples = layout == ROW_SAMPLE ? samples.rows : samples.cols;
248
int ninputvars = layout == ROW_SAMPLE ? samples.cols : samples.rows;
249
int i, noutputvars = 0;
251
CV_Assert( samples.type() == CV_32F || samples.type() == CV_32S );
253
// sample selection: either in-range integer indices or a byte mask of length nsamples
if( !sampleIdx.empty() )
255
CV_Assert( (sampleIdx.checkVector(1, CV_32S, true) > 0 &&
256
checkRange(sampleIdx, true, 0, 0, nsamples)) ||
257
sampleIdx.checkVector(1, CV_8U, true) == nsamples );
258
if( sampleIdx.type() == CV_8U )
259
sampleIdx = convertMaskToIdx(sampleIdx);
262
if( !sampleWeights.empty() )
264
CV_Assert( sampleWeights.checkVector(1, CV_32F, true) == nsamples );
268
// default weighting: every sample gets weight 1
sampleWeights = Mat::ones(nsamples, 1, CV_32F);
271
// variable selection: same two accepted forms as the sample selection
if( !varIdx.empty() )
273
CV_Assert( (varIdx.checkVector(1, CV_32S, true) > 0 &&
274
checkRange(varIdx, true, 0, 0, ninputvars)) ||
275
varIdx.checkVector(1, CV_8U, true) == ninputvars );
276
if( varIdx.type() == CV_8U )
277
varIdx = convertMaskToIdx(varIdx);
278
// work on a private copy so that sorting does not touch the caller's array
varIdx = varIdx.clone();
279
std::sort(varIdx.ptr<int>(), varIdx.ptr<int>() + varIdx.total());
282
if( !responses.empty() )
284
CV_Assert( responses.type() == CV_32F || responses.type() == CV_32S );
285
// a plain vector with one entry per sample
if( (responses.cols == 1 || responses.rows == 1) && (int)responses.total() == nsamples )
289
CV_Assert( (layout == ROW_SAMPLE && responses.rows == nsamples) ||
290
(layout == COL_SAMPLE && responses.cols == nsamples) );
291
noutputvars = layout == ROW_SAMPLE ? responses.cols : responses.rows;
293
if( !responses.isContinuous() || (layout == COL_SAMPLE && noutputvars > 1) )
296
transpose(responses, temp);
301
int nvars = ninputvars + noutputvars;
303
if( !varType.empty() )
305
CV_Assert( varType.checkVector(1, CV_8U, true) == nvars &&
306
checkRange(varType, true, 0, VAR_ORDERED, VAR_CATEGORICAL+1) );
310
// no types supplied: everything is ordered ...
varType.create(1, nvars, CV_8U);
311
varType = Scalar::all(VAR_ORDERED);
312
// ... except a single response of integer element type, which is taken as categorical
if( noutputvars == 1 )
313
varType.at<uchar>(ninputvars) = (uchar)(responses.type() < CV_32F ? VAR_CATEGORICAL : VAR_ORDERED);
316
// multi-valued responses must all be ordered
if( noutputvars > 1 )
318
for( i = 0; i < noutputvars; i++ )
319
CV_Assert( varType.at<uchar>(ninputvars + i) == VAR_ORDERED );
322
catOfs = Mat::zeros(1, nvars, CV_32SC2);
323
missingSubst = Mat::zeros(1, nvars, CV_32F);
325
vector<int> labels, counters, sortbuf, tempCatMap;
326
vector<Vec2i> tempCatOfs;
329
AutoBuffer<uchar> buf(nsamples);
330
Mat non_missing(layout == ROW_SAMPLE ? Size(1, nsamples) : Size(nsamples, 1), CV_8U, (uchar*)buf);
331
bool haveMissing = !missing.empty();
334
CV_Assert( missing.size() == samples.size() && missing.type() == CV_8U );
337
// Iterate over all input variables. For each categorical variable, build a map
338
// that converts its input values into normalized values (0..catcount_vi-1).
339
// Categorical variables often share the same value set, so the maps are compressed:
340
// an identical map built for an earlier variable is re-used instead of stored again.
341
for( i = 0; i < ninputvars; i++ )
343
Mat values_i = layout == ROW_SAMPLE ? samples.col(i) : samples.row(i);
345
if( varType.at<uchar>(i) == VAR_CATEGORICAL )
347
preprocessCategorical(values_i, 0, labels, 0, sortbuf);
348
missingSubst.at<float>(i) = -1.f;
349
int j, m = (int)labels.size();
351
int a = labels.front(), b = labels.back();
352
const int* currmap = &labels[0];
353
// hash of (first label, last label, label count), used to find candidate duplicates
int hashval = ((unsigned)a*127 + (unsigned)b)*127 + m;
354
CatMapHash::iterator it = ofshash.find(hashval);
355
if( it != ofshash.end() )
358
Vec2i ofs0 = tempCatOfs[vi];
359
int m0 = ofs0[1] - ofs0[0];
360
const int* map0 = &tempCatMap[ofs0[0]];
361
// a hash hit is only a candidate: compare size and end points, then every element
if( m0 == m && map0[0] == a && map0[m0-1] == b )
363
for( j = 0; j < m; j++ )
364
if( map0[j] != currmap[j] )
369
tempCatOfs.push_back(ofs0);
375
ofshash[hashval] = i;
377
ofs[0] = (int)tempCatMap.size();
379
tempCatOfs.push_back(ofs);
380
std::copy(labels.begin(), labels.end(), std::back_inserter(tempCatMap));
384
tempCatOfs.push_back(Vec2i(0, 0));
385
/*Mat missing_i = layout == ROW_SAMPLE ? missing.col(i) : missing.row(i);
386
compare(missing_i, Scalar::all(0), non_missing, CMP_EQ);
387
missingSubst.at<float>(i) = (float)(mean(values_i, non_missing)[0]);*/
388
missingSubst.at<float>(i) = 0.f;
392
if( !tempCatOfs.empty() )
394
Mat(tempCatOfs).copyTo(catOfs);
395
Mat(tempCatMap).copyTo(catMap);
398
// categorical response: compute normalized labels, the label list and per-label counts
if( varType.at<uchar>(ninputvars) == VAR_CATEGORICAL )
400
preprocessCategorical(responses, &normCatResponses, labels, &counters, sortbuf);
401
Mat(labels).copyTo(classLabels);
402
Mat(counters).copyTo(classCounters);
406
// Converts a byte mask (single row or single column) into a 1 x nz CV_32S vector
// holding the positions of its non-zero entries in increasing order.
// NOTE(review): the return statement is not visible in this extract; presumably `idx` is returned.
Mat convertMaskToIdx(const Mat& mask)
408
int i, j, nz = countNonZero(mask), n = mask.cols + mask.rows - 1;
409
Mat idx(1, nz, CV_32S);
410
// i scans the mask, j is the next free slot in idx
for( i = j = 0; i < n; i++ )
411
if( mask.at<uchar>(i) )
412
idx.at<int>(j++) = i;
418
// Remembers the array to look values up in and the stride between its elements.
CmpByIdx(const int* _data, int _step)
    : data(_data),
      step(_step)
{
}
419
// Orders two positions by the values they refer to in the strided array.
bool operator ()(int i, int j) const
{
    const int lhs = data[i*step];
    const int rhs = data[j*step];
    return lhs < rhs;
}
424
// Collects the distinct integer labels that occur in a categorical vector.
//
// data     - single-row or single-column CV_32S or CV_32F vector (asserted);
//            float entries must hold exact integers (asserted per element).
// normdata - optional output; when given it is created as CV_32S with the size of
//            `data` and receives, per element, the index of its label.
// labels   - output; resized to the number of distinct labels and filled with them.
// counters - optional output; resized like `labels` and filled with per-label counts.
// sortbuf  - scratch storage; its beginning is used as the array of sort indices.
void preprocessCategorical(const Mat& data, Mat* normdata, vector<int>& labels,
425
vector<int>* counters, vector<int>& sortbuf)
427
CV_Assert((data.cols == 1 || data.rows == 1) && (data.type() == CV_32S || data.type() == CV_32F));
433
normdata->create(data.size(), CV_32S);
434
odata = normdata->ptr<int>();
435
ostep = normdata->isContinuous() ? 1 : (int)normdata->step1();
438
int i, n = data.cols + data.rows - 1;
440
int* idx = &sortbuf[0];
441
int* idata = (int*)data.ptr<int>();
442
int istep = data.isContinuous() ? 1 : (int)data.step1();
444
// float input: convert each value to an integer and verify that nothing is lost
if( data.type() == CV_32F )
447
const float* fdata = data.ptr<float>();
448
for( i = 0; i < n; i++ )
450
if( fdata[i*istep] == MISSED_VAL )
454
idata[i] = cvRound(fdata[i*istep]);
455
CV_Assert( (float)idata[i] == fdata[i*istep] );
461
for( i = 0; i < n; i++ )
464
// sort positions by the value they refer to, leaving the values themselves in place
std::sort(idx, idx + n, CmpByIdx(idata, istep));
467
// each change between neighbours in sorted order starts a new label
for( i = 1; i < n; i++ )
468
clscount += idata[idx[i]*istep] != idata[idx[i-1]*istep];
471
// start from a value that differs from the first label
int prev = ~idata[idx[0]*istep];
474
labels.resize(clscount);
476
counters->resize(clscount);
478
for( i = 0; i < n; i++ )
480
int l = idata[idx[i]*istep];
484
labels[clslabel] = l;
486
if( clslabel > 0 && counters )
487
counters->at(clslabel-1) = k;
492
odata[idx[i]*ostep] = clslabel;
495
counters->at(clslabel) = i - previdx;
498
// Reads a delimited text file into temporary sample/response matrices and passes
// them to setData() with ROW_SAMPLE layout.
//
// filename         - path of the file, opened in text mode.
// headerLines      - compared against the running line number (presumably to skip
//                    leading header lines - the skipping statement is not visible here).
// responseStartIdx - first response column; -1 selects the last column, any other
//                    negative value means "no responses".
// responseEndIdx   - one past the last response column; when negative it defaults to
//                    responseStartIdx+1 if responses exist.
// varTypeSpec      - optional type specification handed to setVarTypes().
// delimiter        - token separator used in addition to the space character.
// missch           - character that denotes a missing value (forwarded to decodeElem()).
//
// NOTE(review): the return statement is not visible in this extract; presumably the
// result is `ok`, i.e. whether any samples were loaded - confirm.
bool loadCSV(const String& filename, int headerLines,
499
int responseStartIdx, int responseEndIdx,
500
const String& varTypeSpec, char delimiter, char missch)
502
// capacity of the line buffer handed to fgets()
const int M = 1000000;
503
// separator set for strtok(): a space and the caller-selected delimiter
const char delimiters[3] = { ' ', delimiter, '\0' };
505
bool varTypesSet = false;
509
file = fopen( filename.c_str(), "rt" );
514
std::vector<char> _buf(M);
515
std::vector<float> allresponses;
516
std::vector<float> rowvals;
517
std::vector<uchar> vtypes, rowtypes;
518
bool haveMissed = false;
519
char* buf = &_buf[0];
521
int i, ridx0 = responseStartIdx, ridx1 = responseEndIdx;
522
int ninputvars = 0, noutputvars = 0;
524
Mat tempSamples, tempMissing, tempResponses;
532
if( !fgets(buf, M, file) )
534
if(lineno < headerLines )
536
// trim trailing whitespace
537
int idx = (int)strlen(buf)-1;
538
while( idx >= 0 && isspace(buf[idx]) )
540
// skip leading whitespace
542
while( *ptr != '\0' && isspace(*ptr) )
544
// skip commented-out lines
550
char* token = strtok(buf, delimiters);
556
float val=0.f; int tp = 0;
557
decodeElem( token, val, tp, missch, tempNameMap, catCounter );
558
if( tp == VAR_MISSED )
560
rowvals.push_back(val);
561
rowtypes.push_back((uchar)tp);
562
token = strtok(NULL, delimiters);
569
if( rowvals.empty() )
570
CV_Error(CV_StsBadArg, "invalid CSV format; no data found");
571
nvars = (int)rowvals.size();
572
// NOTE(review): the size() > 0 test repeats what !empty() already established
if( !varTypeSpec.empty() && varTypeSpec.size() > 0 )
574
setVarTypes(varTypeSpec, nvars, vtypes);
580
// resolve the response column range [ridx0, ridx1)
ridx0 = ridx0 >= 0 ? ridx0 : ridx0 == -1 ? nvars - 1 : -1;
581
ridx1 = ridx1 >= 0 ? ridx1 : ridx0 >= 0 ? ridx0+1 : -1;
582
CV_Assert(ridx1 > ridx0);
583
noutputvars = ridx0 >= 0 ? ridx1 - ridx0 : 0;
584
ninputvars = nvars - noutputvars;
587
CV_Assert( nvars == (int)rowvals.size() );
590
// each column must keep its type; with an explicit type spec an ordered value is also accepted
for( i = 0; i < nvars; i++ )
592
CV_Assert( (!varTypesSet && vtypes[i] == rowtypes[i]) ||
593
(varTypesSet && (vtypes[i] == rowtypes[i] || rowtypes[i] == VAR_ORDERED)) );
598
// swap the columns that follow the response range with the ones noutputvars to their left
for( i = ridx1; i < nvars; i++ )
599
std::swap(rowvals[i], rowvals[i-noutputvars]);
600
for( i = ninputvars; i < nvars; i++ )
601
allresponses.push_back(rowvals[i]);
604
// wrap the first ninputvars values of the row (no copy) and append them as a sample
Mat rmat(1, ninputvars, CV_32F, &rowvals[0]);
605
tempSamples.push_back(rmat);
610
int nsamples = tempSamples.rows;
615
// mask of entries equal to the missing-value sentinel
compare(tempSamples, MISSED_VAL, tempMissing, CMP_EQ);
619
// apply the same column reordering to the variable types
for( i = ridx1; i < nvars; i++ )
620
std::swap(vtypes[i], vtypes[i-noutputvars]);
621
if( noutputvars > 1 )
623
for( i = ninputvars; i < nvars; i++ )
624
if( vtypes[i] == VAR_CATEGORICAL )
625
CV_Error(CV_StsBadArg,
626
"If responses are vector values, not scalars, they must be marked as ordered responses");
630
// without an explicit type spec, a single ordered response is inspected for non-integer values
if( !varTypesSet && noutputvars == 1 && vtypes[ninputvars] == VAR_ORDERED )
632
for( i = 0; i < nsamples; i++ )
633
if( allresponses[i] != cvRound(allresponses[i]) )
636
vtypes[ninputvars] = VAR_CATEGORICAL;
639
// If the CSV file contains responses, store them; otherwise the response matrix is filled with zeros
640
if (noutputvars != 0){
641
Mat(nsamples, noutputvars, CV_32F, &allresponses[0]).copyTo(tempResponses);
642
setData(tempSamples, ROW_SAMPLE, tempResponses, noArray(), noArray(),
643
noArray(), Mat(vtypes).clone(), tempMissing);
646
Mat zero_mat(nsamples, 1, CV_32F, Scalar(0));
647
zero_mat.copyTo(tempResponses);
648
setData(tempSamples, ROW_SAMPLE, tempResponses, noArray(), noArray(),
649
noArray(), noArray(), tempMissing);
651
bool ok = !samples.empty();
653
std::swap(tempNameMap, nameMap);
657
// Decodes one CSV token into a numeric value and a variable type.
//
// token   - text of the element.
// elem    - output value: the number parsed by strtod(), or the id of a category name.
// type    - output type tag for the element.
// missch  - character that denotes a missing value.
// namemap - category name -> id table; extended when a new name is encountered.
// counter - next free category id; incremented when a new name is registered.
void decodeElem( const char* token, float& elem, int& type,
658
char missch, MapType& namemap, int& counter ) const
660
char* stopstring = NULL;
661
elem = (float)strtod( token, &stopstring );
662
// the unparsed remainder is exactly the one-character missing-value marker
if( *stopstring == missch && strlen(stopstring) == 1 ) // missing value
667
// unparsed text remains: treat the whole token as a category name
else if( *stopstring != '\0' )
669
MapType::iterator it = namemap.find(token);
670
if( it == namemap.end() )
672
elem = (float)counter;
673
namemap[token] = counter++;
676
elem = (float)it->second;
677
type = VAR_CATEGORICAL;
683
// Parses a variable-type specification string and fills `vtypes` accordingly.
//
// s      - specification; the accepted formats are listed in `errmsg` below.
// nvars  - number of variables; `vtypes` is resized to this length.
// vtypes - output: one type tag (VAR_ORDERED or VAR_CATEGORICAL) per variable.
//
// Raises CV_StsBadArg for a malformed specification or when the ranges do not
// account for all `nvars` variables; index bounds are checked with CV_Assert.
void setVarTypes( const String& s, int nvars, std::vector<uchar>& vtypes ) const
685
const char* errmsg = "type spec is not correct; it should have format \"cat\", \"ord\" or "
686
"\"ord[n1,n2-n3,n4-n5,...]cat[m1-m2,m3,m4-m5,...]\", where n's and m's are 0-based variable indices";
687
const char* str = s.c_str();
690
vtypes.resize(nvars);
692
// pass 0 handles the "ord" section, pass 1 the "cat" section
for( int k = 0; k < 2; k++ )
694
const char* ptr = strstr(str, k == 0 ? "ord" : "cat");
695
int tp = k == 0 ? VAR_ORDERED : VAR_CATEGORICAL;
696
if( ptr ) // parse ord/cat str
698
char* stopstring = NULL;
702
for( int i = 0; i < nvars; i++ )
703
vtypes[i] = (uchar)tp;
709
CV_Error( CV_StsBadArg, errmsg );
711
ptr += 4; // skip past "ord[" (or "cat[" on the second pass)
714
int b1 = (int)strtod( ptr, &stopstring );
715
// a number must be followed by ',', ']' or '-'
if( *stopstring == 0 || (*stopstring != ',' && *stopstring != ']' && *stopstring != '-') )
716
CV_Error( CV_StsBadArg, errmsg );
717
ptr = stopstring + 1;
718
// single index
if( (stopstring[0] == ',') || (stopstring[0] == ']'))
720
CV_Assert( 0 <= b1 && b1 < nvars );
721
vtypes[b1] = (uchar)tp;
726
// inclusive range b1-b2
if( stopstring[0] == '-')
728
int b2 = (int)strtod( ptr, &stopstring);
729
if ( (*stopstring == 0) || (*stopstring != ',' && *stopstring != ']') )
730
CV_Error( CV_StsBadArg, errmsg );
731
ptr = stopstring + 1;
732
CV_Assert( 0 <= b1 && b1 <= b2 && b2 < nvars );
733
for (int i = b1; i <= b2; i++)
734
vtypes[i] = (uchar)tp;
735
specCounter += b2 - b1 + 1;
738
CV_Error( CV_StsBadArg, errmsg );
742
while(*stopstring != ']');
744
// only the end of the string or a ',' may follow the closing bracket
if( stopstring[1] != '\0' && stopstring[1] != ',')
745
CV_Error( CV_StsBadArg, errmsg );
749
// the counted entries must add up to the number of variables
if( specCounter != nvars )
750
CV_Error( CV_StsBadArg, "type of some variables is not specified" );
753
void setTrainTestSplitRatio(double ratio, bool shuffle)
755
CV_Assert( 0. <= ratio && ratio <= 1. );
756
setTrainTestSplit(cvRound(getNSamples()*ratio), shuffle);
759
// Partitions the samples in use into a training part of `count` samples and a
// test part holding the rest, rebuilding trainSampleIdx and testSampleIdx.
//
// count   - requested number of training samples; asserted to be in [0, nsamples).
// shuffle - NOTE(review): its use is not visible in this extract; presumably it
//           triggers shuffleTrainTest() after the split - confirm.
void setTrainTestSplit(int count, bool shuffle)
761
int i, nsamples = getNSamples();
762
CV_Assert( 0 <= count && count < nsamples );
764
// discard any previous split
trainSampleIdx.release();
765
testSampleIdx.release();
768
trainSampleIdx = sampleIdx;
769
// NOTE(review): the assertion above requires count < nsamples, so this branch
// looks unreachable - confirm.
else if( count == nsamples )
770
testSampleIdx = sampleIdx;
773
// mask whose first `count` entries are 1 and the rest 0
Mat mask(1, nsamples, CV_8U);
774
uchar* mptr = mask.ptr();
775
for( i = 0; i < nsamples; i++ )
776
mptr[i] = (uchar)(i < count);
777
trainSampleIdx.create(1, count, CV_32S);
778
testSampleIdx.create(1, nsamples - count, CV_32S);
780
// go through sampleIdx when a subset is active, otherwise use positions directly
const int* sptr = !sampleIdx.empty() ? sampleIdx.ptr<int>() : 0;
781
int* trainptr = trainSampleIdx.ptr<int>();
782
int* testptr = testSampleIdx.ptr<int>();
783
for( i = 0; i < nsamples; i++ )
785
int idx = sptr ? sptr[i] : i;
787
trainptr[j0++] = idx;
796
// Randomly exchanges entries of the train and test index vectors.
// Does nothing unless both a train and a test subset exist. Performs `nsamples`
// swaps between two random positions, each drawn from [0, nsamples).
void shuffleTrainTest()
798
if( !trainSampleIdx.empty() && !testSampleIdx.empty() )
800
int i, nsamples = getNSamples(), ntrain = getNTrainSamples(), ntest = getNTestSamples();
801
int* trainIdx = trainSampleIdx.ptr<int>();
802
int* testIdx = testSampleIdx.ptr<int>();
805
for( i = 0; i < nsamples; i++)
807
int a = rng.uniform(0, nsamples);
808
int b = rng.uniform(0, nsamples);
809
// both positions initially refer to the training vector
// NOTE(review): the statements that redirect a position to testIdx are not
// visible in this extract; only the range assertions on `a` and `b` remain.
int* ptra = trainIdx;
810
int* ptrb = trainIdx;
815
CV_Assert( a < ntest );
821
CV_Assert( b < ntest );
823
std::swap(ptra[a], ptrb[b]);
828
// Builds a CV_32F matrix that contains the training samples restricted to the
// active variables, arranged in the requested layout.
//
// _layout         - layout of the result; for COL_SAMPLE the roles of rows and
//                   columns are exchanged before copying.
// compressSamples - take part in the early check below together with the sample
// compressVars      and variable index vectors (the outcome of that check is not
//                   visible in this extract).
//
// NOTE(review): the return statements are not visible here; presumably `dsamples`
// is returned after the copy loop - confirm.
Mat getTrainSamples(int _layout,
829
bool compressSamples,
830
bool compressVars) const
832
if( samples.empty() )
835
if( (!compressSamples || (trainSampleIdx.empty() && sampleIdx.empty())) &&
836
(!compressVars || varIdx.empty()) &&
840
int drows = getNTrainSamples(), dcols = getNVars();
841
Mat sidx = getTrainSampleIdx(), vidx = getVarIdx();
842
const float* src0 = samples.ptr<float>();
843
const int* sptr = !sidx.empty() ? sidx.ptr<int>() : 0;
844
const int* vptr = !vidx.empty() ? vidx.ptr<int>() : 0;
845
// element strides for stepping to the next sample / next variable in `samples`
size_t sstep0 = samples.step/samples.elemSize();
846
size_t sstep = layout == ROW_SAMPLE ? sstep0 : 1;
847
size_t vstep = layout == ROW_SAMPLE ? 1 : sstep0;
849
// column-sample output: swap the roles so the same loop below fills it
if( _layout == COL_SAMPLE )
851
std::swap(drows, dcols);
852
std::swap(sptr, vptr);
853
std::swap(sstep, vstep);
856
Mat dsamples(drows, dcols, CV_32F);
858
for( int i = 0; i < drows; i++ )
860
const float* src = src0 + (sptr ? sptr[i] : i)*sstep;
861
float* dst = dsamples.ptr<float>(i);
863
for( int j = 0; j < dcols; j++ )
864
dst[j] = src[(vptr ? vptr[j] : j)*vstep];
870
// Copies the values of variable `vi` for a set of samples into `values`.
//
// vi     - variable index; asserted to be in [0, getNAllVars()).
// _sidx  - sample indices, validated with checkVector(1, CV_32S); each index used
//          is asserted to be in [0, nsamples).
// values - output buffer with one float per requested sample.
//
// NOTE(review): the statement guarded by the MISSED_VAL test is not visible in this
// extract; presumably the entry is replaced by `subst` - confirm.
void getValues( int vi, InputArray _sidx, float* values ) const
872
Mat sidx = _sidx.getMat();
873
int i, n = sidx.checkVector(1, CV_32S), nsamples = getNSamples();
874
CV_Assert( 0 <= vi && vi < getNAllVars() );
876
const int* s = n > 0 ? sidx.ptr<int>() : 0;
880
// element strides for stepping to the next sample / next variable in `samples`
size_t step = samples.step/samples.elemSize();
881
size_t sstep = layout == ROW_SAMPLE ? step : 1;
882
size_t vstep = layout == ROW_SAMPLE ? 1 : step;
884
const float* src = samples.ptr<float>() + vi*vstep;
885
float subst = missingSubst.at<float>(vi);
886
for( i = 0; i < n; i++ )
892
CV_Assert( 0 <= j && j < nsamples );
894
values[i] = src[j*sstep];
895
if( values[i] == MISSED_VAL )
900
// Fetches the values of categorical variable `vi` for the given samples and
// looks each one up in the variable's category map.
//
// vi     - variable index; it must have a non-empty category map (asserted).
// _sidx  - sample indices, forwarded to getValues().
// values - output buffer; it is first used as float storage for the raw values
//          returned by getValues().
//
// NOTE(review): the statements that write the final results are not visible in this
// extract; presumably each value is replaced by its position in the map - confirm.
void getNormCatValues( int vi, InputArray _sidx, int* values ) const
902
float* fvalues = (float*)values;
903
getValues(vi, _sidx, fvalues);
904
int i, n = (int)_sidx.total();
905
Vec2i ofs = catOfs.at<Vec2i>(vi);
906
int m = ofs[1] - ofs[0];
908
CV_Assert( m > 0 ); // if m==0, vi is an ordered variable
909
const int* cmap = &catMap.at<int>(ofs[0]);
910
// true when the span between the first and last map entry equals the entry count,
// so a position can be computed as val - cmap[0]
bool fastMap = (m == cmap[m - 1] - cmap[0] + 1);
914
for( i = 0; i < n; i++ )
916
int val = cvRound(fvalues[i]);
917
int idx = val - cmap[0];
918
CV_Assert(cmap[idx] == val);
924
// general case: search the map within [a, b) (presumably a binary search - confirm)
for( i = 0; i < n; i++ )
926
int val = cvRound(fvalues[i]);
927
int a = 0, b = m, c = -1;
934
else if( val > cmap[c] )
940
CV_DbgAssert( c >= 0 && val == cmap[c] );
946
// Copies the values of one sample for a set of variables into `buf`.
//
// _vidx - variable indices, validated with checkVector(1, CV_32S); each index used
//         is asserted to be in [0, getNAllVars()).
// sidx  - sample index; asserted to be in [0, getNSamples()).
// buf   - non-null output buffer with one float per requested variable.
void getSample(InputArray _vidx, int sidx, float* buf) const
948
CV_Assert(buf != 0 && 0 <= sidx && sidx < getNSamples());
949
Mat vidx = _vidx.getMat();
950
int i, n = vidx.checkVector(1, CV_32S), nvars = getNAllVars();
952
const int* vptr = n > 0 ? vidx.ptr<int>() : 0;
956
// element strides for stepping to the next sample / next variable in `samples`
size_t step = samples.step/samples.elemSize();
957
size_t sstep = layout == ROW_SAMPLE ? step : 1;
958
size_t vstep = layout == ROW_SAMPLE ? 1 : step;
960
const float* src = samples.ptr<float>() + sidx*sstep;
961
for( i = 0; i < n; i++ )
967
CV_Assert( 0 <= j && j < nvars );
969
buf[i] = src[j*vstep];
975
// Core data: sample matrix, missing-value mask, per-variable types, active-variable
// indices, responses, and per-variable substitutes for missing values.
Mat samples, missing, varType, varIdx, responses, missingSubst;
976
// Index vectors: the overall sample subset and the train/test partitions.
Mat sampleIdx, trainSampleIdx, testSampleIdx;
977
// Per-sample weights, the concatenated category maps, and the [begin, end) range
// of each variable inside catMap.
Mat sampleWeights, catMap, catOfs;
978
// Categorical response data: normalized responses, the distinct labels and per-label counts.
Mat normCatResponses, classLabels, classCounters;
982
// Factory: creates a TrainDataImpl and asks it to load the given CSV file,
// forwarding all arguments to TrainDataImpl::loadCSV().
// NOTE(review): part of the parameter list and the handling of a failed load are
// not visible in this extract.
Ptr<TrainData> TrainData::loadFromCSV(const String& filename,
984
int responseStartIdx,
986
const String& varTypeSpec,
987
char delimiter, char missch)
989
Ptr<TrainDataImpl> td = makePtr<TrainDataImpl>();
990
if(!td->loadCSV(filename, headerLines, responseStartIdx, responseEndIdx, varTypeSpec, delimiter, missch))
995
Ptr<TrainData> TrainData::create(InputArray samples, int layout, InputArray responses,
996
InputArray varIdx, InputArray sampleIdx, InputArray sampleWeights,
999
Ptr<TrainDataImpl> td = makePtr<TrainDataImpl>();
1000
td->setData(samples, layout, responses, varIdx, sampleIdx, sampleWeights, varType, noArray());