2
# -*- coding: utf-8 -*-
5
## Here we aim at testing and comparing the performance of different indexing technologies
8
import sys, os, time, pickle
9
from pycvf.core.errors import *
10
from pycvf.core.experiment import Experiment
12
import numpy, sys, time,os,gc
13
import cPickle as pickle
14
from pycvf.core import genericnode
15
from pycvf.lib.info import pmap
16
from pycvf.datatypes import image
17
from pycvf.core.builders import *
19
from pycvf.indexes.pseudoincremental import PseudoIncrementalIndex
20
from pycvf.indexes.load_index import CachedIndex
21
from pycvf.indexes.sashindex import SashIndex
22
from pycvf.databases.limit import DB as limit
23
from pycvf.databases.randomized import DB as randomized
24
from pycvf.nodes.free import Model as free
25
from pycvf.nodes.vectorset.index_build import Model as index_build
26
from pycvf.nodes.vectorset.index_query import Model as index_query
28
from pycvf.apps.model_run import MdlRunner
32
#import matplotlib.axes3d as p3
33
import mpl_toolkits.mplot3d.axes3d as p3
36
def pycvf_model_run(*args,**kwargs):
39
return C.call(*args,**kwargs)
43
### We need to create different databases; for near-duplicate detection we have:
47
# image / image+impairment
48
# shots / shots+impairment (time offset in video)
57
dbwang="""aggregated_database([
58
transformed(limit(classical_db('WANG'),{l}),LN('image.deteriorate.noise')),
59
transformed(limit(classical_db('WANG'),{l}),LN('image.deteriorate.jpegcompress')),
60
transformed(limit(classical_db('WANG'),{l}),LN('image.deteriorate.blackframe')),
61
transformed(limit(classical_db('WANG'),{l}),LN('image.deteriorate.randwarp')),
65
DATABASES_AND_LABELS={
67
"traindb(limit(classical_db('SZuBuD25'),{l}),0.8)",
68
"testdb(limit(classical_db('SZuBuD25'),{l}),0.8)",
69
("default", "lambda x:x[0]")
72
"traindb(limit(classical_db('SZuBuD50'),{l}),0.8)",
73
"testdb(limit(classical_db('SZuBuD50'),{l}),0.8)",
74
("default", "lambda x:x[0]")
77
"traindb(limit(classical_db('SZuBuD101'),{l}),0.8)",
78
"testdb(limit(classical_db('SZuBuD101'),{l}),0.8)",
79
("default", "lambda x:x[0]")
82
"traindb(limit(classical_db('SZuBuD202'),{l}),0.8)",
83
"testdb(limit(classical_db('SZuBuD202'),{l}),0.8)",
84
("default", "lambda x:x[0]")
87
"traindb(limit(classical_db('ALOIILL_red4'),12*12),0.8)",
88
"testdb(limit(classical_db('ALOIILL_red4'),12*12),0.8)",
89
("default", "lambda x:x[0]")
92
"traindb(limit(classical_db('ALOIILL_red4'),24*12),0.8)",
93
"testdb(limit(classical_db('ALOIILL_red4'),24*12),0.8)",
94
("default", "lambda x:x[0]")
97
"traindb(%s,0.8)"%(dbwang.format(l="12"),),
98
"testdb(%s,0.8)"%(dbwang.format(l="12"),),
99
("default", "lambda x:x[0]")
102
"traindb(%s,0.8)"%(dbwang.format(l="25"),),
103
"testdb(%s,0.8)"%(dbwang.format(l="25"),),
104
("default", "lambda x:x[0]")
108
"traindb(%s,0.8)"%(dbwang.format(l="50"),),
109
"testdb(%s,0.8)"%(dbwang.format(l="50"),),
110
("default", "lambda x:x[0]")
114
"traindb(%s,0.8)"%(dbwang.format(l="100"),),
115
"testdb(%s,0.8)"%(dbwang.format(l="100"),),
116
("default", "lambda x:x[0]")
120
"traindb(%s,0.8)"%(dbwang.format(l="200"),),
121
"testdb(%s,0.8)"%(dbwang.format(l="200"),),
122
("default", "lambda x:x[0]")
126
"traindb(%s,0.8)"%(dbwang.format(l="400"),),
127
"testdb(%s,0.8)"%(dbwang.format(l="400"),),
128
("default", "lambda x:x[0]")
131
#(traindb(),testdb())
136
'CM':'image.descriptors.CM()',
137
'HOG':'image.descriptors.HOG()',
138
'GIST':'image.descriptors.GIST()',
139
'LBP':'image.descriptors.LBP()',
142
class MyExperiment(Experiment):
144
This will generate the experiment code for us..
150
## Which regression model is to be used
151
idxclass=[ 'SashIndex', 'CvLSH', 'RCDist' ]#, 'PoullotIndex' ] #, 'krr', 'multinomial' ] ##
152
database=DATABASES_AND_LABELS.keys()
153
feature=FEATURES.keys()
159
### nice names for the parameters
161
idxclass=P0['idxclass']
162
feature=P0['feature']
163
database=P0['database']
164
pickle.dump(P0,file(os.path.join(self.directory,"xparam-%d.pcl")%(self.c,),"w"))
169
## Compute features on the database.... Some of these databases may be really large, and we currently focus
170
## on a rather small amount, so we compute feature files for declared databases and not for pre-existing ones.
171
## this choice of course depends on the situation...
173
## on this occasion we also compute the ground-truth if it has not yet been computed ...
174
from pycvf.core import builders
175
db1=builders.database_builder(DATABASES_AND_LABELS[database][1])
176
db2=builders.database_builder(DATABASES_AND_LABELS[database][0])
177
ldb1=getattr(db1,"labeling_"+DATABASES_AND_LABELS[database][2][0])()
178
ldb2=getattr(db1,"labeling_"+DATABASES_AND_LABELS[database][2][0])()
179
lot=DATABASES_AND_LABELS[database][2][1]
180
lo=eval(DATABASES_AND_LABELS[database][2][1])
182
## ############################################################################################################0
183
## enumerate the classes....
184
## ############################################################################################################0
188
C=pickle.load(file("%s/categs-%s.pcl"%(self.directory,database,),"rb"))
189
pickle.load(file("%s/addresses-%s.pcl"%(self.directory,database,),"rb"))
193
for t1 in db1.keys():
195
if (lo(ldb1[t1]) not in C):
197
for t2 in db2.keys():
199
if (lo(ldb2[t2]) not in C):
201
pickle.dump(C,file("%s/categs-%s.pcl"%(self.directory,database,),"wb"))
202
pickle.dump(A,file("%s/addresses-%s.pcl"%(self.directory,database,),"wb"))
205
## ############################################################################################################0
206
## compute groundtruth....
207
## ############################################################################################################0
209
gt=pickle.load(file("%s/ground-truth-%s.pcl"%(self.directory,database,),"rb"))
212
for t1 in db1.keys():
215
for t2 in db2.keys():
216
if (lo(ldb1[t1])==lo(ldb2[t2])):
220
pickle.dump(gt,file("%s/ground-truth-%s.pcl"%(self.directory,database,),"wb"))
223
gt=pickle.load(file("%s/tt-ground-truth-%s.pcl"%(self.directory,database,),"rb"))
226
for t1 in db1.keys():
229
for t2 in db1.keys():
230
if (lo(ldb1[t1])==lo(ldb1[t2])):
234
pickle.dump(gt,file("%s/tt-ground-truth-%s.pcl"%(self.directory,database,),"wb"))
238
gt=pickle.load(file("%s/aa-ground-truth-%s.pcl"%(self.directory,database,),"rb"))
241
for t1 in db1.keys():
243
for t2 in db1.keys():
244
if (lo(ldb1[t1])==lo(ldb1[t2])):
246
for t2 in db2.keys():
247
if (lo(ldb1[t1])==lo(ldb1[t2])):
250
pickle.dump(gt,file("%s/aa-ground-truth-%s.pcl"%(self.directory,database,),"wb"))
253
## ############################################################################################################0
254
## compute features if necessary....
255
## ############################################################################################################0
257
os.stat("%s/%s-%s-train-0000.mfa"%(self.directory,database,feature))
259
print ' '.join(("pycvf_compute_features",
260
"pycvf_compute_features",
261
"--db", DATABASES_AND_LABELS[database][0],
262
"-m", FEATURES[feature],
263
"-t", "%s/%s-%s-train"%(self.directory,database,feature),
265
r=os.spawnlp(os.P_WAIT,"pycvf_compute_features",
266
"pycvf_compute_features",
267
"--db", DATABASES_AND_LABELS[database][0],
268
"-m", FEATURES[feature],
269
"-t", "%s/%s-%s-train"%(self.directory,database,feature),
274
os.stat("%s/%s-%s-test-0000.mfa"%(self.directory,database,feature))
276
print ' '.join(["pycvf_compute_features",
277
"pycvf_compute_features",
278
"--db", DATABASES_AND_LABELS[database][1],
279
"-m", FEATURES[feature],
280
"-t", "%s/%s-%s-test"%(self.directory,database,feature),
282
r=os.spawnlp(os.P_WAIT,"pycvf_compute_features",
283
"pycvf_compute_features",
284
"--db", DATABASES_AND_LABELS[database][1],
285
"-m", FEATURES[feature],
286
"-t", "%s/%s-%s-test"%(self.directory,database,feature),
292
## ############################################################################################################0
293
## build an index on train features....
294
## ############################################################################################################0
298
os.stat("%s/"%(self.directory,database,feature))
300
print "Building Index"
301
r=os.spawnlp(os.P_WAIT,"pycvf_build_index",
303
"--db", "from_trackfile('%s/%s-%s-train')"%(self.directory,database,feature),
304
"--idxpath", "%s/%s-%s-train-idx")
309
## ############################################################################################################0
310
## train a classifier to decide whether a pair of features actually corresponds to the same clip or not....
311
## ############################################################################################################0
312
#positive_db=pickle.load("tt-ground-truth-%s.pcl")
314
## positive_couple_db : getdbitem.getdbitem()
315
## aggregated_database ( [ positive_couples_db , negative_couples_db ] )
318
#random_couple_db = "randomized(productdb(from_list(map(lambda x:(x,x),pickle.load(file('%s/categs-%s.pcl','rb')))),from_list(map(lambda x:(x,x),pickle.load(file('%s/categs-%s.pcl','rb'))))))"%(self.directory,database,self.directory,database,)
319
random_couple_db = "randomized(productdb(from_list(map(lambda x:(x,x),pickle.load(file('%s/categs-%s.pcl','rb')))),from_list(map(lambda x:(x,x),pickle.load(file('%s/categs-%s.pcl','rb'))))))"%(self.directory,database,self.directory,database,)
321
positive_couple_db_train= "transformed(exploded(from_list(pickle.load(file('%s/tt-ground-truth-%s.pcl','rb'))),LS('list').DefaultStructure()),LN('free','(((%s)(thisnode.get_curaddr())),x)'))"%(self.directory,database,lot)
322
positive_couple_db_test = "transformed(exploded(from_list(pickle.load(file('%s/ground-truth-%s.pcl','rb'))),LS('list').DefaultStructure()),LN('free', '(((%s)(thisnode.get_curaddr())),x)'))"%(self.directory,database,lot)
324
negative_couple_db_train= "transformed(exploded(transformed(from_list(pickle.load(file('%s/aa-ground-truth-%s.pcl','rb'))), free('list(pickle.load(file(\"%s/addresses-%s.pcl\",\"rb\"))-set(x))')), LS('list').DefaultStructure()),LN('free','(((%s)(thisnode.get_curaddr())),x)'))"%(self.directory,database,self.directory,database,lot)
325
negative_couple_db_test = "transformed(exploded(transformed(from_list(pickle.load(file('%s/aa-ground-truth-%s.pcl','rb'))), free('list(pickle.load(file(\"%s/addresses-%s.pcl\",\"rb\"))-set(x))') ), LS('list').DefaultStructure()),LN('free','(((%s)(thisnode.get_curaddr())),x)'))"%(self.directory,database,self.directory,database,lot)
327
#print random_couple_db
328
#print positive_couple_db_train
329
#print negative_couple_db_train
331
#reduce(lambda b,y: b+map(lambda x:(y[0],x) ,y[1]), pickle.load(file("ground-truth-ALOI12.pcl")),[])
334
#pycvf_dbshow --db "transformed(exploded(transformed(from_list(pickle.load(file('near_duplicate_detection-std/tt-ground-truth-ALOI12.pcl','rb'))),free('list(pickle.load(file(\"near_duplicate_detection-std/addresses-ALOI12.pcl\",\"rb\"))-set(x))')),LS('list').DefaultStructure()),LN('free','(((lambda x:x[0])(thisnode.get_curaddr()))[1],x[0])'))"
336
#negative_couple_db= "excludedb(positive_couple_db,random_couple_db)"
337
#print negative_couple_db_train
340
dbexpr_train="randomized(labeled_databases_from_list(dict([(1, (limit.DB({positive_couple_db},1000))),(-1,(limit.DB({negative_couple_db},1000)))])))".format(
341
positive_couple_db=positive_couple_db_train,
342
negative_couple_db=negative_couple_db_train
346
dbexpr_test="randomized(labeled_databases_from_list(dict([(1, (limit.DB({positive_couple_db},1000))),(-1,(limit.DB({negative_couple_db},1000)))])))".format(
347
positive_couple_db=positive_couple_db_test,
348
negative_couple_db=negative_couple_db_test
351
#print positive_couple_db_train
352
#print negative_couple_db_train
355
dbtrain=DATABASES_AND_LABELS[database][0]
356
dbtest=DATABASES_AND_LABELS[database][1]
358
#os.spawnlp(os.P_WAIT,"pycvf_dbshow",
361
# "--db", "%s"%(dbexpr_train,)
364
#print "transformed(%s,naive())"%(dbexpr_train,) # dbtrain,naive()
365
#print "randomized(transformed({db},free('(pycvf.core.builders.database_builder(\"%s\")[x[0]], pycvf.core.builders.database_builder(\"%s\")[x[1]])'%(\"{db1}\",\"{db2}\"))))-free('x[0]')".format(db=dbexpr_train,db1=dbtrain,db2=dbtrain)
367
#r=os.spawnlp(os.P_WAIT,"pycvf_dbshow",
370
#"--db", "randomized(transformed({db},free('(pycvf.core.builders.database_builder(\"%s\")[x[0]], pycvf.core.builders.database_builder(\"%s\")[x[1]])'%(\"{db1}\",\"{db2}\"))-LN('free','x',datatype=DTP('image'))))".format(db=dbexpr_train,db1=dbtrain,db2=dbtrain) #dbtrain,)
375
print "randomized(transformed({db},free('(pycvf.core.builders.database_builder(\"%s\")[x[0]], pycvf.core.builders.database_builder(\"%s\")[x[1]])'%(\"{db1}\",\"{db2}\"))-LN('free','x',datatype=DTP('image'))))".format(db=dbexpr_train,db1="from_trackfile(%s/%s-%s-train)"%(self.directory,database,feature),db2="from_trackfile('%s/%s-%s-train')"%(self.directory,database,feature))
377
r=os.spawnlp(os.P_WAIT,"pycvf_dbshow",
380
"--db", "randomized(transformed({db},free('(pycvf.core.builders.database_builder(\"merged(from_trackfile(\\\\\"%s\\\\\"),from_trackfile(\\\\\"%s\\\\\"))\")[x[0]], pycvf.core.builders.database_builder(\"merged(from_trackfile(\\\\\"%s\\\\\"),from_trackfile(\\\\\"%s\\\\\"))\")[x[1]])'%(\"{db1a}\",\"{db1b}\",\"{db2a}\",\"{db2b}\"))-LN('free','x',datatype=DTP('image'))))".format(db=dbexpr_train,db1a="%s/%s-%s-train"%(self.directory,database,feature),db1b="%s/%s-%s-test"%(self.directory,database,feature),db2a="%s/%s-%s-train"%(self.directory,database,feature),db2b="%s/%s-%s-test"%(self.directory,database,feature)) #dbtrain,)
384
dbtest_cpl="randomized(transformed({db},free('(pycvf.core.builders.database_builder(\"merged(from_trackfile(\\\\\"%s\\\\\"),from_trackfile(\\\\\"%s\\\\\"))\")[x[0]], pycvf.core.builders.database_builder(\"merged(from_trackfile(\\\\\"%s\\\\\"),from_trackfile(\\\\\"%s\\\\\"))\")[x[1]])'%(\"{db1a}\",\"{db1b}\",\"{db2a}\",\"{db2b}\"))-LN('free','x',datatype=DTP('image'))))".format(db=dbexpr_test,db1a="%s/%s-%s-train"%(self.directory,database,feature),db1b="%s/%s-%s-test"%(self.directory,database,feature),db2a="%s/%s-%s-train"%(self.directory,database,feature),db2b="%s/%s-%s-test"%(self.directory,database,feature))
387
#r=os.spawnlp(os.P_WAIT,"pycvf_model_run",
389
# "--db", dbexpr_train,
390
# "-m", "vectorset.train_classification_and_output_model(ML('CLS.weka_bridge','adaboost1'),'xxx')",
400
## ############################################################################################################0
401
## we actually do not need to create pairs for ....
402
## ############################################################################################################0
406
## ############################################################################################################0
407
## we are ready to evaluate the performance of our setting, based on training
408
## ############################################################################################################0
410
# retrieve features in the track file according to the element indexes...
411
#dbtotal=("transformed(model=LN('getdbitem',from_trackfile('%s')),db=transformed(model=LN('address'),db=caltech256({nelemperclass},{numclasses})))"%(feature[0],))
412
#dbtotal=dbtotal.format(**P0)
413
#seed=random.random()
414
#dbtrain="traindb(%s,0.5,%f,)"%(dbtotal,seed,)
415
#dbtest="testdb(%s,0.5,%f)"%(dbtotal,seed,)
416
return {'dbtest_cpl':dbtest_cpl,
419
def eval(self,**kwargs):
420
print "~~~~~~~~~TESTING~~~~~~~~~"
424
"--db", kwargs['dbtest_cpl'],
425
"-m", "vectorset.train_classification_and_output_model(ML('CLS.weka_bridge','adaboost1'))-free('x.save(\"titi\")')",
429
r=os.spawnlp(os.P_WAIT,"pycvf_model_run",
431
"--db", kwargs['dbtest_cpl'],
432
"-m", "vectorset.train_classification_and_output_model(ML('CLS.weka_bridge','adaboost1'))-free('x.save(\"titi\")')",
436
print "/~~~~~~~~~TESTING~~~~~~~~~"
442
def displayresults(self,):