~ubuntu-branches/ubuntu/trusty/r-cran-fastcluster/trusty-proposed

« back to all changes in this revision

Viewing changes to src/fastcluster_R.cpp

Committer: Package Import Robot
Author(s): Andreas Tille
Date: 2013-08-21 16:50:16 UTC
Revision ID: package-import@ubuntu.com-20130821165016-ysroljfdq290hv54

Tags: upstream-1.1.11

Import upstream version 1.1.11

files added:

DESCRIPTION

INSTALL

LICENSE

NAMESPACE

NEWS

R/fastcluster.R

README

inst

inst/CITATION

inst/doc

inst/doc/fastcluster.Rtex

inst/doc/fastcluster.pdf

man/fastcluster.Rd

man/hclust.Rd

man/hclust.vector.Rd

python

python/fastcluster.py

python/setup.py

python/test

python/test/nantest.py

python/test/test.py

python/test/vectortest.py

src/Makevars

src/Makevars.win

src/fastcluster.cpp

src/fastcluster_R.cpp

src/fastcluster_python.cpp

tests

tests/test_fastcluster.R

Show diffs side-by-side

added added

removed removed

src/fastcluster_R.cpp

fastcluster: Fast hierarchical clustering routines for R and Python

<http://math.stanford.edu/~muellner>

#if __GNUC__ > 4 || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 6))

#define HAVE_DIAGNOSTIC 1

#endif

#if HAVE_DIAGNOSTIC

#pragma GCC diagnostic push

#pragma GCC diagnostic ignored "-Wredundant-decls"

#pragma GCC diagnostic ignored "-Wpadded"

#endif

#include <Rdefines.h>

#include <R_ext/Rdynload.h>

#include <Rmath.h> // for R_pow

#if HAVE_DIAGNOSTIC

#pragma GCC diagnostic pop

#endif

#define fc_isnan(X) ((X)!=(X))

// There is ISNAN but it is so much slower on my x86_64 system with GCC!

#include <cstddef> // for std::ptrdiff_t

#include <limits> // for std::numeric_limits<...>::infinity()

#include <algorithm> // for std::stable_sort

#include <stdexcept> // for std::runtime_error

#include <string> // for std::string

#include <new> // for std::bad_alloc

#include <exception> // for std::exception

#include "fastcluster.cpp"

/* Since the public interface is given by the Python respectively R interface,

* we do not want other symbols than the interface initalization routines to be

* visible in the shared object file. The "visibility" switch is a GCC concept.

* Hiding symbols keeps the relocation table small and decreases startup time.

* See http://gcc.gnu.org/wiki/Visibility

#if HAVE_VISIBILITY

#pragma GCC visibility push(hidden)

#endif

Helper function: order the nodes so that they can be displayed nicely

in a dendrogram.

This is used for the 'order' field in the R output.

#if HAVE_DIAGNOSTIC

#pragma GCC diagnostic push

#pragma GCC diagnostic ignored "-Wpadded"

#endif

struct pos_node {

t_index pos;

int node;

};

#if HAVE_DIAGNOSTIC

#pragma GCC diagnostic pop

#endif

void order_nodes(const int N, const int * const merge, const t_index * const node_size, int * const order) {

/* Parameters:

N : number of data points

merge : (N-1)×2 array which specifies the node indices which are

merged in each step of the clustering procedure.

Negative entries -1...-N point to singleton nodes, while

positive entries 1...(N-1) point to nodes which are themselves

parents of other nodes.

node_size : array of node sizes - makes it easier

order : output array of size N

Runtime: Θ(N)

auto_array_ptr<pos_node> queue(N/2);

int parent;

int child;

t_index pos = 0;

queue[0].pos = 0;

queue[0].node = N-2;

t_index idx = 1;

do {

--idx;

pos = queue[idx].pos;

parent = queue[idx].node;

// First child

child = merge[parent];

if (child<0) { // singleton node, write this into the 'order' array.

order[pos] = -child;

++pos;

}

else { /* compound node: put it on top of the queue and decompose it

100

in a later iteration. */

101

queue[idx].pos = pos;

102

queue[idx].node = child-1; // convert index-1 based to index-0 based

103

++idx;

104

pos += node_size[child-1];

105

}

106

// Second child

107

child = merge[parent+N-1];

108

if (child<0) {

109

order[pos] = -child;

110

}

111

else {

112

queue[idx].pos = pos;

113

queue[idx].node = child-1;

114

++idx;

115

}

116

} while (idx>0);

117

}

118

119

#define size_(r_) ( ((r_<N) ? 1 : node_size[r_-N]) )

120

121

template <const bool sorted>

122

void generate_R_dendrogram(int * const merge, double * const height, int * const order, cluster_result & Z2, const int N) {

123

// The array "nodes" is a union-find data structure for the cluster

124

// identites (only needed for unsorted cluster_result input).

125

union_find nodes(sorted ? 0 : N);

126

if (!sorted) {

127

std::stable_sort(Z2[0], Z2[N-1]);

128

}

129

130

t_index node1, node2;

131

auto_array_ptr<t_index> node_size(N-1);

132

133

for (t_index i=0; i<N-1; ++i) {

134

// Get two data points whose clusters are merged in step i.

135

// Find the cluster identifiers for these points.

136

if (sorted) {

137

node1 = Z2[i]->node1;

138

node2 = Z2[i]->node2;

139

}

140

else {

141

node1 = nodes.Find(Z2[i]->node1);

142

node2 = nodes.Find(Z2[i]->node2);

143

// Merge the nodes in the union-find data structure by making them

144

// children of a new node.

145

nodes.Union(node1, node2);

146

}

147

// Sort the nodes in the output array.

148

if (node1>node2) {

149

t_index tmp = node1;

150

node1 = node2;

151

node2 = tmp;

152

}

153

/* Conversion between labeling conventions.

154

Input: singleton nodes 0,...,N-1

155

compound nodes N,...,2N-2

156

Output: singleton nodes -1,...,-N

157

compound nodes 1,...,N

158

159

merge[i] = (node1<N) ? -static_cast<int>(node1)-1

160

: static_cast<int>(node1)-N+1;

161

merge[i+N-1] = (node2<N) ? -static_cast<int>(node2)-1

162

: static_cast<int>(node2)-N+1;

163

height[i] = Z2[i]->dist;

164

node_size[i] = size_(node1) + size_(node2);

165

}

166

167

order_nodes(N, merge, node_size, order);

168

}

169

170

171

R interface code

172

173

174

enum {

175

METRIC_R_EUCLIDEAN = 0,

176

METRIC_R_MAXIMUM = 1,

177

METRIC_R_MANHATTAN = 2,

178

METRIC_R_CANBERRA = 3,

179

METRIC_R_BINARY = 4,

180

METRIC_R_MINKOWSKI = 5

181

};

182

183

#if HAVE_DIAGNOSTIC

184

#pragma GCC diagnostic push

185

#pragma GCC diagnostic ignored "-Wpadded"

186

#endif

187

class R_dissimilarity {

188

private:

189

t_float * Xa;

190

std::ptrdiff_t dim; // std::ptrdiff_t saves many statis_cast<> in products

191

t_float * members;

192

void (cluster_result::*postprocessfn) (const t_float) const;

193

t_float postprocessarg;

194

195

t_float (R_dissimilarity::*distfn) (const t_index, const t_index) const;

196

auto_array_ptr<t_index> row_repr;

197

int N;

198

199

// no default constructor

200

R_dissimilarity();

201

// noncopyable

202

R_dissimilarity(R_dissimilarity const &);

203

R_dissimilarity & operator=(R_dissimilarity const &);

204

205

public:

206

// Ignore warning about uninitialized member variables. I know what I am

207

// doing here, and some member variables are only used for certain metrics.

208

#if HAVE_DIAGNOSTIC

209

#pragma GCC diagnostic push

210

#pragma GCC diagnostic ignored "-Weffc++"

211

#endif

212

R_dissimilarity (t_float * const X_,

213

const int N_,

214

const int dim_,

215

t_float * const members_,

216

const unsigned char method,

217

const unsigned char metric,

218

const t_float p,

219

bool make_row_repr)

220

: Xa(X_),

221

dim(dim_),

222

members(members_),

223

postprocessfn(NULL),

224

postprocessarg(p),

225

N(N_)

226

{

227

switch (method) {

228

case METHOD_VECTOR_SINGLE:

229

switch (metric) {

230

case METRIC_R_EUCLIDEAN:

231

distfn = &R_dissimilarity::sqeuclidean<false>;

232

postprocessfn = &cluster_result::sqrt;

233

break;

234

case METRIC_R_MAXIMUM:

235

distfn = &R_dissimilarity::maximum;

236

break;

237

case METRIC_R_MANHATTAN:

238

distfn = &R_dissimilarity::manhattan;

239

break;

240

case METRIC_R_CANBERRA:

241

distfn = &R_dissimilarity::canberra;

242

break;

243

case METRIC_R_BINARY:

244

distfn = &R_dissimilarity::dist_binary;

245

break;

246

case METRIC_R_MINKOWSKI:

247

distfn = &R_dissimilarity::minkowski;

248

postprocessfn = &cluster_result::power;

249

break;

250

default:

251

throw std::runtime_error(std::string("Invalid method."));

252

}

253

break;

254

255

case METHOD_VECTOR_WARD:

256

postprocessfn = &cluster_result::sqrtdouble;

257

break;

258

259

default:

260

postprocessfn = &cluster_result::sqrt;

261

}

262

263

if (make_row_repr) {

264

row_repr.init(2*N-1);

265

for (t_index i=0; i<N; ++i) {

266

row_repr[i] = i;

267

}

268

}

269

270

}

271

#if HAVE_DIAGNOSTIC

272

#pragma GCC diagnostic pop

273

#endif

274

275

inline t_float operator () (const t_index i, const t_index j) const {

276

return (this->*distfn)(i,j);

277

}

278

279

inline t_float X (const t_index i, const t_index j) const {

280

// "C-style" array alignment

281

return Xa[i*dim+j];

282

}

283

284

inline t_float * Xptr(const t_index i, const t_index j) const {

285

// "C-style" array alignment

286

return Xa+i*dim+j;

287

}

288

289

void merge(const t_index i, const t_index j, const t_index newnode) const {

290

merge_inplace(row_repr[i], row_repr[j]);

291

row_repr[newnode] = row_repr[j];

292

}

293

294

void merge_inplace(const t_index i, const t_index j) const {

295

for(t_index k=0; k<dim; ++k) {

296

*(Xptr(j,k)) = (X(i,k)*members[i] + X(j,k)*members[j]) /

297

(members[i]+members[j]);

298

}

299

members[j] += members[i];

300

}

301

302

void merge_weighted(const t_index i, const t_index j, const t_index newnode) const {

303

merge_inplace_weighted(row_repr[i], row_repr[j]);

304

row_repr[newnode] = row_repr[j];

305

}

306

307

void merge_inplace_weighted(const t_index i, const t_index j) const {

308

t_float * const Pi = Xa+i*dim;

309

t_float * const Pj = Xa+j*dim;

310

for(t_index k=0; k<dim; ++k) {

311

Pj[k] = (Pi[k]+Pj[k])*.5;

312

}

313

}

314

315

void postprocess(cluster_result & Z2) const {

316

if (postprocessfn!=NULL) {

317

(Z2.*postprocessfn)(postprocessarg);

318

}

319

}

320

321

double ward(t_index const i1, t_index const i2) const {

322

return sqeuclidean<true>(i1,i2)*members[i1]*members[i2]/ \

323

(members[i1]+members[i2]);

324

}

325

326

inline double ward_initial(t_index const i1, t_index const i2) const {

327

/* In the R interface, ward_initial is the same as ward. Only the Python

328

interface has two different functions here. */

329

return ward(i1,i2);

330

}

331

332

// This method must not produce NaN if the input is non-NaN.

333

inline static t_float ward_initial_conversion(const t_float min) {

334

// identity

335

return min;

336

}

337

338

double ward_extended(t_index i1, t_index i2) const {

339

return ward(row_repr[i1], row_repr[i2]);

340

}

341

342

343

The following definitions and methods have been taken directly from

344

the R source file

345

346

/src/library/stats/src/distance.c

347

348

in the R release 2.13.0. The code has only been adapted very slightly.

349

(Unfortunately, the methods cannot be called directly in the R libraries

350

since the functions are declared "static" in the above file.)

351

352

Note to maintainers: If the code in distance.c changes in future R releases

353

compared to 2.13.0, please update the definitions here, if necessary.

354

355

356

// translation of variable names

357

#define nc dim

358

#define nr N

359

#define x Xa

360

#define p postprocessarg

361

362

// The code from distance.c starts here

363

364

#define both_FINITE(a,b) (R_FINITE(a) && R_FINITE(b))

365

#ifdef R_160_and_older

366

#define both_non_NA both_FINITE

367

#else

368

#define both_non_NA(a,b) (!ISNAN(a) && !ISNAN(b))

369

#endif

370

371

/* We need two variants of the Euclidean metric: one that does not check

372

for a NaN result, which is used for the initial distances, and one which

373

does, for the updated distances during the clustering procedure.

374

375

// still public

376

template <const bool check_NaN>

377

double sqeuclidean(t_index const i1, t_index const i2) const {

378

double dev, dist;

379

int count, j;

380

381

count = 0;

382

dist = 0;

383

double * p1 = x+i1*nc;

384

double * p2 = x+i2*nc;

385

for(j = 0 ; j < nc ; ++j) {

386

if(both_non_NA(*p1, *p2)) {

387

dev = (*p1 - *p2);

388

if(!ISNAN(dev)) {

389

dist += dev * dev;

390

++count;

391

}

392

}

393

++p1;

394

++p2;

395

}

396

if(count == 0) return NA_REAL;

397

if(count != nc) dist /= (static_cast<double>(count)/static_cast<double>(nc));

398

//return sqrt(dist);

399

// we take the square root later

400

if (check_NaN) {

401

#if HAVE_DIAGNOSTIC

402

#pragma GCC diagnostic push

403

#pragma GCC diagnostic ignored "-Wfloat-equal"

404

#endif

405

if (fc_isnan(dist))

406

throw(nan_error());

407

#if HAVE_DIAGNOSTIC

408

#pragma GCC diagnostic pop

409

#endif

410

}

411

return dist;

412

}

413

414

inline double sqeuclidean_extended(t_index const i1, t_index const i2) const {

415

return sqeuclidean<true>(row_repr[i1], row_repr[i2]);

416

}

417

418

private:

419

double maximum(t_index i1, t_index i2) const {

420

double dev, dist;

421

int count, j;

422

423

count = 0;

424

dist = -DBL_MAX;

425

double * p1 = x+i1*nc;

426

double * p2 = x+i2*nc;

427

for(j = 0 ; j < nc ; ++j) {

428

if(both_non_NA(*p1, *p2)) {

429

dev = fabs(*p1 - *p2);

430

if(!ISNAN(dev)) {

431

if(dev > dist)

432

dist = dev;

433

++count;

434

}

435

}

436

++p1;

437

++p2;

438

}

439

if(count == 0) return NA_REAL;

440

return dist;

441

}

442

443

double manhattan(t_index i1, t_index i2) const {

444

double dev, dist;

445

int count, j;

446

447

count = 0;

448

dist = 0;

449

double * p1 = x+i1*nc;

450

double * p2 = x+i2*nc;

451

for(j = 0 ; j < nc ; ++j) {

452

if(both_non_NA(*p1, *p2)) {

453

dev = fabs(*p1 - *p2);

454

if(!ISNAN(dev)) {

455

dist += dev;

456

++count;

457

}

458

}

459

++p1;

460

++p2;

461

}

462

if(count == 0) return NA_REAL;

463

if(count != nc) dist /= (static_cast<double>(count)/static_cast<double>(nc));

464

return dist;

465

}

466

467

double canberra(t_index i1, t_index i2) const {

468

double dev, dist, sum, diff;

469

int count, j;

470

471

count = 0;

472

dist = 0;

473

double * p1 = x+i1*nc;

474

double * p2 = x+i2*nc;

475

for(j = 0 ; j < nc ; ++j) {

476

if(both_non_NA(*p1, *p2)) {

477

sum = fabs(*p1 + *p2);

478

diff = fabs(*p1 - *p2);

479

if (sum > DBL_MIN || diff > DBL_MIN) {

480

dev = diff/sum;

481

#if HAVE_DIAGNOSTIC

482

#pragma GCC diagnostic push

483

#pragma GCC diagnostic ignored "-Wfloat-equal"

484

#endif

485

if(!ISNAN(dev) ||

486

(!R_FINITE(diff) && diff == sum &&

487

/* use Inf = lim x -> oo */ (dev = 1.))) {

488

dist += dev;

489

++count;

490

}

491

#if HAVE_DIAGNOSTIC

492

#pragma GCC diagnostic pop

493

#endif

494

}

495

}

496

++p1;

497

++p2;

498

}

499

if(count == 0) return NA_REAL;

500

if(count != nc) dist /= (static_cast<double>(count)/static_cast<double>(nc));

501

return dist;

502

}

503

504

double dist_binary(t_index i1, t_index i2) const {

505

int total, count, dist;

506

int j;

507

508

total = 0;

509

count = 0;

510

dist = 0;

511

double * p1 = x+i1*nc;

512

double * p2 = x+i2*nc;

513

for(j = 0 ; j < nc ; ++j) {

514

if(both_non_NA(*p1, *p2)) {

515

if(!both_FINITE(*p1, *p2)) {

516

// warning(_("treating non-finite values as NA"));

517

}

518

else {

519

#if HAVE_DIAGNOSTIC

520

#pragma GCC diagnostic push

521

#pragma GCC diagnostic ignored "-Wfloat-equal"

522

#endif

523

if(*p1 || *p2) {

524

++count;

525

if( ! (*p1 && *p2) ) {

526

++dist;

527

}

528

}

529

#if HAVE_DIAGNOSTIC

530

#pragma GCC diagnostic pop

531

#endif

532

++total;

533

}

534

}

535

++p1;

536

++p2;

537

}

538

539

if(total == 0) return NA_REAL;

540

if(count == 0) return 0;

541

return static_cast<double>(dist) / static_cast<double>(count);

542

}

543

544

double minkowski(t_index i1, t_index i2) const {

545

double dev, dist;

546

int count, j;

547

548

count= 0;

549

dist = 0;

550

double * p1 = x+i1*nc;

551

double * p2 = x+i2*nc;

552

for(j = 0 ; j < nc ; ++j) {

553

if(both_non_NA(*p1, *p2)) {

554

dev = (*p1 - *p2);

555

if(!ISNAN(dev)) {

556

dist += R_pow(fabs(dev), p);

557

++count;

558

}

559

}

560

++p1;

561

++p2;

562

}

563

if(count == 0) return NA_REAL;

564

if(count != nc) dist /= (static_cast<double>(count)/static_cast<double>(nc));

565

//return R_pow(dist, 1.0/p);

566

// raise to the (1/p)-th power later

567

return dist;

568

}

569

570

};

571

#if HAVE_DIAGNOSTIC

572

#pragma GCC diagnostic pop

573

#endif

574

575

extern "C" {

576

SEXP fastcluster(SEXP const N_, SEXP const method_, SEXP D_, SEXP members_) {

577

SEXP r = NULL; // return value

578

579

try{

580

581

Input checks

582

583

// Parameter N: number of data points

584

PROTECT(N_);

585

if (!IS_INTEGER(N_) || LENGTH(N_)!=1)

586

Rf_error("'N' must be a single integer.");

587

const int N = *INTEGER_POINTER(N_);

588

if (N<2)

589

Rf_error("N must be at least 2.");

590

const std::ptrdiff_t NN = static_cast<std::ptrdiff_t>(N)*(N-1)/2;

591

UNPROTECT(1); // N_

592

593

// Parameter method: dissimilarity index update method

594

PROTECT(method_);

595

if (!IS_INTEGER(method_) || LENGTH(method_)!=1)

596

Rf_error("'method' must be a single integer.");

597

const int method = *INTEGER_POINTER(method_) - 1; // index-0 based;

598

if (method<METHOD_METR_SINGLE || method>METHOD_METR_MEDIAN) {

599

Rf_error("Invalid method index.");

600

}

601

UNPROTECT(1); // method_

602

603

// Parameter members: number of members in each node

604

auto_array_ptr<t_float> members;

605

if (method==METHOD_METR_AVERAGE ||

606

method==METHOD_METR_WARD ||

607

method==METHOD_METR_CENTROID) {

608

members.init(N);

609

if (Rf_isNull(members_)) {

610

for (t_index i=0; i<N; ++i) members[i] = 1;

611

}

612

else {

613

PROTECT(members_ = AS_NUMERIC(members_));

614

if (LENGTH(members_)!=N)

615

Rf_error("'members' must have length N.");

616

const t_float * const m = NUMERIC_POINTER(members_);

617

for (t_index i=0; i<N; ++i) members[i] = m[i];

618

UNPROTECT(1); // members

619

}

620

}

621

622

// Parameter D_: dissimilarity matrix

623

PROTECT(D_ = AS_NUMERIC(D_));

624

if (LENGTH(D_)!=NN)

625

Rf_error("'D' must have length (N \\choose 2).");

626

const double * const D = NUMERIC_POINTER(D_);

627

// Make a working copy of the dissimilarity array

628

// for all methods except "single".

629

auto_array_ptr<double> D__;

630

if (method!=METHOD_METR_SINGLE) {

631

D__.init(NN);

632

for (std::ptrdiff_t i=0; i<NN; ++i)

633

D__[i] = D[i];

634

}

635

UNPROTECT(1); // D_

636

637

638

Clustering step

639

640

cluster_result Z2(N-1);

641

switch (method) {

642

case METHOD_METR_SINGLE:

643

MST_linkage_core(N, D, Z2);

644

break;

645

case METHOD_METR_COMPLETE:

646

NN_chain_core<METHOD_METR_COMPLETE, t_float>(N, D__, NULL, Z2);

647

break;

648

case METHOD_METR_AVERAGE:

649

NN_chain_core<METHOD_METR_AVERAGE, t_float>(N, D__, members, Z2);

650

break;

651

case METHOD_METR_WEIGHTED:

652

NN_chain_core<METHOD_METR_WEIGHTED, t_float>(N, D__, NULL, Z2);

653

break;

654

case METHOD_METR_WARD:

655

NN_chain_core<METHOD_METR_WARD, t_float>(N, D__, members, Z2);

656

break;

657

case METHOD_METR_CENTROID:

658

generic_linkage<METHOD_METR_CENTROID, t_float>(N, D__, members, Z2);

659

break;

660

case METHOD_METR_MEDIAN:

661

generic_linkage<METHOD_METR_MEDIAN, t_float>(N, D__, NULL, Z2);

662

break;

663

default:

664

throw std::runtime_error(std::string("Invalid method."));

665

}

666

667

D__.free(); // Free the memory now

668

members.free(); // (not strictly necessary).

669

670

SEXP m; // return field "merge"

671

PROTECT(m = NEW_INTEGER(2*(N-1)));

672

int * const merge = INTEGER_POINTER(m);

673

674

SEXP dim_m; // Specify that m is an (N-1)×2 matrix

675

PROTECT(dim_m = NEW_INTEGER(2));

676

INTEGER(dim_m)[0] = N-1;

677

INTEGER(dim_m)[1] = 2;

678

SET_DIM(m, dim_m);

679

680

SEXP h; // return field "height"

681

PROTECT(h = NEW_NUMERIC(N-1));

682

double * const height = NUMERIC_POINTER(h);

683

684

SEXP o; // return fiels "order'

685

PROTECT(o = NEW_INTEGER(N));

686

int * const order = INTEGER_POINTER(o);

687

688

if (method==METHOD_METR_CENTROID ||

689

method==METHOD_METR_MEDIAN)

690

generate_R_dendrogram<true>(merge, height, order, Z2, N);

691

else

692

generate_R_dendrogram<false>(merge, height, order, Z2, N);

693

694

SEXP n; // names

695

PROTECT(n = NEW_CHARACTER(3));

696

SET_STRING_ELT(n, 0, COPY_TO_USER_STRING("merge"));

697

SET_STRING_ELT(n, 1, COPY_TO_USER_STRING("height"));

698

SET_STRING_ELT(n, 2, COPY_TO_USER_STRING("order"));

699

700

PROTECT(r = NEW_LIST(3)); // field names in the output list

701

SET_ELEMENT(r, 0, m);

702

SET_ELEMENT(r, 1, h);

703

SET_ELEMENT(r, 2, o);

704

SET_NAMES(r, n);

705

706

UNPROTECT(6); // m, dim_m, h, o, r, n

707

} // try

708

catch (const std::bad_alloc&) {

709

Rf_error( "Memory overflow.");

710

}

711

catch(const std::exception& e){

712

Rf_error( e.what() );

713

}

714

catch(const nan_error&){

715

Rf_error("NaN dissimilarity value.");

716

}

717

#ifdef FE_INVALID

718

catch(const fenv_error&){

719

Rf_error( "NaN dissimilarity value in intermediate results.");

720

}

721

#endif

722

catch(...){

723

Rf_error( "C++ exception (unknown reason)." );

724

}

725

726

return r;

727

}

728

729

SEXP fastcluster_vector(SEXP const method_,

730

SEXP const metric_,

731

SEXP X_,

732

SEXP members_,

733

SEXP p_) {

734

SEXP r = NULL; // return value

735

try{

736

737

738

Input checks

739

740

741

// Parameter method: dissimilarity index update method

742

PROTECT(method_);

743

if (!IS_INTEGER(method_) || LENGTH(method_)!=1)

744

Rf_error("'method' must be a single integer.");

745

int method = *INTEGER_POINTER(method_) - 1; // index-0 based;

746

if (method<METHOD_VECTOR_SINGLE || method>METHOD_VECTOR_MEDIAN) {

747

Rf_error("Invalid method index.");

748

}

749

UNPROTECT(1); // method_

750

751

// Parameter metric

752

PROTECT(metric_);

753

if (!IS_INTEGER(metric_) || LENGTH(metric_)!=1)

754

Rf_error("'metric' must be a single integer.");

755

int metric = *INTEGER_POINTER(metric_) - 1; // index-0 based;

756

if (metric<0 || metric>5 ||

757

(method!=METHOD_VECTOR_SINGLE && metric!=0) ) {

758

Rf_error("Invalid metric index.");

759

}

760

UNPROTECT(1); // metric_

761

762

// data array

763

PROTECT(X_ = AS_NUMERIC(X_));

764

SEXP dims_ = PROTECT( Rf_getAttrib( X_, R_DimSymbol ) ) ;

765

if( dims_ == R_NilValue || LENGTH(dims_) != 2 ) {

766

Rf_error( "Argument is not a matrix.");

767

}

768

const int * const dims = INTEGER(dims_);

769

const int N = dims[0];

770

const int dim = dims[1];

771

if (N<2)

772

Rf_error("There must be at least two data points.");

773

// Make a working copy of the dissimilarity array

774

// for all methods except "single".

775

double * X__ = NUMERIC_POINTER(X_);

776

// Copy the input array and change it from Fortran-contiguous style

777

// to C-contiguous style

778

// (Waste of memory for 'single'; the other methods need a copy

779

auto_array_ptr<double> X(LENGTH(X_));

780

for (std::ptrdiff_t i=0; i<N; ++i)

781

for (std::ptrdiff_t j=0; j<dim; ++j)

782

X[i*dim+j] = X__[i+j*N];

783

784

UNPROTECT(2); // dims_, X_

785

786

// Parameter members: number of members in each node

787

auto_array_ptr<t_float> members;

788

if (method==METHOD_VECTOR_WARD ||

789

method==METHOD_VECTOR_CENTROID) {

790

members.init(N);

791

if (Rf_isNull(members_)) {

792

for (t_index i=0; i<N; ++i) members[i] = 1;

793

}

794

else {

795

PROTECT(members_ = AS_NUMERIC(members_));

796

if (LENGTH(members_)!=N)

797

Rf_error("The length of 'members' must be the same as the number of data points.");

798

const t_float * const m = NUMERIC_POINTER(members_);

799

for (t_index i=0; i<N; ++i) members[i] = m[i];

800

UNPROTECT(1); // members

801

}

802

}

803

804

// Parameter p

805

PROTECT(p_);

806

double p = 0;

807

if (metric==METRIC_R_MINKOWSKI) {

808

if (!IS_NUMERIC(p_) || LENGTH(p_)!=1)

809

Rf_error("'p' must be a single floating point number.");

810

p = *NUMERIC_POINTER(p_);

811

}

812

else {

813

if (p_ != R_NilValue) {

814

Rf_error("No metric except 'minkowski' allows a 'p' parameter.");

815

}

816

}

817

UNPROTECT(1); // p_

818

819

/* The generic_linkage_vector_alternative algorithm uses labels

820

N,N+1,... for the new nodes, so we need a table which node is

821

stored in which row.

822

823

Instructions: Set this variable to true for all methods which

824

use the generic_linkage_vector_alternative algorithm below.

825

826

bool make_row_repr =

827

(method==METHOD_VECTOR_CENTROID || method==METHOD_VECTOR_MEDIAN);

828

829

R_dissimilarity dist(X, N, dim, members,

830

static_cast<unsigned char>(method),

831

static_cast<unsigned char>(metric),

832

833

make_row_repr);

834

cluster_result Z2(N-1);

835

836

837

Clustering step

838

839

switch (method) {

840

case METHOD_VECTOR_SINGLE:

841

MST_linkage_core_vector(N, dist, Z2);

842

break;

843

844

case METHOD_VECTOR_WARD:

845

generic_linkage_vector<METHOD_METR_WARD>(N, dist, Z2);

846

break;

847

848

case METHOD_VECTOR_CENTROID:

849

generic_linkage_vector_alternative<METHOD_METR_CENTROID>(N, dist, Z2);

850

break;

851

852

case METHOD_VECTOR_MEDIAN:

853

generic_linkage_vector_alternative<METHOD_METR_MEDIAN>(N, dist, Z2);

854

break;

855

856

default:

857

throw std::runtime_error(std::string("Invalid method."));

858

}

859

860

X.free(); // Free the memory now

861

members.free(); // (not strictly necessary).

862

863

dist.postprocess(Z2);

864

865

SEXP m; // return field "merge"

866

PROTECT(m = NEW_INTEGER(2*(N-1)));

867

int * const merge = INTEGER_POINTER(m);

868

869

SEXP dim_m; // Specify that m is an (N-1)×2 matrix

870

PROTECT(dim_m = NEW_INTEGER(2));

871

INTEGER(dim_m)[0] = N-1;

872

INTEGER(dim_m)[1] = 2;

873

SET_DIM(m, dim_m);

874

875

SEXP h; // return field "height"

876

PROTECT(h = NEW_NUMERIC(N-1));

877

double * const height = NUMERIC_POINTER(h);

878

879

SEXP o; // return fiels "order'

880

PROTECT(o = NEW_INTEGER(N));

881

int * const order = INTEGER_POINTER(o);

882

883

if (method==METHOD_VECTOR_SINGLE)

884

generate_R_dendrogram<false>(merge, height, order, Z2, N);

885

else

886

generate_R_dendrogram<true>(merge, height, order, Z2, N);

887

888

SEXP n; // names

889

PROTECT(n = NEW_CHARACTER(3));

890

SET_STRING_ELT(n, 0, COPY_TO_USER_STRING("merge"));

891

SET_STRING_ELT(n, 1, COPY_TO_USER_STRING("height"));

892

SET_STRING_ELT(n, 2, COPY_TO_USER_STRING("order"));

893

894

PROTECT(r = NEW_LIST(3)); // field names in the output list

895

SET_ELEMENT(r, 0, m);

896

SET_ELEMENT(r, 1, h);

897

SET_ELEMENT(r, 2, o);

898

SET_NAMES(r, n);

899

900

UNPROTECT(6); // m, dim_m, h, o, r, n

901

} // try

902

catch (const std::bad_alloc&) {

903

Rf_error( "Memory overflow.");

904

}

905

catch(const std::exception& e){

906

Rf_error( e.what() );

907

}

908

catch(const nan_error&){

909

Rf_error("NaN dissimilarity value.");

910

}

911

catch(...){

912

Rf_error( "C++ exception (unknown reason)." );

913

}

914

915

return r;

916

}

917

918

#if HAVE_VISIBILITY

919

#pragma GCC visibility push(default)

920

#endif

921

void R_init_fastcluster(DllInfo * const info)

922

{

923

R_CallMethodDef callMethods[] = {

924

{"fastcluster", (DL_FUNC) &fastcluster, 4},

925

{"fastcluster_vector", (DL_FUNC) &fastcluster_vector, 5},

926

{NULL, NULL, 0}

927

};

928

R_registerRoutines(info, NULL, callMethods, NULL, NULL);

929

}

930

#if HAVE_VISIBILITY

931

#pragma GCC visibility pop

932

#endif

933

934

} // extern "C"

935

936

#if HAVE_VISIBILITY

937

#pragma GCC visibility pop

938

#endif

Older »