~ubuntu-branches/ubuntu/lucid/bioperl/lucid

into the GFF-format recognized by Bio::DB::GFF. If the resulting GFF-files are loaded into a Bio::DB:GFF database using the utilities described below, the annotation can be viewed in the Generic Genome Browser (http://www.gmod.org/ggb/) and interfaced with using the Bio::DB:GFF libraries.

(NB these NCBI-datafiles are dumps from their own mapviewer database backend, according to their READMEs)

To produce the GFF-files, download all the chr*sequence.gz files from the FTP-directory above. While in that same directory, run the following example command (see also help clause by running script with no arguments):

process_ncbi_human.pl --locuslink [path to LL.out_hs.gz] chr*sequence.gz

This will unzip all the files on the fly and open an output file with

the name chrom[$chrom]_ncbiannotation.gff for each, read the LocusLink

records into an in-memory hash and then read through the NCBI feature

lines, lookup 'locus' features in the LocusLink hash for details on

'locus' features and print to the proper GFF files. LL.out_hs.gz is

accessible here at the time of writing:

ftp://ftp.ncbi.nih.gov/refseq/LocusLink/LL.out_hs.gz

Note that several of the NCBI features are skipped from the

reformatting, either because their nature is not fully known at this

time (TAG,GS_TRAN) or their sheer volume stands in the way of them

being accessibly in Bio::DB::GFF at this time (EST similarities). You

can easily change this by modifying the $SKIP variable to your liking

to add or remove features, but if you add then you will have to add

handling for those new features.

To bulk-import the GFF-files into a Bio::DB::GFF database, use the

bulk_load_gff.pl utility provided with Bio::DB::GFF

=head2 AUTHOR

Gudmundur Arni Thorisson E<lt>mummi@cshl.orgE<gt>

This code is free software; you can redistribute it

and/or modify it under the same terms as Perl itself.

=cut

use strict;

use Getopt::Long;

use IO::File;

use Bio::DB::GFF::Util::Binning 'bin';

use File::Basename;

my $self = basename($0);

my ($doTSCSNP,$doLocuslink,$debug);

my $opt = &GetOptions ('locuslink=s' => \$doLocuslink,

'tscsnp=s' => \$doTSCSNP,

'debug=s' => \$debug,

);

die <<USAGE if(!defined($opt) || @ARGV == 0);

Usage: $self [options] <GFF filename or wildcard pattern>

Massage NCBI chromosome annotation datafiles into GFF-format suitable for importing into Bio::DB::GFF database. Note that the program handles both unzipped datafiles and gzipped, bzipped or compressed ones, so do not bother with unzipping big downloads before running.

See 'perldoc $self' for more info

Options:

--locuslink Path to zipped LocusLink file, currently located at

ftp://ftp.ncbi.nih.gov/refseq/LocusLink/LL.out_hs.gz

used to lookup gene description and official symbols

--tscsnp DSN string to TSC MySQL database to use for auxiliary

SNP feature attributes (CSHL internal use)

--debug Enable debugging output for the DBI database driver.

(CSHL internal use)

Options can be abbreviated. For example, you can use -l for

--locuslink.

Author: Gudmundur Arni Thorisson <mummi\@cshl.org>

This library is free software; you can redistribute it

and/or modify it under the same terms as Perl itself.

USAGE

;

#Prepare decompression streams for input files, if necessary

my %FH;

100

print "\nPreparing input and output streams:\n";

101

foreach (@ARGV) {

102

my $chrom=basename($_); # NG 02-10-24

103

($chrom) = $chrom=~/0?([0-9,XYxy]{1,2})/; # NG 02-10-24

104

unless($chrom)

105

{

106

print "can't get chrom name from filename '$_', SKIPPING";

107

next;

108

}

109

$FH{'Chr'.$chrom} = IO::File->new("chrom$chrom\_ncbiannotation.gff",">") or die $_,": $!";

110

$_ = "gunzip -c $_ |" if /\.gz$/;

111

$_ = "uncompress -c $_ |" if /\.Z$/;

112

$_ = "bunzip2 -c $_ |" if /\.bz2$/;

113

}

114

115

#If TSC SNP processing is to be performed, connect to db and prepare query

116

my $dbh;

117

my $tsc_sth;

118

if($doTSCSNP)

119

{

120

#Is this an argstring or file with the string in it?

121

my $dbistring = -f $doTSCSNP ? `cat $doTSCSNP` : $doTSCSNP;

122

$dbh = &dbConnect($dbistring);

123

$dbh->trace($debug) if $debug;

124

print "\nConnecting to TSC database using '$dbistring'\n";

125

my $query = qq/

126

SELECT ta.snp_id,

127

ta.variation,

128

tr.locus_id,

129

tr.gene_symbol,

130

tr.fxn_class,

131

tf.institute_code as lab,

132

tf.pop_type,

133

tf.outcome,

134

tf.pooled_data,

135

tf.num_people_typed,

136

tf.allele_a_freq as A,

137

tf.allele_c_freq as C,

138

tf.allele_g_freq as G,

139

tf.allele_t_freq as T

140

FROM tbl_dbsnp_2_tsc dbsnp

141

LEFT JOIN tbl_refsnp_gene_fxn tr on tr.refsnp_id=dbsnp.rs_id

142

LEFT JOIN tbl_allele_freq tf on tf.dbsnp_id=dbsnp.ss_id

143

LEFT JOIN TBL_SNP_ALL ta on ta.snp_id = dbsnp.tsc_id

144

WHERE dbsnp.rs_id = ? limit 1/;

145

$tsc_sth = $dbh->prepare($query);

146

}

147

148

#If Locuslink-processing is to be performed, Read

149

#previously cached data structure from disk

150

my $llData;

151

152

153

if($doLocuslink)

154

{

155

$doLocuslink = "gunzip -c $doLocuslink |" if $doLocuslink =~ /\.gz$/;

156

$doLocuslink = "uncompress -c $doLocuslink |" if $doLocuslink =~ /\.Z$/;

157

$doLocuslink = "bunzip -c $doLocuslink |" if $doLocuslink =~ /\.bz$/;

158

open LL,$doLocuslink || die $!;

159

my $l = 0;

160

while(<LL>)

161

{

162

$l++;

163

print "\r--$l LocusLink records loaded" if $l % 100 ==0;

164

my ($id,$osym,$isym,$mim,$chrom,$loc,$desc,$taxid,$db) = split /\t/;

165

my $name = $osym || $isym;

166

#print " Loading in Locuslink id='$id',osym='$osym',isym='$isym',name='$name',desc='$desc'\n";

167

$llData->{$id}->{name} = $name;

168

$llData->{$id}->{isym} = $isym;

169

$llData->{$id}->{mim} = $mim;

170

$llData->{$id}->{chrom}= $chrom;

171

$llData->{$id}->{loc} = $loc;

172

$llData->{$id}->{desc} = $desc;

173

$llData->{$id}->{taxid}= $taxid;

174

$llData->{$id}->{db} = $db;

175

}

176

close LL;

177

}

178

179

180

my %sources = (snp => 'dbSNP',

181

sts => 'UniSTS',

182

locus => 'LocusLink',

183

transcript => 'RefSeq',

184

transcript_mouse => 'RefSeq',

185

transcript_human => 'RefSeq',

186

component => 'Genbank',

187

contig => 'RefSeq',

188

tag => 'SAGE',

189

gs_tran => 'GenomeScan',

190

);

191

my %classes = (component => 'Sequence',

192

sts => 'STS',

193

snp => 'SNP',

194

locus => 'Locus',

195

transcript => 'Transcript',

196

transcript_human => 'Transcript',

197

transcript_mouse => 'Transcript',

198

contig => 'Contig',

199

clone => 'Clone',

200

tag => 'SAGE_tag',

201

gs_tran => 'Transcript',

202

);

203

204

my %subcomponents = (transcript => 'exon',

205

transcript_human => 'exon',

206

transcript_mouse => 'exon',

207

gs_tran => 'exon',

208

locus => 'exon',

209

component => 'subcomponent',

210

);

211

212

213

#And now process all incoming data streams

214

my $i = 0;

215

my %maxCoords;

216

my $SKIP = q/^EST|component|clone/;

217

my %groups = (); #aggregate parent features

218

my %density = ();

219

my $binSize = 100000;

220

my $max = 0;

221

print "\nStarting main loop:\n";

222

while(<>)

223

{

224

chomp;

225

next if /^\#/;

226

my ($type,$objId,$name,$chrom,$start,$stop,$strand) = split "\t";

227

my ($class,$source);

228

my $score = '.';

229

$strand = '.' unless $strand =~ m/^(\+|\-)$/;

230

$type = lc $type;

231

if($type eq 'gs_tran')

232

{

233

$type = 'transcript';

234

$source = 'GenomeScan';

235

}

236

elsif($type eq 'est_human' && $name =~ /^NM_/)

237

{

238

$type = 'transcript';

239

$source = 'RefSeq-human';

240

}

241

elsif($type eq 'est_mouse' && $name =~ /^NM_/)

242

{

243

$type = 'transcript';

244

$source ='RefSeq-mouse';

245

}

246

next if $type =~ /$SKIP/i;

247

$i++;

248

#my ($chrom,$ctg) = split /\|/,$chromctg;

249

next if $chrom =~ /NT/; #ambigously placed NT-contig at start of chrom

250

$chrom = "Chr$chrom";

251

$max = $stop if $stop > $max;

252

$class ||= $classes{$type};

253

unless($class)

254

{

255

print "need class for type '$type': '$_' (OR add type to \$SKIP pattern\n";

256

next;

257

}

258

my $method = $type;

259

$source ||= $sources{$type} || die "ERROR: need source for type '$type'";

260

$objId = $name if $type =~ /transcript|snp|contig|gs_tran|tag/;

261

my $attributes = qq/$class $objId/;

262

$attributes .= qq/; Name $name/ unless $objId eq $name;

263

my $bin = &bin($start,$stop,$binSize);

264

$bin =~ s/^[10]+\.[0]+//;

265

$bin ||= 0;

266

#print "\$bin='$bin' ($start=>$stop)\n";

267

$density{$chrom}->{$method.'_dens:'.$source}->{$bin}++;

268

269

#Deduce start/stop for certain parent features to be printed

270

#to output file AFTER we've processed everything. This is

271

#necessary because NCBI only gives start/stop values for the child

272

#features, like exons in a gene, but not the whole parent feature

273

if($type =~ /transcript|locus/)

274

{

275

$groups{$type}->{$objId}->{$chrom}->{name} = $name;

276

$groups{$type}->{$objId}->{$chrom}->{start} ||= 9999999999999;

277

$groups{$type}->{$objId}->{$chrom}->{stop} ||= 0;

278

$groups{$type}->{$objId}->{$chrom}->{start} = $start

279

if $start < $groups{$type}->{$objId}->{$chrom}->{start};

280

$groups{$type}->{$objId}->{$chrom}->{stop} = $stop

281

if $stop > $groups{$type}->{$objId}->{$chrom}->{stop};

282

$groups{$type}->{$objId}->{$chrom}->{source} = $source;

283

$groups{$type}->{$objId}->{$chrom}->{strand} = $strand;

284

$groups{$type}->{$objId}->{$chrom}->{method} = $method;

285

$groups{$type}->{$objId}->{$chrom}->{class} = $class;

286

$method = $subcomponents{$type};

287

next if $type eq 'locus';

288

}

289

#This is for internal CSHL usage

290

elsif($type =~ /snp/ && $doTSCSNP)

291

{

292

#print " -got refSNP ID: $name, let's do TSC lookup\n";

293

if(my $tscAttributes = &queryTSCdb($dbh,$name))

294

{

295

#$FH{$chrom}->print(qq/$chrom\tTSC\tsnp\t$start\t$stop\t.\t$strand\t.\t$tscAttributes\n/);

296

$attributes .= $tscAttributes;

297

#print "\$attributes='$attributes'\n";

298

}

299

}

300

301

#Trying to work around the contig pile-up at the start of a chromosome

302

if($method eq 'contig' && $stop == 0)

303

{

304

print STDERR "SKIPPING, contig '$name' as stop = $stop and start = $start.\n";

305

}

306

307

#And finally print to the proper output stream

308

$FH{$chrom}->print(qq/$chrom\t$source\t$method\t$start\t$stop\t.\t$strand\t$score\t$attributes\n/);

309

310

#Collect max coordinates, to deduce chromosome sizes

311

$maxCoords{$chrom} ||= 0;

312

$maxCoords{$chrom} = $stop if $stop > $maxCoords{$chrom};

313

314

#Progress indicator

315

if ( $i % 1000 == 0)

316

{

317

my ($chrom) = $ARGV=~ /0?([0-9,XYxy]{1,2})/;

318

print STDERR "$i total features parsed. Now doing chromosome $chrom";

319

print STDERR -t STDOUT && !$ENV{EMACS} ? "\r" : "\n";

320

}

321

}#MAIN LOOP ENDS

322

323

324

#Print out group features like transcripts and genes that

325

#were collected before and print to the proper output streams

326

print "\nPrinting out collected aggregate features\n";

327

foreach my $type(keys %groups)

328

{

329

foreach my $objId (keys %{$groups{$type}})

330

{

331

#print "\$name='$name'\n";

332

foreach my $chrom(keys %{$groups{$type}->{$objId}})

333

{

334

my $name = $groups{$type}->{$objId}->{$chrom}->{name};

335

my $start = $groups{$type}->{$objId}->{$chrom}->{start};

336

my $stop = $groups{$type}->{$objId}->{$chrom}->{stop};

337

my $strand = $groups{$type}->{$objId}->{$chrom}->{strand};

338

my $method = $groups{$type}->{$objId}->{$chrom}->{method};

339

my $class = $groups{$type}->{$objId}->{$chrom}->{class};

340

my $source = $groups{$type}->{$objId}->{$chrom}->{source};

341

if($type eq 'locus' && $doLocuslink)

342

{

343

my $llInfo = '';

344

my $ll = $llData->{$objId};

345

my $id = $ll->{id};

346

my $note = $ll->{desc} ? qq/Note "$name:$ll->{desc}"/ : ' ';

347

$note =~ s/;/\\;/g;

348

$FH{$chrom}->print( qq/$chrom\t$source\t$method\t$start\t$stop\t.\t$strand\t.\tLocus $objId/);

349

$FH{$chrom}->print(qq/; Name $name/) unless $objId eq $name;

350

$FH{$chrom}->print(qq/; $note\n/);

351

}

352

else

353

{

354

$FH{$chrom}->print(qq/$chrom\t$source\t$method\t$start\t$stop\t.\t$strand\t.\t$class $name; Name $name\n/);

355

}

356

}

357

}

358

}

359

360

# $density{$method.'_dens:'.$source}->{$bin}++;

361

#Print out the collected binned density stats

362

print "Printing out density stats\n";

363

364

foreach my $chrom(sort keys %density)

365

{

366

foreach my $meth(sort keys %{$density{$chrom}})

367

{

368

my $bc = 0;

369

foreach my $bin(sort {$a<=>$b}keys %{$density{$chrom}->{$meth}})

370

{

371

$bc++;

372

my $count = $density{$chrom}->{$meth}->{$bin};

373

my $binstart = $bin*$binSize;

374

my $binstop = $binstart+$binSize;

375

print " \$bin=$bin,\$binstart=$binstart,\$count=$count\n";

376

$FH{$chrom}->print(qq/$chrom\tNCBI\t$meth\t$binstart\t$binstop\t$count\t.\t.\t\n/);

377

}

378

print " $bc bins for method $meth\n";

379

}

380

}

381

382

#Print a line for the reference sequences themselves

383

while(my ($chrom,$max) = each %maxCoords)

384

{

385

$FH{$chrom}->print(qq/$chrom\tassembly\tchromosome\t1\t$max\t.\t+\t.\tSequence \"$chrom\"\n/);

386

}

387

388

print "\nDONE. $i features parsed\n\n";

389

390

#------------------------------------------------

391

# Subroutines

392

#------------------------------------------------

393

394

#For internal CSHL use. Queries our inhouse MySQL database with

395

#SNP Consortium data for various auxiliary data on some SNPs

396

sub queryTSCdb

397

{

398

my $dbh = shift;

399

my $rs_id = shift;

400

my $attributes;

401

$rs_id =~ s/rs//;

402

403

#Baeat vid herna, na i classification string fra dbSNP

404

my ($note,$tsc_id,$var,$lab,$dbsnp_id,$class,$gene_symbol,$locus_id,$freq);

405

$tsc_sth->execute($rs_id) || die $@;

406

my $tscInfo = $tsc_sth->fetchrow_hashref() || return undef;

407

do{

408

#while(my($k,$v) = each %$tscInfo){print " '$k'=>'$v'\n";}

409

#dbSNP stuff, may apply to more than just TSC snps

410

$locus_id = $tscInfo->{locus_id};

411

$class = $tscInfo->{fxn_class};

412

$gene_symbol = $tscInfo->{gene_symbol};

413

$attributes .= qq/; SNPClass $class/ if $class;

414

415

#TSC specific stuff

416

$tsc_id = $tscInfo->{snp_id} || return $attributes;

417

$tsc_id = sprintf("TSC%7.7d", $tsc_id);

418

$var = lc $tscInfo->{variation};

419

$lab = $tscInfo->{institute_code};

420

$dbsnp_id = $tscInfo->{dbsnp_id};

421

$freq = 1 if $tscInfo->{pop_type} && $tscInfo->{outcome} eq 'S';

422

$attributes .= qq/; Alias $tsc_id/;

423

#$attributes .= qq/; Variation $var/ if $var;

424

$attributes .= qq/; AllFreq 1/ if $freq;

425

}while($tscInfo = $tsc_sth->fetchrow_hashref());

426

$note = qq/; Note "$tsc_id($var)"/;

427

$rs_id = $dbh=$tsc_id=$var=$lab=$dbsnp_id=$class=$gene_symbol=$locus_id= $tscInfo=$freq= undef;

428

return $attributes.$note;

429

}

430

431

sub dbConnect

432

{

433

my $dsn = shift;

434

my $dbh;

435

use DBI;

436

eval{$dbh = DBI->connect($dsn,

437

{

438

RaiseError => 1,

439

FetchHashKeyName => 'NAME_lc',

440

}

441

)

442

};

443

if($@ || !$dbh)

444

{

445

print STDERR "ERROR, cannot connect to DB! $@\n";

446

die $DBI::errstr;

447

}

448

return $dbh;

449

}

450

Older »