5
ncbi_2_gff.pl - Massage NCBI chromosome annotation into GFF-format suitable for Bio::DB::GFF
7
=head1 VERSION (CVS-info)
9
$RCSfile: process_ncbi_human.PLS,v $
12
$Date: 2003/02/24 17:44:04 $
17
perl process_ncbi_human.pl [options] /path/to/gzipped/datafile(s)
21
This script massages the chromosome annotation files located at
23
ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/maps/mapview/chromosome_order/
25
into the GFF-format recognized by Bio::DB::GFF. If the resulting GFF-files are loaded into a Bio::DB:GFF database using the utilities described below, the annotation can be viewed in the Generic Genome Browser (http://www.gmod.org/ggb/) and interfaced with using the Bio::DB:GFF libraries.
26
(NB these NCBI-datafiles are dumps from their own mapviewer database backend, according to their READMEs)
28
To produce the GFF-files, download all the chr*sequence.gz files from the FTP-directory above. While in that same directory, run the following example command (see also help clause by running script with no arguments):
30
process_ncbi_human.pl --locuslink [path to LL.out_hs.gz] chr*sequence.gz
32
This will unzip all the files on the fly and open an output file with
33
the name chrom[$chrom]_ncbiannotation.gff for each, read the LocusLink
34
records into an in-memory hash and then read through the NCBI feature
35
lines, lookup 'locus' features in the LocusLink hash for details on
36
'locus' features and print to the proper GFF files. LL.out_hs.gz is
37
accessible here at the time of writing:
39
ftp://ftp.ncbi.nih.gov/refseq/LocusLink/LL.out_hs.gz
41
Note that several of the NCBI features are skipped from the
42
reformatting, either because their nature is not fully known at this
43
time (TAG,GS_TRAN) or their sheer volume stands in the way of them
44
being accessibly in Bio::DB::GFF at this time (EST similarities). You
45
can easily change this by modifying the $SKIP variable to your liking
46
to add or remove features, but if you add then you will have to add
47
handling for those new features.
49
To bulk-import the GFF-files into a Bio::DB::GFF database, use the
50
bulk_load_gff.pl utility provided with Bio::DB::GFF
54
Gudmundur Arni Thorisson E<lt>mummi@cshl.orgE<gt>
56
Copyright (c) 2002 Cold Spring Harbor Laboratory
58
This code is free software; you can redistribute it
59
and/or modify it under the same terms as Perl itself.
66
use Bio::DB::GFF::Util::Binning 'bin';
69
my $self = basename($0);
70
my ($doTSCSNP,$doLocuslink,$debug);
71
my $opt = &GetOptions ('locuslink=s' => \$doLocuslink,
72
'tscsnp=s' => \$doTSCSNP,
75
die <<USAGE if(!defined($opt) || @ARGV == 0);
76
Usage: $self [options] <GFF filename or wildcard pattern>
77
Massage NCBI chromosome annotation datafiles into GFF-format suitable for importing into Bio::DB::GFF database. Note that the program handles both unzipped datafiles and gzipped, bzipped or compressed ones, so do not bother with unzipping big downloads before running.
78
See 'perldoc $self' for more info
80
--locuslink Path to zipped LocusLink file, currently located at
81
ftp://ftp.ncbi.nih.gov/refseq/LocusLink/LL.out_hs.gz
82
used to lookup gene description and official symbols
83
--tscsnp DSN string to TSC MySQL database to use for auxiliary
84
SNP feature attributes (CSHL internal use)
85
--debug Enable debugging output for the DBI database driver.
88
Options can be abbreviated. For example, you can use -l for
90
Author: Gudmundur Arni Thorisson <mummi\@cshl.org>
91
Copyright (c) 2002 Cold Spring Harbor Laboratory
92
This library is free software; you can redistribute it
93
and/or modify it under the same terms as Perl itself.
98
#Prepare decompression streams for input files, if necessary
100
print "\nPreparing input and output streams:\n";
102
my $chrom=basename($_); # NG 02-10-24
103
($chrom) = $chrom=~/0?([0-9,XYxy]{1,2})/; # NG 02-10-24
106
print "can't get chrom name from filename '$_', SKIPPING";
109
$FH{'Chr'.$chrom} = IO::File->new("chrom$chrom\_ncbiannotation.gff",">") or die $_,": $!";
110
$_ = "gunzip -c $_ |" if /\.gz$/;
111
$_ = "uncompress -c $_ |" if /\.Z$/;
112
$_ = "bunzip2 -c $_ |" if /\.bz2$/;
115
#If TSC SNP processing is to be performed, connect to db and prepare query
120
#Is this an argstring or file with the string in it?
121
my $dbistring = -f $doTSCSNP ? `cat $doTSCSNP` : $doTSCSNP;
122
$dbh = &dbConnect($dbistring);
123
$dbh->trace($debug) if $debug;
124
print "\nConnecting to TSC database using '$dbistring'\n";
131
tf.institute_code as lab,
136
tf.allele_a_freq as A,
137
tf.allele_c_freq as C,
138
tf.allele_g_freq as G,
139
tf.allele_t_freq as T
140
FROM tbl_dbsnp_2_tsc dbsnp
141
LEFT JOIN tbl_refsnp_gene_fxn tr on tr.refsnp_id=dbsnp.rs_id
142
LEFT JOIN tbl_allele_freq tf on tf.dbsnp_id=dbsnp.ss_id
143
LEFT JOIN TBL_SNP_ALL ta on ta.snp_id = dbsnp.tsc_id
144
WHERE dbsnp.rs_id = ? limit 1/;
145
$tsc_sth = $dbh->prepare($query);
148
#If Locuslink-processing is to be performed, Read
149
#previously cached data structure from disk
155
$doLocuslink = "gunzip -c $doLocuslink |" if $doLocuslink =~ /\.gz$/;
156
$doLocuslink = "uncompress -c $doLocuslink |" if $doLocuslink =~ /\.Z$/;
157
$doLocuslink = "bunzip -c $doLocuslink |" if $doLocuslink =~ /\.bz$/;
158
open LL,$doLocuslink || die $!;
163
print "\r--$l LocusLink records loaded" if $l % 100 ==0;
164
my ($id,$osym,$isym,$mim,$chrom,$loc,$desc,$taxid,$db) = split /\t/;
165
my $name = $osym || $isym;
166
#print " Loading in Locuslink id='$id',osym='$osym',isym='$isym',name='$name',desc='$desc'\n";
167
$llData->{$id}->{name} = $name;
168
$llData->{$id}->{isym} = $isym;
169
$llData->{$id}->{mim} = $mim;
170
$llData->{$id}->{chrom}= $chrom;
171
$llData->{$id}->{loc} = $loc;
172
$llData->{$id}->{desc} = $desc;
173
$llData->{$id}->{taxid}= $taxid;
174
$llData->{$id}->{db} = $db;
180
my %sources = (snp => 'dbSNP',
182
locus => 'LocusLink',
183
transcript => 'RefSeq',
184
transcript_mouse => 'RefSeq',
185
transcript_human => 'RefSeq',
186
component => 'Genbank',
189
gs_tran => 'GenomeScan',
191
my %classes = (component => 'Sequence',
195
transcript => 'Transcript',
196
transcript_human => 'Transcript',
197
transcript_mouse => 'Transcript',
201
gs_tran => 'Transcript',
204
my %subcomponents = (transcript => 'exon',
205
transcript_human => 'exon',
206
transcript_mouse => 'exon',
209
component => 'subcomponent',
213
#And now process all incoming data streams
216
my $SKIP = q/^EST|component|clone/;
217
my %groups = (); #aggregate parent features
219
my $binSize = 100000;
221
print "\nStarting main loop:\n";
226
my ($type,$objId,$name,$chrom,$start,$stop,$strand) = split "\t";
229
$strand = '.' unless $strand =~ m/^(\+|\-)$/;
231
if($type eq 'gs_tran')
233
$type = 'transcript';
234
$source = 'GenomeScan';
236
elsif($type eq 'est_human' && $name =~ /^NM_/)
238
$type = 'transcript';
239
$source = 'RefSeq-human';
241
elsif($type eq 'est_mouse' && $name =~ /^NM_/)
243
$type = 'transcript';
244
$source ='RefSeq-mouse';
246
next if $type =~ /$SKIP/i;
248
#my ($chrom,$ctg) = split /\|/,$chromctg;
249
next if $chrom =~ /NT/; #ambigously placed NT-contig at start of chrom
250
$chrom = "Chr$chrom";
251
$max = $stop if $stop > $max;
252
$class ||= $classes{$type};
255
print "need class for type '$type': '$_' (OR add type to \$SKIP pattern\n";
259
$source ||= $sources{$type} || die "ERROR: need source for type '$type'";
260
$objId = $name if $type =~ /transcript|snp|contig|gs_tran|tag/;
261
my $attributes = qq/$class $objId/;
262
$attributes .= qq/; Name $name/ unless $objId eq $name;
263
my $bin = &bin($start,$stop,$binSize);
264
$bin =~ s/^[10]+\.[0]+//;
266
#print "\$bin='$bin' ($start=>$stop)\n";
267
$density{$chrom}->{$method.'_dens:'.$source}->{$bin}++;
269
#Deduce start/stop for certain parent features to be printed
270
#to output file AFTER we've processed everything. This is
271
#necessary because NCBI only gives start/stop values for the child
272
#features, like exons in a gene, but not the whole parent feature
273
if($type =~ /transcript|locus/)
275
$groups{$type}->{$objId}->{$chrom}->{name} = $name;
276
$groups{$type}->{$objId}->{$chrom}->{start} ||= 9999999999999;
277
$groups{$type}->{$objId}->{$chrom}->{stop} ||= 0;
278
$groups{$type}->{$objId}->{$chrom}->{start} = $start
279
if $start < $groups{$type}->{$objId}->{$chrom}->{start};
280
$groups{$type}->{$objId}->{$chrom}->{stop} = $stop
281
if $stop > $groups{$type}->{$objId}->{$chrom}->{stop};
282
$groups{$type}->{$objId}->{$chrom}->{source} = $source;
283
$groups{$type}->{$objId}->{$chrom}->{strand} = $strand;
284
$groups{$type}->{$objId}->{$chrom}->{method} = $method;
285
$groups{$type}->{$objId}->{$chrom}->{class} = $class;
286
$method = $subcomponents{$type};
287
next if $type eq 'locus';
289
#This is for internal CSHL usage
290
elsif($type =~ /snp/ && $doTSCSNP)
292
#print " -got refSNP ID: $name, let's do TSC lookup\n";
293
if(my $tscAttributes = &queryTSCdb($dbh,$name))
295
#$FH{$chrom}->print(qq/$chrom\tTSC\tsnp\t$start\t$stop\t.\t$strand\t.\t$tscAttributes\n/);
296
$attributes .= $tscAttributes;
297
#print "\$attributes='$attributes'\n";
301
#Trying to work around the contig pile-up at the start of a chromosome
302
if($method eq 'contig' && $stop == 0)
304
print STDERR "SKIPPING, contig '$name' as stop = $stop and start = $start.\n";
307
#And finally print to the proper output stream
308
$FH{$chrom}->print(qq/$chrom\t$source\t$method\t$start\t$stop\t.\t$strand\t$score\t$attributes\n/);
310
#Collect max coordinates, to deduce chromosome sizes
311
$maxCoords{$chrom} ||= 0;
312
$maxCoords{$chrom} = $stop if $stop > $maxCoords{$chrom};
317
my ($chrom) = $ARGV=~ /0?([0-9,XYxy]{1,2})/;
318
print STDERR "$i total features parsed. Now doing chromosome $chrom";
319
print STDERR -t STDOUT && !$ENV{EMACS} ? "\r" : "\n";
324
#Print out group features like transcripts and genes that
325
#were collected before and print to the proper output streams
326
print "\nPrinting out collected aggregate features\n";
327
foreach my $type(keys %groups)
329
foreach my $objId (keys %{$groups{$type}})
331
#print "\$name='$name'\n";
332
foreach my $chrom(keys %{$groups{$type}->{$objId}})
334
my $name = $groups{$type}->{$objId}->{$chrom}->{name};
335
my $start = $groups{$type}->{$objId}->{$chrom}->{start};
336
my $stop = $groups{$type}->{$objId}->{$chrom}->{stop};
337
my $strand = $groups{$type}->{$objId}->{$chrom}->{strand};
338
my $method = $groups{$type}->{$objId}->{$chrom}->{method};
339
my $class = $groups{$type}->{$objId}->{$chrom}->{class};
340
my $source = $groups{$type}->{$objId}->{$chrom}->{source};
341
if($type eq 'locus' && $doLocuslink)
344
my $ll = $llData->{$objId};
346
my $note = $ll->{desc} ? qq/Note "$name:$ll->{desc}"/ : ' ';
348
$FH{$chrom}->print( qq/$chrom\t$source\t$method\t$start\t$stop\t.\t$strand\t.\tLocus $objId/);
349
$FH{$chrom}->print(qq/; Name $name/) unless $objId eq $name;
350
$FH{$chrom}->print(qq/; $note\n/);
354
$FH{$chrom}->print(qq/$chrom\t$source\t$method\t$start\t$stop\t.\t$strand\t.\t$class $name; Name $name\n/);
360
# $density{$method.'_dens:'.$source}->{$bin}++;
361
#Print out the collected binned density stats
362
print "Printing out density stats\n";
364
foreach my $chrom(sort keys %density)
366
foreach my $meth(sort keys %{$density{$chrom}})
369
foreach my $bin(sort {$a<=>$b}keys %{$density{$chrom}->{$meth}})
372
my $count = $density{$chrom}->{$meth}->{$bin};
373
my $binstart = $bin*$binSize;
374
my $binstop = $binstart+$binSize;
375
print " \$bin=$bin,\$binstart=$binstart,\$count=$count\n";
376
$FH{$chrom}->print(qq/$chrom\tNCBI\t$meth\t$binstart\t$binstop\t$count\t.\t.\t\n/);
378
print " $bc bins for method $meth\n";
382
#Print a line for the reference sequences themselves
383
while(my ($chrom,$max) = each %maxCoords)
385
$FH{$chrom}->print(qq/$chrom\tassembly\tchromosome\t1\t$max\t.\t+\t.\tSequence \"$chrom\"\n/);
388
print "\nDONE. $i features parsed\n\n";
390
#------------------------------------------------
392
#------------------------------------------------
394
#For internal CSHL use. Queries our inhouse MySQL database with
395
#SNP Consortium data for various auxiliary data on some SNPs
403
#Baeat vid herna, na i classification string fra dbSNP
404
my ($note,$tsc_id,$var,$lab,$dbsnp_id,$class,$gene_symbol,$locus_id,$freq);
405
$tsc_sth->execute($rs_id) || die $@;
406
my $tscInfo = $tsc_sth->fetchrow_hashref() || return undef;
408
#while(my($k,$v) = each %$tscInfo){print " '$k'=>'$v'\n";}
409
#dbSNP stuff, may apply to more than just TSC snps
410
$locus_id = $tscInfo->{locus_id};
411
$class = $tscInfo->{fxn_class};
412
$gene_symbol = $tscInfo->{gene_symbol};
413
$attributes .= qq/; SNPClass $class/ if $class;
416
$tsc_id = $tscInfo->{snp_id} || return $attributes;
417
$tsc_id = sprintf("TSC%7.7d", $tsc_id);
418
$var = lc $tscInfo->{variation};
419
$lab = $tscInfo->{institute_code};
420
$dbsnp_id = $tscInfo->{dbsnp_id};
421
$freq = 1 if $tscInfo->{pop_type} && $tscInfo->{outcome} eq 'S';
422
$attributes .= qq/; Alias $tsc_id/;
423
#$attributes .= qq/; Variation $var/ if $var;
424
$attributes .= qq/; AllFreq 1/ if $freq;
425
}while($tscInfo = $tsc_sth->fetchrow_hashref());
426
$note = qq/; Note "$tsc_id($var)"/;
427
$rs_id = $dbh=$tsc_id=$var=$lab=$dbsnp_id=$class=$gene_symbol=$locus_id= $tscInfo=$freq= undef;
428
return $attributes.$note;
436
eval{$dbh = DBI->connect($dsn,
439
FetchHashKeyName => 'NAME_lc',
445
print STDERR "ERROR, cannot connect to DB! $@\n";