3
use constant ACEDB => 'sace://aceserver.cshl.org:2005';
8
my @framework = qw(mex-3 spe-15 lin-17 unc-11 dhc-1 unc-40 smg-5
9
unc-13 unc-29 eat-16 lin-11 spe-9 par-6 unc-59 unc-54 mab-9 lin-42
10
sri-71 smu-2 vab-1 bli-2 dpy-10 him-14 mig-5 unc-4 bli-1 sqt-1 rol-1
11
his-14 unc-52 unc-45 par-2 let-805 sel-8 mab-21 daf-4 sma-3 lin-39
12
unc-32 tax-4 ced-9 tra-1 nob-1 daf-1 ced-2 lin-1 unc-17 dpy-13 unc-5
13
smg-7 dif-1 lin-49 elt-1 daf-14 dpy-20 dpy-26 unc-30 tra-3 sup-24
14
rho-1 egl-8 unc-60 srh-36 apx-1 unc-62 let-418 dpy-11 let-413 sel-9
15
unc-42 egl-9 sma-1 sqt-3 odr-3 hda-1 unc-76 gcy-20 skr-5 par-4 unc-51
16
egl-17 lim-6 fox-1 fax-1 lon-2 unc-97 unc-6 unc-18 mec-10 sop-1 mab-18
17
sdc-2 odr-7 unc-9 unc-3 gas-1 ace-1);
18
my %framework = map {$_=>1} @framework;
19
my %framework_seen = ();
22
This script massages the Wormbase GFF files located at
23
ftp://www.wormbase.org/pub/wormbase/GENE_DUMPS into a version of the
24
GFF format suitable for display by the generic genome browser. It
25
mainly adds comments to the annotations and designates certain
26
well-spaced genetic loci as framework landmarks.
28
This script requires the AcePerl distribution, which is available on
29
CPAN (look for the "Ace" module).
31
To use this script, get the WormBase GFF files from the FTP site
32
listed above and place them in a directory. It might be a good idea
33
to name the directory after the current release, such as WS61. You do
34
not need to uncompress the files.
36
Then give that directory as the argument to this script and capture
37
the script's output to a file:
39
% process_wormbase.pl ./WS61 > wormbase.gff
41
It may take a while before you see output from this script, since it
42
must first fetch gene and protein database from the remote AceDB
43
running at www.wormbase.org.
44
The wormbase.gff file can then be loaded into a Bio::DB::GFF database
45
using the following command:
47
% bulk_load_gff.pl -d <databasename> wormbase.gff
52
die $USAGE if $ARGV[0]=~/^-?-h/i;
54
my $db = Ace->connect(-url=>ACEDB,
55
-query_timeout=>500) or die "Can't open ace database:",Ace->error;
58
@ARGV = <$ARGV[0]/*.gff.gz>;
63
foreach (@ARGV) { # GFF FILES
64
$_ = "gunzip -c $_ |" if /\.gz$/;
67
my (%NOTES,%LOCUS,%GENBANK,%CONFIRMED,%ORFEOME);
68
get_confirmed($db,\%CONFIRMED);
69
get_genbank($db,\%GENBANK);
70
get_loci($db,\%LOCUS);
71
get_notes($db,\%NOTES);
72
get_orfeome($db,\%ORFEOME);
77
my ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) = split /\t/;
78
next if $source eq 'assembly_tag'; # don't want 'em, don't need 'em
79
$ref =~ s/^CHROMOSOME_//;
80
$group =~ s/CHROMOSOME_//;
82
$source ='' if $source eq '*UNKNOWN*';
84
if ($method eq 'Sequence' && ($source eq 'curated' || $source eq 'RNA') && $group =~ /Sequence "(\w+\.\d+[a-z]?)"/) {
86
push @notes,map { qq(Note "$_") } @{$NOTES{$1}} if $NOTES{$1};
87
push @notes,map { qq(Note "$_") } @{$LOCUS{$1}} if $LOCUS{$1};
88
push @notes,qq(Confirmed_by "$CONFIRMED{$1}") if $CONFIRMED{$1};
89
$group = join ' ; ',$group,@notes;
90
if (my $loci = $LOCUS{$1}) {
92
print join("\t",$ref,$source,'gene',$start,$stop,$score,$strand,$phase,"Locus $_"),"\n";
93
print join("\t",$ref,'framework','gene',$start,$stop,$score,$strand,$phase,"Locus $_"),"\n"
94
if $framework{$_} && !$framework_seen{$_}++;
99
if ($method eq 'Sequence' && $source eq 'Genomic_canonical' && $group =~ /Sequence "(\w+)"/) {
100
if (my $accession = $GENBANK{$1}) {
101
$group .= qq( ; Note "Genbank $accession");
102
print join("\t",$ref,'Genbank',$method,$start,$stop,$score,$strand,$phase,"Genbank \"$accession\""),"\n";
106
if ($method eq 'reagent' && $source eq 'Orfeome_project' && $group =~ /PCR_product "([^\"]+)"/) {
107
my $amp = $ORFEOME{$1};
108
$group .= qq( ; Amplified $amp) if defined $amp;
111
# fix variant fields: Variant "T" => Note "T"
112
$group =~ s/(?:Variant|Insert) "(\w+)"/Note "$1"/;
115
if ($group =~ /UTR "([35])_UTR:(\S+)"/) {
118
$group = qq(Sequence "$2");
121
print join("\t",$ref,$source,$method,$start,$stop,$score,$strand,$phase,$group),"\n";
125
my ($db,$hash) = @_; # hash keys are predicted gene names, values are one or more loci names
126
my @genes = $db->fetch(-query=>'find Locus Genomic_sequence',-filltag=>'Genomic_sequence');
127
foreach my $obj (@genes) {
128
my @genomic = $obj->Genomic_sequence or next;
130
push @{$hash->{$_}},$obj;
136
my ($db,$hash) = @_; # hash keys are predicted gene names, values are one or more brief identifications
137
my @genes = $db->fetch(-query=>'find Sequence Brief_identification',-filltag=>'Brief_identification');
138
foreach my $obj (@genes) {
139
my @notes = $obj->Brief_identification or next;
140
$hash->{$obj} = \@notes;
145
my ($db,$hash) = @_; # hash keys are cosmid names, values are genbank accessions (1 to 1)
146
my @cosmids = $db->fetch(-query=>'find Genome_Sequence Database',-filltag=>'Database');
147
for my $cosmid (@cosmids) {
148
my ($database,undef,$accession) = $cosmid->Database(1)->row;
149
next unless $accession;
150
$hash->{$cosmid} = $accession;
155
my ($db,$hash) = @_; # hash keys are predicted gene names, values are confirmation type
156
my @confirmed = $db->fetch(-query=>'find Sequence Confirmed_by',-filltag=>'Confirmed_by');
157
foreach my $obj (@confirmed) {
158
my $confirmed_by = $obj->Confirmed_by || 'Unknown';
159
$hash->{$obj} = $confirmed_by;
165
my @mv_primers = $db->fetch(-query=>'find PCR_Product mv*',-filltag=>'Amplified');
166
for my $obj (@mv_primers) {
167
my $amplified = $obj->Amplified;
168
$hash->{$obj} = $amplified;
176
bp_process_wormbase.pl - Massage WormBase GFF files into a version suitable for the Generic Genome Browser
180
% bp_process_wormbase.pl ./WS61 > wormbase.gff
184
This script massages the Wormbase GFF files located at
185
ftp://www.wormbase.org/pub/wormbase/GENE_DUMPS into a version of the
186
GFF format suitable for display by the generic genome browser. It
187
mainly adds comments to the annotations and designates certain
188
well-spaced genetic loci as framework landmarks.
190
This script requires the AcePerl distribution, which is available on
191
CPAN (look for the "Ace" module).
193
To use this script, get the WormBase GFF files from the FTP site
194
listed above and place them in a directory. It might be a good idea
195
to name the directory after the current release, such as WS61. You do
196
not need to uncompress the files.
198
Then give that directory as the argument to this script and capture
199
the script's output to a file:
201
% bp_process_wormbase.pl ./WS61 > wormbase.gff
203
It may take a while before you see output from this script, since it
204
must first fetch gene and protein database from the remote AceDB
205
running at www.wormbase.org.
206
The wormbase.gff file can then be loaded into a Bio::DB::GFF database
207
using the following command:
209
% bulk_load_gff.pl -d <databasename> wormbase.gff
213
L<Bio::DB::GFF>, L<bulk_load_gff.pl>, L<load_gff.pl>
217
Lincoln Stein E<lt>lstein@cshl.orgE<gt>
219
Copyright (c) 2002 Cold Spring Harbor Laboratory
221
This library is free software; you can redistribute it and/or modify
222
it under the same terms as Perl itself. See DISCLAIMER.txt for
223
disclaimers of warranty.