~ubuntu-branches/ubuntu/saucy/circos/saucy-proposed

« back to all changes in this revision

Viewing changes to lib/Circos/IO.pm

  • Committer: Package Import Robot
  • Author(s): Olivier Sallou
  • Date: 2013-05-20 09:01:27 UTC
  • mfrom: (1.1.3)
  • Revision ID: package-import@ubuntu.com-20130520090127-s5nbumg8563x00ee
Tags: 0.64-1
New upstream release

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
package Circos::IO;
 
2
 
 
3
=pod
 
4
 
 
5
=head1 NAME
 
6
 
 
7
Circos::Utils - IO routines for Circos
 
8
 
 
9
=head1 SYNOPSIS
 
10
 
 
11
This module is not meant to be used directly.
 
12
 
 
13
=head1 DESCRIPTION
 
14
 
 
15
Circos is an application for the generation of publication-quality,
 
16
circularly composited renditions of genomic data and related
 
17
annotations.
 
18
 
 
19
Circos is particularly suited for visualizing alignments, conservation
 
20
and intra and inter-chromosomal relationships. However, Circos can be
 
21
used to plot any kind of 2D data in a circular layout - its use is not
 
22
limited to genomics. Circos' use of lines to relate position pairs
 
23
(ribbons add a thickness parameter to each end) is effective to
 
24
display relationships between objects or positions on one or more
 
25
scales.
 
26
 
 
27
All documentation is in the form of tutorials at L<http://www.circos.ca>.
 
28
 
 
29
=cut
 
30
 
 
31
# -------------------------------------------------------------------
 
32
 
 
33
use strict;
 
34
use warnings;
 
35
 
 
36
use base 'Exporter';
 
37
our @EXPORT = qw(
 
38
);
 
39
 
 
40
use Carp qw( carp confess croak );
 
41
use Storable qw(dclone);
 
42
use Cwd;
 
43
use FindBin;
 
44
use Data::Dumper;
 
45
use File::Spec::Functions;
 
46
use Math::Round;
 
47
use Math::VecStat qw(sum);
 
48
use Params::Validate qw(:all);
 
49
use Regexp::Common qw(number);
 
50
 
 
51
use POSIX qw(floor ceil);
 
52
 
 
53
use lib "$FindBin::RealBin";
 
54
use lib "$FindBin::RealBin/../lib";
 
55
use lib "$FindBin::RealBin/lib";
 
56
 
 
57
use Circos::Constants;
 
58
use Circos::Colors;
 
59
use Circos::Configuration;
 
60
use Circos::Debug;
 
61
use Circos::Error;
 
62
use Circos::Ideogram;
 
63
use Circos::Utils;
 
64
 
 
65
# -------------------------------------------------------------------
 
66
sub read_data_file {
 
67
        # Read a data file and associated options.
 
68
        #
 
69
        # Depending on the data type, the format is one of
 
70
        #
 
71
        # - interval data
 
72
        # chr start end options
 
73
        #
 
74
        # - interval data with value (or label)
 
75
        # chr start end value options
 
76
        #
 
77
        # where the options string is of the form
 
78
        #
 
79
        # var1=value1,var2=value2,...
 
80
        #
 
81
        # v0.48 - if min>max, then the data point is tagged with rev=>1
 
82
        # v0.60 - one-line links are now supported
 
83
        # v0.62 - register which ideograms have data for this track
 
84
 
 
85
        my ($file,$type,$options,$KARYOTYPE) = @_;
 
86
 
 
87
        open(F,$file) || fatal_error("io","cannot_read",$file,$type,$!);
 
88
        printdebug_group("io","reading input file",$file,"type",$type);
 
89
 
 
90
        # specify '-' if a field is not used and to be skipped
 
91
 
 
92
        my $line_type = {
 
93
                                                                         coord_options       => [qw(chr start end options)],
 
94
                                                                         coord_value_options => [qw(chr start end value options)],
 
95
                                                                         coord_label_options => [qw(chr start end label options)],
 
96
                                                                         link_twoline        => [qw(id chr start end options)],
 
97
                                                                         link                => [qw(chr start end chr start end options)],
 
98
                                                                        };
 
99
 
 
100
        my $fields = {
 
101
                                                                scatter      => $line_type->{coord_value_options},
 
102
                                                                line         => $line_type->{coord_value_options},
 
103
                                                                histogram    => $line_type->{coord_value_options},
 
104
                                                                heatmap      => $line_type->{coord_value_options},
 
105
                                                                highlight    => $line_type->{coord_options},
 
106
                                                                tile         => $line_type->{coord_options},
 
107
                                                                text         => $line_type->{coord_label_options},
 
108
                                                                connector    => $line_type->{coord_options},
 
109
                                                                link_twoline => $line_type->{link_twoline},
 
110
                                                                link         => $line_type->{link},
 
111
                                                         };
 
112
 
 
113
        my $rx = {
 
114
                                                chr     => { rx => qr/^[\w.:&-]+$/,      comment => "word with optional -.:&" },
 
115
                                                start   => { rx => qr/^-?\d+$/,          comment => "integer" },
 
116
                                                end     => { rx => qr/^-?\d+$/,          comment => "integer" },
 
117
                                                value   => { rx => qr/^($RE{num}{real},?)+$/, comment => "floating point value" },
 
118
                                                label   => { rx => qr/^.+/,              comment => "non-empty string" },
 
119
                                                options => { rx => qr/=/,                comment => "variable value pair (x=y)" },
 
120
                                         };
 
121
 
 
122
        my ($data,$prev_value);
 
123
        my ($recnum,$linenum) = (0,0);
 
124
 
 
125
        start_timer("io");
 
126
 
 
127
        my $param_type = $type =~ /link/ ? "link" : $type;
 
128
 
 
129
        my $hide_link_twoline;
 
130
 
 
131
        my $delim_rx = fetch_conf("file_delim") || undef;
 
132
        if ($delim_rx && fetch_conf("file_delim_collapse")) {
 
133
                $delim_rx .= "+";
 
134
        }
 
135
 
 
136
        my $num_fields_less_one = @{$fields->{$type}}-1;
 
137
 
 
138
 LINE:
 
139
        while (<F>) {
 
140
                chomp;
 
141
                s/^\s+//;         # strip leading spaces
 
142
                s/\s+\#.*//;      # strip comments
 
143
                s/\r$//;          # strip windows carriage return
 
144
                next if /^(#|$)/; # skip empty lines
 
145
                $linenum++;
 
146
                my @tok = $delim_rx ? split(/$delim_rx/) : split;
 
147
                if($type eq "link" && @tok < 6 ) {
 
148
                        $type = "link_twoline";
 
149
                }
 
150
 
 
151
                my $line  = $_;
 
152
                my $datum = { data => [ ], param => { } };
 
153
 
 
154
                for my $i ( 0..$num_fields_less_one ) {
 
155
 
 
156
            my $value = $tok[$i];
 
157
                        next unless defined $value;
 
158
            my $field = $fields->{$type}[$i];
 
159
 
 
160
                        # not using this at the moment
 
161
            #next if $field eq $DASH;
 
162
                        
 
163
                        # make sure the value has the right format, if a format rx is available
 
164
                        if( $rx->{$field} ) {
 
165
                                my ($rx,$rxcomment) = @{$rx->{$field}}{qw(rx comment)};
 
166
                                if ( $value !~ /$rx/ ) {
 
167
                                        fatal_error("parsedata","bad_field_format",$field,$value,$rxcomment,$file,$line);
 
168
                                }
 
169
                        }
 
170
 
 
171
                        # if this field is 'chr' make sure this chromosome exits
 
172
                        if($field eq "chr") {
 
173
                                if ( ! exists $KARYOTYPE->{$value} ) {
 
174
                                        if(fetch_conf("undefined_ideogram") eq "exit") {
 
175
                                                fatal_error("parsedata","no_such_ideogram",$value,$file,$line);
 
176
                                        } else {
 
177
                                                next LINE;
 
178
                                        }
 
179
                                } elsif ( ! $KARYOTYPE->{$value}{chr}{display} ) {
 
180
                                        # this chromosome is not displayed
 
181
                                        if($type eq "link_twoline") {
 
182
                                                $hide_link_twoline->{$datum->{data}[0]{id}}++;
 
183
                                        }
 
184
                                        next LINE;
 
185
                                }
 
186
                        }
 
187
                        if($field eq "id" && $type eq "link_twoline" && $hide_link_twoline->{$value}) {
 
188
                                next LINE;
 
189
                        }
 
190
 
 
191
                        # If this is an options field, store it in the 'param' key.
 
192
            if ( $field eq "options" && defined $value && $value ne $EMPTY_STR) {
 
193
                                my @params;
 
194
                                if($value =~ /[()]/) {
 
195
                                        # use the slower parser only when we know the options field includes brackets
 
196
                                        # color=(255,0,0),x=1 -> color=(255,0,0) x=1
 
197
                                        @params = parse_csv($value);
 
198
                                } else {
 
199
                                        @params = split(",",$value);
 
200
                                }
 
201
                                my %params;
 
202
                                for my $param ( @params ) {
 
203
                                        if ($param =~ /^([^=]+)=(.+)$/) {
 
204
                                                $params{$1} = $2;
 
205
                                        } else {
 
206
                                                fatal_error("parsedata","bad_options",$param);
 
207
                                        }
 
208
                                }
 
209
                                $datum->{param} = \%params;
 
210
            } else {
 
211
                                # all fields are named 
 
212
                                my $field_name = $field eq "label" ? "value" : $field;
 
213
                                # Store this field in the datum's point. If an entry
 
214
                                # with this field already exists, another is created.
 
215
                                # This automatically accommodates data with multiple
 
216
                                # coordinates (e.g. link)
 
217
                                if(! exists $datum->{data}[0]{$field_name}) {
 
218
                                        $datum->{data}[0]{$field_name} = $value;
 
219
                                } else {
 
220
                                        $datum->{data}[1]{$field_name} = $value;
 
221
                                }
 
222
            }
 
223
                        if ( $field eq "value" && defined $prev_value) {
 
224
                                # min_value_change requies that adjacent data points vary by a minimum amount
 
225
                                if ($options->{min_value_change} && abs($value-$prev_value) < $options->{min_value_change} ) {
 
226
                                        next LINE;
 
227
                                }
 
228
                                # skip_run avoids consecutive data points with the same value
 
229
                                if ($options->{skip_run} &&     $value eq $prev_value ) {
 
230
                                        next LINE;
 
231
                                }
 
232
            }
 
233
                }
 
234
 
 
235
                $prev_value = $datum->{data}[0]{value};
 
236
 
 
237
                # verify that this data point is on a drawn ideogram
 
238
 
 
239
                if ( ! is_on_ideogram($datum) ) {
 
240
                        if($type eq "link_twoline") {
 
241
                                $hide_link_twoline->{$datum->{data}[0]{id}}++;
 
242
                        }
 
243
                        next LINE;
 
244
                }
 
245
        
 
246
                # if the start/end values are reversed, i.e. end<start, then swap
 
247
                # them and set rev flag
 
248
                my $num_rev = 0;
 
249
                for my $point (@{$datum->{data}}) {
 
250
                        if($point->{start} > $point->{end}) {
 
251
                                @{$point}{qw(start end)} = @{$point}{qw(end start)};
 
252
                                $point->{rev} = 1;
 
253
                                $num_rev++;
 
254
                        } else {
 
255
                                $point->{rev} = 0;
 
256
                        }
 
257
                }
 
258
 
 
259
                # if an odd number of coordinates is inverted, label
 
260
                # this datum inverted
 
261
                if($type =~ /link/ && ! defined $datum->{param}{inv}) {
 
262
                        if($num_rev % 2) {
 
263
                                $datum->{param}{inv} = 1;
 
264
                        } else {
 
265
                                $datum->{param}{inv} = 0;
 
266
                        }
 
267
                }
 
268
 
 
269
                # if padding is required, expand the coordinate
 
270
                if($type ne "text" && $type ne "tile") {
 
271
                        if (my $padding = $options->{padding} || $datum->{param}{padding} ) {
 
272
                                for my $point (@{$datum->{data}}) {
 
273
                                        $point->{start} -= $padding;
 
274
                                        $point->{end}   += $padding;
 
275
                                }
 
276
                        }
 
277
                }
 
278
 
 
279
                # if the minsize parameter is set, then the coordinate span is
 
280
                # expanded to be at least this value
 
281
                if (my $minsize = $options->{minsize} || $datum->{param}{minsize}) {
 
282
                        Circos::DataPoint::apply_filter("minsize",
 
283
                                                                                                                                                        $minsize,
 
284
                                                                                                                                                        $datum);
 
285
                        
 
286
                }
 
287
 
 
288
                # if a set structure was requested, make it
 
289
                if ($options->{addset}) {
 
290
                        for my $point (@{$datum->{data}}) {
 
291
                                $point->{set} = make_set( @{$point}{qw(start end)});
 
292
                        }
 
293
                }
 
294
 
 
295
                if ($type eq "link_twoline") {
 
296
                        my $linkid = $datum->{data}[0]{id};
 
297
                        die "no link id".Dumper($datum) if ! defined $linkid;
 
298
                        if(! $hide_link_twoline->{$linkid}) {
 
299
                                push @{$data->{$linkid}}, $datum;
 
300
                        }
 
301
                } elsif ( $type eq "histogram" && defined $datum->{data}[0]{value} && $datum->{data}[0]{value} =~ /,/ ) {
 
302
                        #
 
303
                        # for stacked histograms where values are comma separated
 
304
                        #
 
305
                        my @values = split( /,/, $datum->{data}[0]{value} );
 
306
                        my ( @values_sorted, @values_idx_sorted );
 
307
                        if ( $options->{sort_bin_values} ) {
 
308
                                @values_sorted = sort { $b <=> $a } @values;
 
309
                                @values_idx_sorted =
 
310
                                        map  { $_->[0] }
 
311
                                                sort { $b->[1] <=> $a->[1] }
 
312
                                                        map  { [ $_, $values[$_] ] } ( 0 .. @values - 1 );
 
313
                        } else {
 
314
                                @values_sorted     = @values;
 
315
                                @values_idx_sorted = ( 0 .. @values - 1 );
 
316
                        }
 
317
 
 
318
                        for my $i ( 0 .. @values - 1 ) {
 
319
                                # first value has the highest z
 
320
                                my $z         = @values - $i;
 
321
                                my $cumulsum  = sum( @values_sorted[ 0 .. $i ] );
 
322
                                my $thisdatum = dclone($datum);
 
323
 
 
324
                                $thisdatum->{data}[0]{value} = $cumulsum;
 
325
                                $thisdatum->{param}{z}       = $z;
 
326
 
 
327
                                if ( $options->{param} ) {
 
328
                                        for my $param ( keys %{ $options->{param} } ) {
 
329
                                                my $value = $datum->{param}{$param} || $options->{param}{$param};
 
330
                                                next unless defined $value;
 
331
                                                my @param_values;
 
332
                                                if ($param eq "fill_color") {
 
333
                                                        @param_values = Circos::color_to_list($value);
 
334
                                                } else {
 
335
                                                        @param_values = split($COMMA, $value)
 
336
                                                }
 
337
                                                next unless @param_values;
 
338
                                                my $param_value = $param_values[ $values_idx_sorted[$i] % @param_values ];
 
339
                                                $thisdatum->{param}{$param}  = $param_value;
 
340
                                                $thisdatum->{param}{stacked} = 1;
 
341
                                        }
 
342
                                }
 
343
                                push @{$data}, $thisdatum;
 
344
                        }
 
345
                } else {
 
346
                        push @{$data}, $datum;
 
347
                }
 
348
                $recnum++;
 
349
                if($options->{record_limit} && $recnum >= $options->{record_limit}) {
 
350
                        if($type eq "link_twoline") {
 
351
                                $hide_link_twoline->{$datum->{data}[0]{id}}++;
 
352
                        }
 
353
                        last;
 
354
                }
 
355
        }
 
356
        stop_timer("io");
 
357
        printdebug_group("io","read",$recnum."/".$linenum,"records/lines");    
 
358
        # for old-style links (defined on two lines), collect the
 
359
        # individual link ends, keyed by the record number, into a single
 
360
        # data structure
 
361
        if($type eq "link_twoline") {
 
362
                my $data_new = [];
 
363
                for my $linkid (sort keys %$data) {
 
364
                        next if $hide_link_twoline->{$linkid};
 
365
                        my $datum_new;
 
366
                        for my $datum (@{$data->{$linkid}}) {
 
367
                                push @{$datum_new->{data}}, $datum->{data}[0];
 
368
                                for my $param (keys %{$datum->{param}}) {
 
369
                                        $datum_new->{param}{$param} = $datum->{param}{$param};
 
370
                                }
 
371
                        }
 
372
                        my $num_coords = @{$datum_new->{data}};
 
373
                        if($num_coords == 1) {
 
374
                                fatal_error("links","single_entry",$linkid,$file,Dumper($datum_new));
 
375
                        } elsif($num_coords > 2) {
 
376
                                fatal_error("links","too_many_entries",$linkid,$file,$num_coords,Dumper($datum_new));
 
377
                        }
 
378
                        push @$data_new, $datum_new;
 
379
                }
 
380
                $data = $data_new;
 
381
        }
 
382
        # finally parse the params for data points that have been kept
 
383
        for my $point (@$data) {
 
384
                if($point->{param}) {
 
385
                        $point->{param} = Circos::parse_parameters($point->{param},$param_type);
 
386
                }
 
387
        }
 
388
        return $data;
 
389
}
 
390
 
 
391
1;