7
Circos::Utils - IO routines for Circos
11
This module is not meant to be used directly.
15
Circos is an application for the generation of publication-quality,
16
circularly composited renditions of genomic data and related
19
Circos is particularly suited for visualizing alignments, conservation
20
and intra and inter-chromosomal relationships. However, Circos can be
21
used to plot any kind of 2D data in a circular layout - its use is not
22
limited to genomics. Circos' use of lines to relate position pairs
23
(ribbons add a thickness parameter to each end) is effective to
24
display relationships between objects or positions on one or more
27
All documentation is in the form of tutorials at L<http://www.circos.ca>.
31
# -------------------------------------------------------------------
40
use Carp qw( carp confess croak );
41
use Storable qw(dclone);
45
use File::Spec::Functions;
47
use Math::VecStat qw(sum);
48
use Params::Validate qw(:all);
49
use Regexp::Common qw(number);
51
use POSIX qw(floor ceil);
53
use lib "$FindBin::RealBin";
54
use lib "$FindBin::RealBin/../lib";
55
use lib "$FindBin::RealBin/lib";
57
use Circos::Constants;
59
use Circos::Configuration;
65
# -------------------------------------------------------------------
67
# Read a data file and associated options.
69
# Depending on the data type, the format is one of
72
# chr start end options
74
# - interval data with value (or label)
75
# chr start end value options
77
# where the options string is of the form
79
# var1=value1,var2=value2,...
81
# v0.48 - if min>max, then the data point is tagged with rev=>1
82
# v0.60 - one-line links are now supported
83
# v0.62 - register which ideograms have data for this track
85
my ($file,$type,$options,$KARYOTYPE) = @_;
87
open(F,$file) || fatal_error("io","cannot_read",$file,$type,$!);
88
printdebug_group("io","reading input file",$file,"type",$type);
90
# specify '-' if a field is not used and to be skipped
93
coord_options => [qw(chr start end options)],
94
coord_value_options => [qw(chr start end value options)],
95
coord_label_options => [qw(chr start end label options)],
96
link_twoline => [qw(id chr start end options)],
97
link => [qw(chr start end chr start end options)],
101
scatter => $line_type->{coord_value_options},
102
line => $line_type->{coord_value_options},
103
histogram => $line_type->{coord_value_options},
104
heatmap => $line_type->{coord_value_options},
105
highlight => $line_type->{coord_options},
106
tile => $line_type->{coord_options},
107
text => $line_type->{coord_label_options},
108
connector => $line_type->{coord_options},
109
link_twoline => $line_type->{link_twoline},
110
link => $line_type->{link},
114
chr => { rx => qr/^[\w.:&-]+$/, comment => "word with optional -.:&" },
115
start => { rx => qr/^-?\d+$/, comment => "integer" },
116
end => { rx => qr/^-?\d+$/, comment => "integer" },
117
value => { rx => qr/^($RE{num}{real},?)+$/, comment => "floating point value" },
118
label => { rx => qr/^.+/, comment => "non-empty string" },
119
options => { rx => qr/=/, comment => "variable value pair (x=y)" },
122
my ($data,$prev_value);
123
my ($recnum,$linenum) = (0,0);
127
my $param_type = $type =~ /link/ ? "link" : $type;
129
my $hide_link_twoline;
131
my $delim_rx = fetch_conf("file_delim") || undef;
132
if ($delim_rx && fetch_conf("file_delim_collapse")) {
136
my $num_fields_less_one = @{$fields->{$type}}-1;
141
s/^\s+//; # strip leading spaces
142
s/\s+\#.*//; # strip comments
143
s/\r$//; # strip windows carriage return
144
next if /^(#|$)/; # skip empty lines
146
my @tok = $delim_rx ? split(/$delim_rx/) : split;
147
if($type eq "link" && @tok < 6 ) {
148
$type = "link_twoline";
152
my $datum = { data => [ ], param => { } };
154
for my $i ( 0..$num_fields_less_one ) {
156
my $value = $tok[$i];
157
next unless defined $value;
158
my $field = $fields->{$type}[$i];
160
# not using this at the moment
161
#next if $field eq $DASH;
163
# make sure the value has the right format, if a format rx is available
164
if( $rx->{$field} ) {
165
my ($rx,$rxcomment) = @{$rx->{$field}}{qw(rx comment)};
166
if ( $value !~ /$rx/ ) {
167
fatal_error("parsedata","bad_field_format",$field,$value,$rxcomment,$file,$line);
171
# if this field is 'chr' make sure this chromosome exits
172
if($field eq "chr") {
173
if ( ! exists $KARYOTYPE->{$value} ) {
174
if(fetch_conf("undefined_ideogram") eq "exit") {
175
fatal_error("parsedata","no_such_ideogram",$value,$file,$line);
179
} elsif ( ! $KARYOTYPE->{$value}{chr}{display} ) {
180
# this chromosome is not displayed
181
if($type eq "link_twoline") {
182
$hide_link_twoline->{$datum->{data}[0]{id}}++;
187
if($field eq "id" && $type eq "link_twoline" && $hide_link_twoline->{$value}) {
191
# If this is an options field, store it in the 'param' key.
192
if ( $field eq "options" && defined $value && $value ne $EMPTY_STR) {
194
if($value =~ /[()]/) {
195
# use the slower parser only when we know the options field includes brackets
196
# color=(255,0,0),x=1 -> color=(255,0,0) x=1
197
@params = parse_csv($value);
199
@params = split(",",$value);
202
for my $param ( @params ) {
203
if ($param =~ /^([^=]+)=(.+)$/) {
206
fatal_error("parsedata","bad_options",$param);
209
$datum->{param} = \%params;
211
# all fields are named
212
my $field_name = $field eq "label" ? "value" : $field;
213
# Store this field in the datum's point. If an entry
214
# with this field already exists, another is created.
215
# This automatically accommodates data with multiple
216
# coordinates (e.g. link)
217
if(! exists $datum->{data}[0]{$field_name}) {
218
$datum->{data}[0]{$field_name} = $value;
220
$datum->{data}[1]{$field_name} = $value;
223
if ( $field eq "value" && defined $prev_value) {
224
# min_value_change requies that adjacent data points vary by a minimum amount
225
if ($options->{min_value_change} && abs($value-$prev_value) < $options->{min_value_change} ) {
228
# skip_run avoids consecutive data points with the same value
229
if ($options->{skip_run} && $value eq $prev_value ) {
235
$prev_value = $datum->{data}[0]{value};
237
# verify that this data point is on a drawn ideogram
239
if ( ! is_on_ideogram($datum) ) {
240
if($type eq "link_twoline") {
241
$hide_link_twoline->{$datum->{data}[0]{id}}++;
246
# if the start/end values are reversed, i.e. end<start, then swap
247
# them and set rev flag
249
for my $point (@{$datum->{data}}) {
250
if($point->{start} > $point->{end}) {
251
@{$point}{qw(start end)} = @{$point}{qw(end start)};
259
# if an odd number of coordinates is inverted, label
260
# this datum inverted
261
if($type =~ /link/ && ! defined $datum->{param}{inv}) {
263
$datum->{param}{inv} = 1;
265
$datum->{param}{inv} = 0;
269
# if padding is required, expand the coordinate
270
if($type ne "text" && $type ne "tile") {
271
if (my $padding = $options->{padding} || $datum->{param}{padding} ) {
272
for my $point (@{$datum->{data}}) {
273
$point->{start} -= $padding;
274
$point->{end} += $padding;
279
# if the minsize parameter is set, then the coordinate span is
280
# expanded to be at least this value
281
if (my $minsize = $options->{minsize} || $datum->{param}{minsize}) {
282
Circos::DataPoint::apply_filter("minsize",
288
# if a set structure was requested, make it
289
if ($options->{addset}) {
290
for my $point (@{$datum->{data}}) {
291
$point->{set} = make_set( @{$point}{qw(start end)});
295
if ($type eq "link_twoline") {
296
my $linkid = $datum->{data}[0]{id};
297
die "no link id".Dumper($datum) if ! defined $linkid;
298
if(! $hide_link_twoline->{$linkid}) {
299
push @{$data->{$linkid}}, $datum;
301
} elsif ( $type eq "histogram" && defined $datum->{data}[0]{value} && $datum->{data}[0]{value} =~ /,/ ) {
303
# for stacked histograms where values are comma separated
305
my @values = split( /,/, $datum->{data}[0]{value} );
306
my ( @values_sorted, @values_idx_sorted );
307
if ( $options->{sort_bin_values} ) {
308
@values_sorted = sort { $b <=> $a } @values;
311
sort { $b->[1] <=> $a->[1] }
312
map { [ $_, $values[$_] ] } ( 0 .. @values - 1 );
314
@values_sorted = @values;
315
@values_idx_sorted = ( 0 .. @values - 1 );
318
for my $i ( 0 .. @values - 1 ) {
319
# first value has the highest z
320
my $z = @values - $i;
321
my $cumulsum = sum( @values_sorted[ 0 .. $i ] );
322
my $thisdatum = dclone($datum);
324
$thisdatum->{data}[0]{value} = $cumulsum;
325
$thisdatum->{param}{z} = $z;
327
if ( $options->{param} ) {
328
for my $param ( keys %{ $options->{param} } ) {
329
my $value = $datum->{param}{$param} || $options->{param}{$param};
330
next unless defined $value;
332
if ($param eq "fill_color") {
333
@param_values = Circos::color_to_list($value);
335
@param_values = split($COMMA, $value)
337
next unless @param_values;
338
my $param_value = $param_values[ $values_idx_sorted[$i] % @param_values ];
339
$thisdatum->{param}{$param} = $param_value;
340
$thisdatum->{param}{stacked} = 1;
343
push @{$data}, $thisdatum;
346
push @{$data}, $datum;
349
if($options->{record_limit} && $recnum >= $options->{record_limit}) {
350
if($type eq "link_twoline") {
351
$hide_link_twoline->{$datum->{data}[0]{id}}++;
357
printdebug_group("io","read",$recnum."/".$linenum,"records/lines");
358
# for old-style links (defined on two lines), collect the
359
# individual link ends, keyed by the record number, into a single
361
if($type eq "link_twoline") {
363
for my $linkid (sort keys %$data) {
364
next if $hide_link_twoline->{$linkid};
366
for my $datum (@{$data->{$linkid}}) {
367
push @{$datum_new->{data}}, $datum->{data}[0];
368
for my $param (keys %{$datum->{param}}) {
369
$datum_new->{param}{$param} = $datum->{param}{$param};
372
my $num_coords = @{$datum_new->{data}};
373
if($num_coords == 1) {
374
fatal_error("links","single_entry",$linkid,$file,Dumper($datum_new));
375
} elsif($num_coords > 2) {
376
fatal_error("links","too_many_entries",$linkid,$file,$num_coords,Dumper($datum_new));
378
push @$data_new, $datum_new;
382
# finally parse the params for data points that have been kept
383
for my $point (@$data) {
384
if($point->{param}) {
385
$point->{param} = Circos::parse_parameters($point->{param},$param_type);