1
# text.rb : A simple backend to deal with basic text files.
2
# Copyright (C) 2006 Vincent Fourmond
4
# This program is free software; you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License as published by
6
# the Free Software Foundation; either version 2 of the License, or
7
# (at your option) any later version.
9
# This program is distributed in the hope that it will be useful,
10
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
# GNU General Public License for more details.
14
# You should have received a copy of the GNU General Public License
15
# along with this program; if not, write to the Free Software
16
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20
require 'Dobjects/Dvector'
21
require 'Dobjects/Function'
28
Version::register_svn_info('$Revision: 191 $', '$Date: 2010-11-07 15:53:08 +0100 (Sun, 07 Nov 2010) $')
33
# A module for easy use of NaN in operations
43
class TextBackend < Backend
45
# A constant holding a relation extension -> command to
46
# decompress (to be fed to sprintf with the filename as argument)
48
".gz" => "gunzip -c %s",
49
".bz2" => "bunzip2 -c %s",
50
".lzma" => "unlzma -c %s",
51
".lz" => "unlzma -c %s",
52
".xz" => "unxz -c %s",
57
describe 'text', 'Text format', <<EOD
58
This backend can read text files in a format close to the one understood
59
by gnuplot and the like.
62
# Inherit the baseline handling, can be useful !
63
# inherit_parameters :base_line
65
param_accessor :skip, 'skip', "Skip lines", 'integer',
66
"Number of lines to be skipped at the beginning of the file"
68
param_accessor :default_column_spec, 'col',
69
"Default column specification", 'text',
70
"Which columns to use when the @1:2 syntax is not used"
72
param_accessor :split, 'split', "Split into subsets", 'boolean',
73
"If true, splits files into subsets on blank/non number lines"
76
param_accessor :separator, 'separator', "Data columns separator",
78
"The columns separator. Defaults to /\s+/"
80
param_accessor :param_regex, 'parameters', "Parameters parsing",
82
"Regular expression for extracting parameters from a file. Defaults to nil (ie nothing)"
84
param_accessor :header_line_regex, 'header-line',
85
'Header line regular expression',
87
"Regular expression indicating the header line (containing column names) (default /^##/"
92
# Current is the name of the last file used. Necessary for '' specs.
93
@current_data = nil # The data of the last file used.
95
@included_modules = [NaN] # to make sure we give them to
96
# Dvector.compute_formula
97
@default_column_spec = "1:2"
101
# We don't split data by default.
106
@header_line_regex = /^\#\#\s*/
110
# Override Backend's cache - for now.
111
@cache = {} # A cache file_name -> data
113
@param_cache = {} # Same thing as cache, but for parameters
115
@headers_cache = {} # Same thing as cache, but for header
122
@included_modules << mod
125
# Expands specifications into few sets. This function will separate the
126
# set into a file spec and a col spec. Within the col spec, the 2##6
127
# keyword is used to expand to 2,3,4,5,6. 2## followed by a non-digit
128
# expands to 2,...,last column in the file. For now, the expansion
129
# stops on the first occurrence found, and the second form doesn't
130
# work yet. But soon...
131
def expand_sets(spec)
132
if m = /(\d+)##(\D|$)/.match(spec)
139
ret << m.pre_match + i.to_s + trail + m.post_match
151
# Returns an IO object suitable to acquire data from it for
152
# the given _file_, which can be one of the following:
154
# * a compressed file name
156
def get_io_object(file)
159
elsif file =~ /(.*?)\|\s*$/ # A pipe
161
elsif not File.readable?(file)
162
# Try to find a compressed version
163
for ext,method in UNCOMPRESSORS
164
if File.readable? "#{file}#{ext}"
165
info { "Using compressed file #{file}#{ext} in stead of #{file}" }
166
return IO.popen(method % "#{file}#{ext}")
170
for ext, method in UNCOMPRESSORS
172
info { "Taking file #{file} as a compressed file" }
173
return IO.popen(method % file)
176
return File::open(file)
178
error { "Could not open #{file}" }
182
# A line is invalid if it is blank or starts
183
# neither with a digit nor +, - or .
185
# Maybe to be improved later.
186
InvalidLineRE = /^\s*$|^\s*[^\d+.\s-]+/
188
# Returns a string corresponding to the given _set_ of the
192
def get_set_string(io, set)
194
last_line_is_invalid = true
199
if line =~ InvalidLineRE
200
debug { "Found invalid line at #{line_number}" }
201
if ! last_line_is_invalid
202
# We begin a new set.
204
debug { "Found set #{cur_set} at line #{line_number}" }
209
last_line_is_invalid = true
211
last_line_is_invalid = false
220
# Returns an IO object corresponding to the given file.
223
return get_io_object(file)
225
file =~ /(.*?)(?:#(\d+))?$/; # ; to make ruby-mode indent correctly.
232
debug { "Trying to get set #{set} from file '#{filename}'" }
233
str = get_set_string(get_io_object(filename), set)
234
return StringIO.new(str)
239
# A proper writer for @param_regex
240
def param_regex=(val)
243
elsif val =~ /([^\\]|^)\(/ # Has capturing groups
244
@param_regex = /#{val}/
245
else # Treat as separator
246
@param_regex = /(\S+)\s*#{val}\s*(\S+)/
250
# Turns an array of comments into a hash[param] -> value
251
def parse_parameters(comments)
254
if line =~ @param_regex
261
# Turns an array of comments into a hash column name -> column
263
def parse_header_line(comments)
265
if line =~ @header_line_regex
266
colnames = line.gsub(@header_line_regex,'').split(@separator)
279
# Reads data from a file. If needed, extract the file from the
280
# columns specification.
282
# \todo the cache really should include things such as time of
283
# last modification and various parameters that influence the
284
# reading of the file, and the parameters read from the file
285
# using #parse_parameters
287
# \todo There should be a real global handling of meta-data
288
# extracted from files, so that they could be included for
289
# instance in the automatic labels ? (and we could have fun
290
# improving this one ?)
292
# \warning This needs Tioga r561
297
name = file # As file will be modified.
298
if ! @cache.key?(file) # Read the file if it is not cached.
300
fancy_read_options = {'index_col' => true,
301
'skip_first' => @skip,
303
'comment_out' => comments
305
io_set = get_io_set(file)
306
debug { "Fancy read '#{file}', options #{fancy_read_options.inspect}" }
307
@cache[name] = Dvector.fancy_read(io_set, nil, fancy_read_options)
310
@param_cache[name] = parse_parameters(comments)
311
info { "Read #{@param_cache[name].size} parameters from #{name}" }
312
debug { "Parameters read: #{@param_cache[name].inspect}" }
314
if @header_line_regex
315
@headers_cache[name] = parse_header_line(comments)
316
info { "Read #{@headers_cache[name].size} column names from #{name}" }
317
debug { "Got: #{@headers_cache[name].inspect}" }
320
## @todo These are not very satisfying; ideally, the data
321
## information should be embedded into @cache[name] rather
322
## than as external variables. Well...
323
@current_parameters = @param_cache[name]
324
@current_header = @headers_cache[name]
329
# This is called by the architecture to get the data. It
330
# splits the set name into filename@cols, reads the file if
331
# necessary and calls get_data
332
def query_dataset(set)
333
if set =~ /(.*)@(.*)/
337
col_spec = @default_column_spec
341
@current_data = read_file(file)
345
# Whether or not we need to compute formulas:
347
compute_formulas = true
349
compute_formulas = false
352
return Dataset.dataset_from_spec(set, col_spec) do |col|
353
get_data_column(col, compute_formulas,
354
@current_parameters, @current_header)
358
# Gets the data corresponding to the given column. If
359
# _compute_formulas_ is true, the column specification is
360
# taken to be a formula (in the spirit of gnuplot's)
361
def get_data_column(column, compute_formulas = false,
362
parameters = nil, header = nil)
366
for k,v in parameters
367
formula.gsub!(/\b#{k}\b/, v.to_s)
370
formula.gsub!(/\$(\d+)/, 'column[\1]')
373
formula.gsub!("$#{k}$", "column[#{v}]")
376
debug { "Using formula #{formula} for column spec: #{column}" }
377
return Dvector.compute_formula(formula,
381
return @current_data[column.to_i].dup
385
# # Turns a target => values specification into something usable as
386
# # error bars, that is :xmin, :xmax and the like hashes. The rules
387
# # are the following:
388
# # * ?min/?max are passed on directly;
389
# # * ?e(abs) are transformed into ?min = ? - ?eabs, ?max = ? + ?eabs
390
# # * ?eu(p)/?ed(own) are transformed respectively into ? +/- ?...
391
# # * ?er(el) become ?min = ?*(1 - ?erel), ?max = ?*(1 + ?erel)
392
# # * ?erup/?erdown follow the same pattern...
393
# def compute_error_bars(values)
395
# for key in values.keys
397
# when /^[xy](min|max)?$/
398
# target[key] = values[key].dup # Just to make sure.
399
# when /^(.)e(a(bs?)?)?$/
400
# target["#{$1}min".to_sym] = values[$1.to_sym] - values[key]
401
# target["#{$1}max".to_sym] = values[$1.to_sym] + values[key]
403
# target["#{$1}max".to_sym] = values[$1.to_sym] + values[key]
404
# when /^(.)ed(o(wn?)?)?$/
405
# target["#{$1}min".to_sym] = values[$1.to_sym] - values[key]
406
# when /^(.)er(el?)?$/
407
# target["#{$1}min".to_sym] = values[$1.to_sym] *
408
# (values[key].neg + 1)
409
# target["#{$1}max".to_sym] = values[$1.to_sym] *
411
# when /^(.)erd(o(wn?)?)?$/
412
# target["#{$1}min".to_sym] = values[$1.to_sym] *
413
# (values[key].neg + 1)
415
# target["#{$1}max".to_sym] = values[$1.to_sym] *
418
# warn "Somehow, the target specification #{key} " +
419
# "didn't make it through"