3
# Split a fasta file into different files
4
# Stefan Kurtz, October 10, 2009.
6
# first parameter is prefix of files to generate.
7
# second parameter is number of files to generate, 0 means to generate
8
# one file per sequence
9
# third parameter is input file.
11
# let infile be the name of the input file in fasta format. Suppose
12
# that it contains 1952 sequences. Then
14
# splitmultifasta.rb tmp 0 infile
16
# generates 1952 files named tmp-0000, tmp-0001, ..., tmp-1951.
17
# each containing one sequence
18
# To check that the split was correct, execute the following commands:
21
# when nothing is reported by diff, then everything is fine. Otherwise
22
# check if there are other (not generated) files that begin with
25
def countnumofsequences(inputfile)
27
File.open(inputfile).each_line do |line|
35
def openoutfile(filename)
37
outfp = File.new(filename,"w")
39
STDERR.puts "#{$0}: cannot open \"#{filename}\": #{error}"
46
return (Math.log(n.to_f)/Math.log(10.0)).to_i
49
def splitfiles(inputfile,splitprefix,numoffiles,numofsequences)
58
numwidth = 1+log10func(numofsequences-1)
60
maxseqnum = numofsequences/numoffiles + numofsequences % numoffiles
61
numwidth = 1+log10func(numoffiles-1)
63
File.open(inputfile).each_line do |line|
64
if line.match(/^\s*$/) # discard blank line
66
elsif line.match(/^\s*#/) # discard comment line
68
elsif line.match(/^>/)
69
if seqcount >= maxseqnum
73
outfilename = sprintf("%s-%0*d",splitprefix,numwidth,filenum)
74
fh = openoutfile(outfilename)
84
STDERR.puts "Usage: #{$0} <splitprefix> <numoffiles> <fastafile>"
89
numoffiles = ARGV[1].to_i
92
numofsequences = countnumofsequences(inputfile)
93
splitfiles(inputfile,splitprefix,numoffiles,numofsequences)