3
#Copyright (c) 2012 Ben Gimpert, http://blog.someben.com/
5
#Permission is hereby granted, free of charge, to any person obtaining a
6
#copy of this software and associated documentation files (the "Software"),
7
#to deal in the Software without restriction, including without limitation
8
#the rights to use, copy, modify, merge, publish, distribute, sublicense,
9
#and/or sell copies of the Software, and to permit persons to whom the
10
#Software is furnished to do so, subject to the following conditions:
12
#The above copyright notice and this permission notice shall be included in
13
#all copies or substantial portions of the Software.
15
#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18
#THE X CONSORTIUM BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
19
#WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
20
#OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
#Except as contained in this notice, the name of the Ben Gimpert shall not
24
#be used in advertising or otherwise to promote the sale, use or other dealings
25
#in this Software without prior written authorization from Ben Gimpert.
31
# Command-line option declarations. Every handler below records its setting in
# opt_map; later code tests for a setting with opt_map.has_key?(...), so a key
# is present only when the corresponding option was given.
# NOTE(review): opt_map is created outside this view, and the terminator of the
# banner heredoc and the closing of this block are not visible here either.
OptionParser.new do |opts|
32
opts.banner = <<EOF_BANNER
33
*** Vowpal Wabbit Regression Utilities ***
34
Usage: #{File.basename(__FILE__, ".rb")} [options]
38
# Switch without argument: enables the label summary.
opts.on("-v", "--ex_val_desc", "Describe the distribution of example labels (to STDERR)") do |opt_val|
39
opt_map[:ex_val_desc] = true
41
# Switch without argument; note the long flag is --vals while the key is :labels.
opts.on("-L", "--vals", "Output only example labels") do |opt_val|
42
opt_map[:labels] = true
44
# Label bounds, converted with to_f (to_f yields 0.0 for non-numeric text, so a
# mistyped value is not rejected).
opts.on("-x", "--min_ex_val=X", "Output only those examples with labels greater than or equal to X") do |opt_val|
45
opt_map[:min_ex_val] = opt_val.to_f
47
opts.on("-X", "--max_ex_val=X", "Output only those examples with labels less than or equal to X") do |opt_val|
48
opt_map[:max_ex_val] = opt_val.to_f
50
# Feature-count bounds, converted with to_i (to_i yields 0 for non-numeric text).
opts.on("-n", "--min_num_feats=X", "Output only those examples with no less than X features across all namespaces") do |opt_val|
51
opt_map[:min_num_feats] = opt_val.to_i
53
opts.on("-N", "--max_num_feats=X", "Output only those examples with no more than X features across all namespaces") do |opt_val|
54
opt_map[:max_num_feats] = opt_val.to_i
56
# Replacement importance values for positively / negatively labelled examples.
opts.on("-p", "--pos_ex_val_imp=X", "Set importance of examples with positive labels to X") do |opt_val|
57
opt_map[:pos_ex_val_imp] = opt_val.to_f
59
opts.on("-P", "--neg_ex_val_imp=X", "Set importance of examples with negative labels to X") do |opt_val|
60
opt_map[:neg_ex_val_imp] = opt_val.to_f
62
# Switch without argument: rewrite labels as class labels.
opts.on("-c", "--to_class", "Convert positive (negative) example labels to +1 (-1) labels") do |opt_val|
63
opt_map[:to_class] = true
70
# Start of the per-example processing. The loop that supplies ex_line is outside
# this view; `next` moves on to the following input line.
# Blank (or whitespace-only) lines are skipped.
next if ex_line.to_s.strip.empty?
71
# Split on "|": the first segment is the label section, the remaining segments
# are the feature sections.
ex_line_pipe_segs = ex_line.split("|")
72
ex_label_seg = ex_line_pipe_segs.first
73
# Whitespace-separated fields of the label section, taken by position; fields
# that are absent come back as nil.
ex_label, ex_imp, ex_init_pred, ex_tag = ex_label_seg.split(" ")
75
# With --ex_val_desc, collect every label as a Float for the summary printed
# after the loop. This happens before any filter below, so filtered-out
# examples are still counted in that summary.
if opt_map.has_key?(:ex_val_desc)
76
# Created lazily on the first example seen.
ex_labels = [] if ex_labels.nil?
77
ex_labels << ex_label.to_f
80
# --vals handling; the body of this branch is not visible in this view.
if opt_map.has_key?(:labels)
85
# Label-range filters: drop the example when its label is below min_ex_val ...
if opt_map.has_key?(:min_ex_val)
86
next if ex_label.to_f < opt_map[:min_ex_val]
88
# ... or above max_ex_val. Both bounds are inclusive.
if opt_map.has_key?(:max_ex_val)
89
next if ex_label.to_f > opt_map[:max_ex_val]
92
if opt_map.has_key?(:min_num_feats) || opt_map.has_key?(:max_num_feats)
94
ex_line_pipe_segs[1..-1].each do |ex_line_feature_seg|
95
num_features = ex_line_feature_seg.split(" ").length - 1 # -1 for the namespace
97
if opt_map.has_key?(:min_num_feats)
98
next if num_features < opt_map[:min_num_feats]
99
else # if opt_map.has_key?(:max_num_feats)
100
next if num_features > opt_map[:max_num_feats]
104
# Importance override: a strictly positive label takes --pos_ex_val_imp, a
# strictly negative one takes --neg_ex_val_imp, when that option was given.
# A label of zero never has its importance replaced.
label_value = ex_label.to_f
imp_key =
  if label_value > 0
    :pos_ex_val_imp
  elsif label_value < 0
    :neg_ex_val_imp
  end
ex_imp = opt_map[imp_key] if imp_key && opt_map.has_key?(imp_key)

# --to_class: strictly positive labels become 1, everything else becomes -1.
if opt_map.has_key?(:to_class)
  if ex_label.to_f > 0
    ex_label = 1
  else
    ex_label = -1
  end
end

# Rebuild the example: the label, then whichever optional label-section fields
# are present, then the untouched feature segments re-joined with "|".
label_section = [ex_label.to_s] + [ex_imp, ex_init_pred, ex_tag].compact
new_ex_line = label_section.join(" ") + " |" + ex_line_pipe_segs[1..-1].join("|")
$stdout.puts new_ex_line
122
# Label summary for --ex_val_desc, written to STDERR so it stays separate from
# the examples written to STDOUT.
#
# Reports the number of collected labels, how many are strictly positive and
# strictly negative, their mean, and their population standard deviation
# (the squared deviations are divided by the label count, not count - 1).
if opt_map.has_key?(:ex_val_desc)
  # ex_labels is only turned into an Array when the first example is seen, so
  # it may still be nil if the input held no examples; treat that as "no labels".
  described_labels = ex_labels || []
  label_count = described_labels.length

  pos_count = described_labels.count { |label| label > 0 }
  neg_count = described_labels.count { |label| label < 0 }

  # Float accumulators: with zero labels the divisions yield NaN rather than
  # raising ZeroDivisionError, and NaN is what gets printed.
  av_ex_label = described_labels.inject(0.0) { |sum, label| sum + label } / label_count

  sd_ex_label = described_labels.inject(0.0) { |sum, label| sum + (label - av_ex_label) ** 2 }
  sd_ex_label /= label_count
  sd_ex_label = sd_ex_label ** 0.5

  $stderr.puts <<EOF_EX_VAL_DESC
ExLabelCount=#{label_count}
ExLabelPosCount=#{pos_count}
ExLabelNegCount=#{neg_count}
ExLabelAverage=#{av_ex_label}
ExLabelStDev=#{sd_ex_label}
EOF_EX_VAL_DESC
end