# get, parse and enrich heise rss feeds
#
# call with the feed you would like to retrieve. Currently supported:
#   news      - heise newsticker
#   telepolis - Telepolis
#   security  - heise security news
#
# 26.06.2009 erb suppressed error messages due to unresponsive servers
require 'net/http'
require 'open-uri'
require 'rexml/document'
require 'timeout'
require 'uri'
# Try to retrieve a web page, following up to `depth` redirects (default 5).
#
# url   -- absolute URL to fetch
# depth -- remaining redirect budget; each redirection recurses with depth-1
#
# Returns the response body (String) on success.
# Raises ArgumentError when the redirect budget is exhausted, and an
# HTTP error (via Net::HTTPResponse#error!) on any non-success,
# non-redirect status.
def geturl(url, depth=5)
  raise ArgumentError, 'Followed more 4 redirections. Stopping this nightmare now.' if depth == 0
  response = Net::HTTP.get_response(URI.parse(url))
  case response
  when Net::HTTPSuccess     then response.body
  when Net::HTTPRedirection then geturl(response['location'], depth-1) # follow redirection
  else
    # NOTE(review): the original else branch is lost in this chunk; raising
    # on unexpected statuses follows the Net::HTTP documentation's
    # redirect-following example -- confirm against the full file.
    response.error!
  end
end
# Mirror an upper-case HTTP_PROXY into the lower-case http_proxy variable
# (only when the latter is unset), since open-uri/net-http conventionally
# read the lower-case form.
ENV['http_proxy'] = ENV['HTTP_PROXY'] if ENV['http_proxy'].nil? && !ENV['HTTP_PROXY'].nil?
# Supported feeds: CLI feed name => feed URL.
FEEDS = { "news"      => "http://www.heise.de/newsticker/heise.rdf",
          "telepolis" => "http://www.heise.de/tp/news.rdf",
          "security"  => "http://www.heise.de/security/news/news.rdf" }.freeze

# heise marks the indexable article body with these HTML comments; we use
# them to cut the full text out of the article page.
GOOGLEON  = "<!--googleon: index-->".freeze
GOOGLEOFF = "<!--googleoff: index-->".freeze
# --- command-line handling --------------------------------------------------
# NOTE(review): several lines of the original argument handling are missing
# from this chunk; the flow below is reconstructed around the statements that
# survived (usage text, feed-key listing, unknown-feed message).
if ARGV.length != 1
  print "usage: #{File::basename($0)} <feed>\n"
  print "<feed> is one of\n"
  FEEDS.each_key { |k| print " #{k}\n" }
  exit 1
end

feed = ARGV[0]

unless FEEDS.has_key?(feed)
  print "unknown feed '#{feed}'. Use one of these:\n"
  FEEDS.each_key { |k| print " #{k}\n" }
  exit 1
end
# --- fetch and parse the selected feed --------------------------------------
# Errors are deliberately quiet (see the 2009-06-26 note in the header):
# unresponsive servers should not spam cron mail.
feed_text = nil
begin
  Timeout::timeout(15) do
    f = open(FEEDS[feed]) # open-uri; NOTE(review): deprecated for URLs on Ruby >= 2.7
    feed_text = f.read unless f.nil?
  end
rescue Timeout::Error, StandardError
  # NOTE(review): original rescue body is missing from this chunk;
  # silently giving up matches the script's stated intent.
  exit 1
end

# Bail out when the response is missing or implausibly short.
exit 2 if feed_text.nil? || feed_text.length < 20

#print "Got this feed: ", feed_text, "\n"; STDOUT.flush

xml = REXML::Document.new(feed_text)
# --- enrich every feed item with the article's full text --------------------
# NOTE(review): many interior lines of this loop are missing from the chunk;
# the skeleton (begin/rescue, marker-extraction conditionals) is reconstructed
# around the surviving statements -- confirm against the full file.
xml.elements.each("//item") do |item|
  # extract link to article
  article_url = item.elements['link'].text
  article_url.sub!(%r{from/rss.*$}, "")
  # short form of the URL doubles as a stable id later on
  article_short_url = article_url.sub(%r{/[^/]*--/}, "/")

  # get full text for article (15 s budget; skip the item on timeout)
  # print "<!-- Reading article from ", article_url, " -->\n"; STDOUT.flush
  article_text = nil
  begin
    Timeout::timeout(15) do
      article = open(article_url) # open-uri
      article_text = article.read unless article.nil?
    end
  rescue Timeout::Error
    next
  end

  # skip items whose article body is missing or implausibly short
  next if article_text.nil? || article_text.length < 20

  # strip CDATA wrappers so we can re-embed the text ourselves
  article_text.gsub!(/<!\[CDATA\[/, "")
  article_text.gsub!(/\]\]>/, "")

  # now, heise speciality: get everything between GOOGLEON and GOOGLEOFF patterns :-)
  p1 = article_text.index(GOOGLEON)
  p2 = article_text.index(GOOGLEOFF)
  if p1 && p2
    result = ""
    pos = 0
    while (pos < article_text.length) do
      p1 = article_text.index(GOOGLEON, pos)
      break if p1.nil? # no further indexable section
      p2 = article_text.index(GOOGLEOFF, pos)
      p2 = article_text.length unless p2
      if p1 < p2
        # collect the text between the ON marker and the next OFF marker
        result += article_text[p1 + GOOGLEON.length..p2 - 1]
        pos = p2 + GOOGLEOFF.length
      else
        # stray OFF marker before the next ON marker -- step past the ON marker
        pos = p1 + GOOGLEON.length
      end
    end
    article_text = result
  end

  # get rid of comments and other annoying artifacts
  article_text.gsub!(/<!--LINK_ICON--><img[^>]*><!--\/LINK_ICON-->/m, " ")
  article_text.gsub!(/<!--[^>]*-->/, "")
  article_text.gsub!(/\s+/m, " ")

  # insert full text article into feed
  description = REXML::Element.new("description")
  description.text = REXML::CData.new(article_text)
  item.add_element(description)

  # the short URL serves as the item's unique id
  guid = REXML::Element.new("guid")
  guid.text = article_short_url
  item.add_element(guid)
end
# Emit the enriched feed document on stdout (-1 = no extra indentation).
xml.write($stdout, -1)