# get, parse and enrich heise rss feeds
#
# call with the feed you would like to retrieve. Currently supported:
#   news      - heise newsticker
#   telepolis - Telepolis
#   security  - heise security news
#
# 26.06.2009 erb suppressed error messages due to unresponsive servers
require 'net/http'
require 'open-uri'
require 'rexml/document'
require 'timeout'
require 'uri'
# Try to retrieve a web page, following up to `depth` redirects (default 5).
#
# url   -- absolute URL to fetch
# depth -- remaining redirect budget; each redirection recurses with depth-1
#
# Returns the response body (String) on success.
# Raises ArgumentError when the redirect budget is exhausted, and an
# HTTP error (via Net::HTTPResponse#error!) on any non-success,
# non-redirect status.
def geturl(url, depth=5)
  raise ArgumentError, 'Followed more 4 redirections. Stopping this nightmare now.' if depth == 0
  response = Net::HTTP.get_response(URI.parse(url))
  case response
  when Net::HTTPSuccess     then response.body
  when Net::HTTPRedirection then geturl(response['location'], depth-1) # follow redirection
  else
    # NOTE(review): the original else branch is lost in this chunk; raising
    # on unexpected statuses follows the Net::HTTP documentation's
    # redirect-following example -- confirm against the full file.
    response.error!
  end
end
# Mirror an upper-case HTTP_PROXY into the lower-case http_proxy variable
# (only when the latter is unset), since open-uri/net-http conventionally
# read the lower-case form.
ENV['http_proxy'] = ENV['HTTP_PROXY'] if ENV['http_proxy'].nil? && !ENV['HTTP_PROXY'].nil?
# Supported feeds: CLI feed name => feed URL.
FEEDS = { "news"      => "http://www.heise.de/newsticker/heise.rdf",
          "telepolis" => "http://www.heise.de/tp/news.rdf",
          "security"  => "http://www.heise.de/security/news/news.rdf" }.freeze

# heise marks the indexable article body with these HTML comments; we use
# them to cut the full text out of the article page.
GOOGLEON  = "<!--googleon: index-->".freeze
GOOGLEOFF = "<!--googleoff: index-->".freeze
# --- command-line handling --------------------------------------------------
# NOTE(review): several lines of the original argument handling are missing
# from this chunk; the flow below is reconstructed around the statements that
# survived (usage text, feed-key listing, unknown-feed message).
if ARGV.length != 1
  print "usage: #{File::basename($0)} <feed>\n"
  print "<feed> is one of\n"
  FEEDS.each_key { |k| print " #{k}\n" }
  exit 1
end

feed = ARGV[0]

unless FEEDS.has_key?(feed)
  print "unknown feed '#{feed}'. Use one of these:\n"
  FEEDS.each_key { |k| print " #{k}\n" }
  exit 1
end
# --- fetch and parse the selected feed --------------------------------------
# Errors are deliberately quiet (see the 2009-06-26 note in the header):
# unresponsive servers should not spam cron mail.
feed_text = nil
begin
  Timeout::timeout(15) do
    f = open(FEEDS[feed]) # open-uri; NOTE(review): deprecated for URLs on Ruby >= 2.7
    feed_text = f.read unless f.nil?
  end
rescue Timeout::Error, StandardError
  # NOTE(review): original rescue body is missing from this chunk;
  # silently giving up matches the script's stated intent.
  exit 1
end

# Bail out when the response is missing or implausibly short.
exit 2 if feed_text.nil? || feed_text.length < 20

#print "Got this feed: ", feed_text, "\n"; STDOUT.flush

xml = REXML::Document.new(feed_text)
# --- enrich every feed item with the article's full text --------------------
# NOTE(review): many interior lines of this loop are missing from the chunk;
# the skeleton (begin/rescue, marker-extraction conditionals) is reconstructed
# around the surviving statements -- confirm against the full file.
xml.elements.each("//item") do |item|
  # extract link to article
  article_url = item.elements['link'].text
  article_url.sub!(%r{from/rss.*$}, "")
  # short form of the URL doubles as a stable id later on
  article_short_url = article_url.sub(%r{/[^/]*--/}, "/")

  # get full text for article (15 s budget; skip the item on timeout)
  # print "<!-- Reading article from ", article_url, " -->\n"; STDOUT.flush
  article_text = nil
  begin
    Timeout::timeout(15) do
      article = open(article_url) # open-uri
      article_text = article.read unless article.nil?
    end
  rescue Timeout::Error
    next
  end

  # skip items whose article body is missing or implausibly short
  next if article_text.nil? || article_text.length < 20

  # strip CDATA wrappers so we can re-embed the text ourselves
  article_text.gsub!(/<!\[CDATA\[/, "")
  article_text.gsub!(/\]\]>/, "")

  # now, heise speciality: get everything between GOOGLEON and GOOGLEOFF patterns :-)
  p1 = article_text.index(GOOGLEON)
  p2 = article_text.index(GOOGLEOFF)
  if p1 && p2
    result = ""
    pos = 0
    while (pos < article_text.length) do
      p1 = article_text.index(GOOGLEON, pos)
      break if p1.nil? # no further indexable section
      p2 = article_text.index(GOOGLEOFF, pos)
      p2 = article_text.length unless p2
      if p1 < p2
        # collect the text between the ON marker and the next OFF marker
        result += article_text[p1 + GOOGLEON.length..p2 - 1]
        pos = p2 + GOOGLEOFF.length
      else
        # stray OFF marker before the next ON marker -- step past the ON marker
        pos = p1 + GOOGLEON.length
      end
    end
    article_text = result
  end

  # get rid of comments and other annoying artifacts
  article_text.gsub!(/<!--LINK_ICON--><img[^>]*><!--\/LINK_ICON-->/m, " ")
  article_text.gsub!(/<!--[^>]*-->/, "")
  article_text.gsub!(/\s+/m, " ")

  # insert full text article into feed
  description = REXML::Element.new("description")
  description.text = REXML::CData.new(article_text)
  item.add_element(description)

  # the short URL serves as the item's unique id
  guid = REXML::Element.new("guid")
  guid.text = article_short_url
  item.add_element(guid)
end
# Emit the enriched feed document on stdout (-1 = no extra indentation).
xml.write($stdout, -1)