~stefanor/ibid/translation-334764

« back to all changes in this revision

Viewing changes to ibid/plugins/google.py

  • Committer: Stefano Rivera
  • Date: 2009-03-05 15:04:46 UTC
  • mfrom: (557.1.8 google-api-336419)
  • Revision ID: stefano@rivera.za.net-20090305150446-azo5kfq0g8v2myod
Reworked google plugin to use Google Ajax API where possible.
https://code.edge.launchpad.net/~stefanor/ibid/google-api-336419/+merge/4120

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
import htmlentitydefs
 
2
import re
 
3
import simplejson
1
4
from urllib import quote
2
5
from urllib2 import urlopen, Request
 
6
 
3
7
from BeautifulSoup import BeautifulSoup
4
8
 
5
9
from ibid.plugins import Processor, match
6
10
from ibid.config import Option
 
11
from ibid.utils import ibid_version
7
12
 
8
13
help = {'google': u'Retrieves results from Google and Google Calculator.'}
9
14
 
10
 
user_agent = 'Mozilla/5.0'
11
 
 
12
 
class Search(Processor):
13
 
    u"""google [for] <term>"""
14
 
    feature = 'google'
15
 
 
16
 
    user_agent = Option('user_agent', 'HTTP user agent to present to Google', user_agent)
17
 
 
18
 
    @match(r'^google\s+(?:(za)\s+)?(?:for\s+)?(.+?)$')
19
 
    def search(self, event, country, query):
20
 
        url = 'http://www.google.com/search?num=3&q=%s' % quote(query)
 
15
default_user_agent = 'Mozilla/5.0'
 
16
default_referrer = "http://ibid.omnia.za.net/"
 
17
 
 
18
def de_entity(text):
    """Replace HTML entities in *text* with their unicode characters.

    Handles numeric entities (&#nnn;) and named entities (&amp; etc.).
    A named entity not present in htmlentitydefs.name2codepoint will
    raise KeyError.
    """
    # Numeric entities: &#123; -> the corresponding unicode character.
    # FIX: use raw strings for the regexes — "\d"/"\w" in plain strings
    # only work by accident and warn on modern interpreters.
    replace = lambda match: unichr(int(match.group(1)))
    text = re.sub(r"&#(\d+);", replace, text)

    # Named entities: &amp;, &lt;, ... via the stdlib entity table.
    replace = lambda match: unichr(htmlentitydefs.name2codepoint[match.group(1)])
    text = re.sub(r"&(\w+);", replace, text)
    return text
 
26
 
 
27
class GoogleAPISearch(Processor):
    u"""google [for] <term>
    googlefight [for] <term> and <term>"""

    feature = 'google'

    # Optional Google API key; appended to requests when configured.
    api_key = Option('api_key', 'Your Google API Key (optional)', None)
    referrer = Option('referrer', 'The referrer string to use (API searches)', default_referrer)

    google_api_url = "http://ajax.googleapis.com/ajax/services/search/web?v=1.0&q=%s"

    def _google_api_search(self, query, resultsize="large"):
        """Query the Google AJAX Search API and return the decoded JSON.

        resultsize is the API's rsz parameter ("small" or "large").
        """
        url = self.google_api_url % quote(query)
        url += "&rsz=%s" % resultsize
        if self.api_key:
            # FIX: previously quoted the undefined name 'key', raising
            # NameError whenever an API key was configured.
            url += '&key=%s' % quote(self.api_key)
        req = Request(url, headers={
            # FIX: '%' binds tighter than 'or', so the old expression
            # ("Ibid/%s" % ibid_version() or "dev") could never fall back
            # to "dev" — "Ibid/None" is truthy.
            'user-agent': "Ibid/%s" % (ibid_version() or "dev"),
            'referrer': self.referrer,
        })
        f = urlopen(req)
        result = f.read()
        f.close()
        result = simplejson.loads(result)
        return result

    @match(r'^google\s+(?:for\s+)?(.+?)$')
    def search(self, event, query):
        """Respond with the top web results for query."""
        items = self._google_api_search(query)
        results = []
        for item in items["responseData"]["results"]:
            title = item["titleNoFormatting"]
            results.append(u'"%s" %s' % (de_entity(title), item["unescapedUrl"]))

        if results:
            event.addresponse(u', '.join(results))
        else:
            event.addresponse(u"Wow! Google couldn't find anything.")

    @match(r'^(?:rank|(?:google(?:fight|compare|cmp)))\s+(?:for\s+)?(.+?)\s+and\s+(.+?)$')
    def googlefight(self, event, term1, term2):
        """Compare estimated result counts for two search terms."""
        count1 = int(self._google_api_search(term1, "small")["responseData"]["cursor"].get("estimatedResultCount", 0))
        count2 = int(self._google_api_search(term2, "small")["responseData"]["cursor"].get("estimatedResultCount", 0))
        # and/or tuple selection is safe here: both tuples are non-empty.
        event.addresponse(u'%s wins with %i hits, %s had %i hits' %
            (count1 > count2 and (term1, count1, term2, count2) or (term2, count2, term1, count1))
        )
 
75
 
 
76
# Unfortunately google API search doesn't support all of google search's
 
77
# features.
 
78
# Dear Google: We promise we don't bite.
 
79
class GoogleScrapeSearch(Processor):
 
80
    u"""gcalc <expression>
 
81
    gdefine <term>
 
82
    google.<TLD> [for] <terms>"""
 
83
 
 
84
    feature = 'google'
 
85
 
 
86
    user_agent = Option('user_agent', 'HTTP user agent to present to Google (for non-API searches)', default_user_agent)
 
87
    google_scrape_url = "http://www.google.com/search?q=%s"
 
88
 
 
89
    def _google_scrape_search(self, query, country=None):
 
90
        url = self.google_scrape_url
21
91
        if country:
22
 
            url = url + '&meta=cr%%3Dcountry%s' % country.upper()
23
 
 
24
 
        f = urlopen(Request(url, headers={'user-agent': self.user_agent}))
 
92
            url += "&cr=country%s" % country.upper()
 
93
        f = urlopen(Request(url % quote(query), headers={'user-agent': self.user_agent}))
25
94
        soup = BeautifulSoup(f.read())
26
95
        f.close()
27
 
 
28
 
        results = []
29
 
        items = soup.findAll('li')[:10]
30
 
        for item in items:
31
 
            try:
32
 
                url = item.a['href']
33
 
                title = u''.join([e.string for e in item.a.contents])
34
 
                results.append(u'"%s" %s' % (title, url))
35
 
            except Exception:
36
 
                pass
37
 
 
38
 
        event.addresponse(u', '.join(results))
39
 
 
40
 
class Calc(Processor):
41
 
    u"""gcalc <expression>"""
42
 
    feature = 'google'
43
 
 
44
 
    user_agent = Option('user_agent', 'HTTP user agent to present to Google', user_agent)
 
96
        return soup
45
97
 
46
98
    @match(r'^gcalc\s+(.+)$')
47
99
    def calc(self, event, expression):
48
 
        f = urlopen(Request('http://www.google.com/search?num=1&q=%s' % quote(expression), headers={'user-agent': self.user_agent}))
49
 
        soup = BeautifulSoup(f.read())
50
 
        f.close()
 
100
        soup = self._google_scrape_search(expression)
51
101
 
52
102
        font = soup.find('font', size='+1')
53
103
        if not font:
55
105
        else:
56
106
            event.addresponse(font.b.string)
57
107
 
58
 
class Define(Processor):
59
 
    u"""gdefine <term>"""
60
 
    feature = 'google'
61
 
 
62
 
    user_agent = Option('user_agent', 'HTTP user agent to present to Google', user_agent)
63
 
 
64
108
    @match(r'^gdefine\s+(.+)$')
65
109
    def define(self, event, term):
66
 
        f = urlopen(Request('http://www.google.com/search?num=1&q=define:%s' % quote(term), headers={'user-agent': self.user_agent}))
67
 
        soup = BeautifulSoup(f.read())
68
 
        f.close()
 
110
        soup = self._google_scrape_search("define:%s" % term)
69
111
 
70
112
        definitions = []
71
113
        for li in soup.findAll('li'):
72
 
            definitions.append('"%s"' % li.contents[0].strip())
 
114
            definitions.append(de_entity(li.contents[0].strip()))
73
115
 
74
116
        if definitions:
75
 
            event.addresponse(', '.join(definitions))
 
117
            event.addresponse(u' :: '.join(definitions))
76
118
        else:
77
119
            event.addresponse(u"Are you making up words again?")
78
120
 
79
 
class Compare(Processor):
80
 
    u"""google cmp [for] <term> and <term>"""
81
 
    feature = 'google'
82
 
 
83
 
    user_agent = Option('user_agent', 'HTTP user agent to present to Google', user_agent)
84
 
 
85
 
 
86
 
    def results(self, term):
87
 
        f = urlopen(Request('http://www.google.com/search?num=1&q=%s' % quote(term), headers={'user-agent': self.user_agent}))
88
 
        soup = BeautifulSoup(f.read())
89
 
        f.close()
90
 
 
91
 
        noresults = soup.findAll('div', attrs={'class': 'med'})
92
 
        if noresults and len(noresults) > 1 and noresults[1].find('did not match any documents') != -1:
93
 
            return 0
 
121
    # Not supported by Google API: http://code.google.com/p/google-ajax-apis/issues/detail?id=24
 
122
    @match(r'^google(?:\.com?)?\.([a-z]{2})(?:\s+for)?\s+(.*)$')
 
123
    def country_search(self, event, country, terms):
 
124
        soup = self._google_scrape_search(terms, country)
 
125
 
 
126
        results = []
 
127
        items = soup.findAll('li')
 
128
        for item in items:
 
129
            try:
 
130
                url = item.a['href']
 
131
                title = u''.join([e.string for e in item.a.contents])
 
132
                if title.startswith("Image results for"):
 
133
                    continue
 
134
                results.append(u'"%s" %s' % (de_entity(title), url))
 
135
            except Exception:
 
136
                pass
 
137
            if len(results) >= 8:
 
138
                break
 
139
 
 
140
        if results:
 
141
            event.addresponse(u", ".join(results))
94
142
        else:
95
 
            results = soup.find('div', id='prs').nextSibling.contents[5].string.replace(',', '')
96
 
            if results:
97
 
                return int(results)
98
 
 
99
 
    @match(r'^google\s+cmp\s+(?:for\s+)?(.+?)\s+and\s+(.+?)$')
100
 
    def compare(self, event, term1, term2):
101
 
        count1 = self.results(term1)
102
 
        count2 = self.results(term2)
103
 
        event.addresponse(u'%s wins with %s hits, %s had %s hits' % (count1 > count2 and term1 or term2, count1 > count2 and count1 or count2, count1 > count2 and term2 or term1, count1 > count2 and count2 or count1))
 
143
            event.addresponse(u"Wow! Google couldn't find anything.")
104
144
 
105
145
# vi: set et sta sw=4 ts=4: