~robru/gwibber/foursquare-fixup

« back to all changes in this revision

Viewing changes to gwibber/gwibber/utils/download.py

Committer: Barry Warsaw
Date: 2012-09-14 20:29:11 UTC
Revision ID: barry@python.org-20120914202911-pyvq07821iqua9k8

Full support for RFC 4627 application/json implicit charset encodings. See §3
of the RFC for details.

files added:
gwibber/gwibber/tests/data/json-utf-16be.dat

gwibber/gwibber/tests/data/json-utf-16le.dat

gwibber/gwibber/tests/data/json-utf-32be.dat

gwibber/gwibber/tests/data/json-utf-32le.dat

gwibber/gwibber/tests/data/json-utf-8.dat

files modified:
gwibber/gwibber/testing/mocks.py

gwibber/gwibber/tests/test_download.py

gwibber/gwibber/utils/download.py

Show diffs side-by-side

added added

removed removed

gwibber/gwibber/utils/download.py

def get_json(self):
    """Interpret and return the results as JSON data.

    The payload is fetched with ``urlopen()`` on the request returned by
    ``self._download()``.  If the response declares a charset, that
    charset is used to decode the payload.  Otherwise the encoding is
    inferred from the pattern of null octets at the start of the
    payload, as described in RFC 4627 section 3, falling back to UTF-8.

    :return: The decoded JSON data, as returned by ``json.loads()``.
    """
    request = self._download()
    with urlopen(request) as result:
        payload = result.read()
        info = result.info()
    # RFC 4627 section 3.  JSON text SHALL be encoded in Unicode.  The
    # default encoding is UTF-8.  Since the first two characters of a JSON
    # text will always be ASCII characters [RFC0020], it is possible to
    # determine whether an octet stream is UTF-8, UTF-16 (BE or LE), or
    # UTF-32 (BE or LE) by looking at the pattern of nulls in the first
    # four octets.
    charset = info.get_content_charset()
    if charset is None:
        # UTF-8 is the default.  It also covers payloads shorter than four
        # octets (e.g. b'{}'), which cannot be unpacked into four octets,
        # and null patterns that match none of the cases below; without
        # this default, decode() would be handed None.
        charset = 'utf-8'
        head = payload[:4]
        if len(head) == 4 and 0 in head:
            octet_0, octet_1, octet_2, octet_3 = head
            if (octet_1 == octet_3 == 0) and octet_2 != 0:
                charset = 'utf-16le'
            elif (octet_0 == octet_2 == 0) and octet_1 != 0:
                charset = 'utf-16be'
            elif (octet_1 == octet_2 == octet_3 == 0):
                charset = 'utf-32le'
            elif (octet_0 == octet_1 == octet_2 == 0):
                charset = 'utf-32be'
    return json.loads(payload.decode(charset))

112

113

114

def get_json(*args, **kws):

Older »