50.1.6
by Rodolfo Carvalho
Add today's experiments. |
1 |
#lang racket |
2 |
;; Explorations with decoding URLs %-encoded in Latin-1 rather than UTF-8
|
|
3 |
;;
|
|
4 |
;; Rodolfo Carvalho
|
|
5 |
||
6 |
(require net/url) |
|
7 |
||
8 |
;; Portions of code taken from uri-codec-unit.rkt from the net/uri-codec collect.
|
|
9 |
||
10 |
;; ------------------------------------------------------------------
|
|
11 |
||
12 |
(define (self-map-char ch) (cons ch ch)) |
|
13 |
(define (self-map-chars str) (map self-map-char (string->list str))) |
|
14 |
||
15 |
;; The characters that always map to themselves
|
|
16 |
(define alphanumeric-mapping |
|
17 |
(self-map-chars |
|
18 |
"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz")) |
|
19 |
||
20 |
;; Characters that sometimes map to themselves
|
|
21 |
;; called 'mark' in RFC 3986
|
|
22 |
(define safe-mapping (self-map-chars "-_.!~*'()")) |
|
23 |
||
24 |
;; The strict URI mapping
|
|
25 |
(define uri-mapping (append alphanumeric-mapping safe-mapping)) |
|
26 |
||
27 |
;; ------------------------------------------------------------------
|
|
28 |
||
29 |
(define (number->hex-string number) |
|
30 |
(define (hex n) (string-ref "0123456789ABCDEF" n)) |
|
31 |
(string #\% (hex (quotient number 16)) (hex (modulo number 16)))) |
|
32 |
||
33 |
(define (hex-string->number hex-string) |
|
34 |
(string->number (substring hex-string 1 3) 16)) |
|
35 |
||
36 |
(define ascii-size 128) |
|
37 |
||
38 |
;; ------------------------------------------------------------------
|
|
39 |
||
40 |
||
41 |
;; (listof (cons char char)) -> (values (vectorof string) (vectorof string))
|
|
42 |
(define (make-codec-tables alist) |
|
43 |
(let ([encoding-table (build-vector ascii-size number->hex-string)] |
|
44 |
[decoding-table (build-vector ascii-size values)]) |
|
45 |
(for-each (match-lambda |
|
46 |
[(cons orig enc) |
|
47 |
(vector-set! encoding-table |
|
48 |
(char->integer orig) |
|
49 |
(string enc)) |
|
50 |
(vector-set! decoding-table |
|
51 |
(char->integer enc) |
|
52 |
(char->integer orig))]) |
|
53 |
alist) |
|
54 |
(values encoding-table decoding-table))) |
|
55 |
||
56 |
(define-values (uri-encoding-vector uri-decoding-vector) |
|
57 |
(make-codec-tables uri-mapping)) |
|
58 |
||
59 |
;; ------------------------------------------------------------------
|
|
60 |
||
61 |
;; vector string -> string
|
|
62 |
(define (decode table str) |
|
63 |
(define internal-decode |
|
64 |
(match-lambda [(list) (list)] |
|
65 |
[(list* #\% (? hex-digit? char1) (? hex-digit? char2) rest) |
|
66 |
;; This used to consult the table again, but I think that's
|
|
67 |
;; wrong. For example %2b should produce +, not a space.
|
|
68 |
(cons (string->number (string char1 char2) 16) |
|
69 |
(internal-decode rest))] |
|
70 |
[(cons (? ascii-char? char) rest) |
|
71 |
(cons (vector-ref table (char->integer char)) |
|
72 |
(internal-decode rest))] |
|
73 |
[(cons char rest) |
|
74 |
(append |
|
75 |
(bytes->list (string->bytes/utf-8 (string char))) |
|
76 |
(internal-decode rest))])) |
|
77 |
(bytes->string/latin-1 (apply bytes (internal-decode (string->list str))))) |
|
78 |
||
79 |
||
80 |
(define (ascii-char? c) |
|
81 |
(< (char->integer c) ascii-size)) |
|
82 |
||
83 |
(define (hex-digit? c) |
|
84 |
(or (char<=? #\0 c #\9) |
|
85 |
(char<=? #\a c #\f) |
|
86 |
(char<=? #\A c #\F))) |
|
87 |
||
88 |
||
89 |
;; string -> string
|
|
90 |
(define (uri-decode str) |
|
91 |
(decode uri-decoding-vector str)) |
|
92 |
||
93 |
;; ------------------------------------------------------------------
|
|
94 |
||
95 |
(define (force-utf-8/bytes bstr) |
|
96 |
(define c (bytes-open-converter "ISO-8859-1" "UTF-8")) |
|
97 |
(define-values (result-bstr src-read-amt status) (bytes-convert c bstr)) |
|
98 |
(bytes-close-converter c) |
|
99 |
result-bstr) |
|
100 |
||
101 |
(define (force-utf-8/string str) |
|
102 |
(define in (open-input-string str)) |
|
103 |
(reencode-input-port in "ISO-8859-1") |
|
104 |
(port->string in)) |
|
105 |
||
106 |
(define current-url (string->url "http://www.ufrj.br")) |
|
107 |
(define resource "http://www.ufrj.br/editais.php?tp=Acad%EAmicos&no=Cursos&idtp=4") |
|
108 |
;(combine-url/relative current-url resource)
|
|
109 |
(uri-decode resource) |