2
;; Explorations with decoding URLs %-encoded in Latin-1 rather than UTF-8
8
;; Portions of code taken from uri-codec-unit.rkt from the net/uri-codec collect.
10
;; ------------------------------------------------------------------
12
(define (self-map-char ch) (cons ch ch))
13
(define (self-map-chars str) (map self-map-char (string->list str)))
15
;; The characters that always map to themselves
16
(define alphanumeric-mapping
18
"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"))
20
;; Characters that sometimes map to themselves
21
;; called 'mark' in RFC 3986
22
(define safe-mapping (self-map-chars "-_.!~*'()"))
24
;; The strict URI mapping
25
(define uri-mapping (append alphanumeric-mapping safe-mapping))
27
;; ------------------------------------------------------------------
29
(define (number->hex-string number)
30
(define (hex n) (string-ref "0123456789ABCDEF" n))
31
(string #\% (hex (quotient number 16)) (hex (modulo number 16))))
33
(define (hex-string->number hex-string)
34
(string->number (substring hex-string 1 3) 16))
36
(define ascii-size 128)
38
;; ------------------------------------------------------------------
41
;; (listof (cons char char)) -> (values (vectorof string) (vectorof string))
42
(define (make-codec-tables alist)
43
(let ([encoding-table (build-vector ascii-size number->hex-string)]
44
[decoding-table (build-vector ascii-size values)])
45
(for-each (match-lambda
47
(vector-set! encoding-table
50
(vector-set! decoding-table
52
(char->integer orig))])
54
(values encoding-table decoding-table)))
56
(define-values (uri-encoding-vector uri-decoding-vector)
57
(make-codec-tables uri-mapping))
59
;; ------------------------------------------------------------------
61
;; vector string -> string
62
(define (decode table str)
63
(define internal-decode
64
(match-lambda [(list) (list)]
65
[(list* #\% (? hex-digit? char1) (? hex-digit? char2) rest)
66
;; This used to consult the table again, but I think that's
67
;; wrong. For example %2b should produce +, not a space.
68
(cons (string->number (string char1 char2) 16)
69
(internal-decode rest))]
70
[(cons (? ascii-char? char) rest)
71
(cons (vector-ref table (char->integer char))
72
(internal-decode rest))]
75
(bytes->list (string->bytes/utf-8 (string char)))
76
(internal-decode rest))]))
77
(bytes->string/latin-1 (apply bytes (internal-decode (string->list str)))))
80
(define (ascii-char? c)
81
(< (char->integer c) ascii-size))
83
(define (hex-digit? c)
84
(or (char<=? #\0 c #\9)
90
(define (uri-decode str)
91
(decode uri-decoding-vector str))
93
;; ------------------------------------------------------------------
95
(define (force-utf-8/bytes bstr)
96
(define c (bytes-open-converter "ISO-8859-1" "UTF-8"))
97
(define-values (result-bstr src-read-amt status) (bytes-convert c bstr))
98
(bytes-close-converter c)
101
(define (force-utf-8/string str)
102
(define in (open-input-string str))
103
(reencode-input-port in "ISO-8859-1")
106
(define current-url (string->url "http://www.ufrj.br"))
107
(define resource "http://www.ufrj.br/editais.php?tp=Acad%EAmicos&no=Cursos&idtp=4")
108
;(combine-url/relative current-url resource)
109
(uri-decode resource)
b'\\ No newline at end of file'