~rhcarvalho/+junk/racket

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#lang racket
;; Explorations with decoding URLs %-encoded in Latin-1 rather than UTF-8
;;
;; Rodolfo Carvalho

(require net/url)

;; Portions of code taken from uri-codec-unit.rkt from the net/uri-codec collect.

;; ------------------------------------------------------------------

(define (self-map-char ch) (cons ch ch))
(define (self-map-chars str) (map self-map-char (string->list str)))

;; The characters that always map to themselves
(define alphanumeric-mapping
  (self-map-chars
   "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"))

;; Characters that sometimes map to themselves
;; called 'mark' in RFC 3986
(define safe-mapping (self-map-chars "-_.!~*'()"))

;; The strict URI mapping
(define uri-mapping (append alphanumeric-mapping safe-mapping))

;; ------------------------------------------------------------------

(define (number->hex-string number)
  (define (hex n) (string-ref "0123456789ABCDEF" n))
  (string #\% (hex (quotient number 16)) (hex (modulo number 16))))

(define (hex-string->number hex-string)
  (string->number (substring hex-string 1 3) 16))

(define ascii-size 128)

;; ------------------------------------------------------------------


;; (listof (cons char char)) -> (values (vectorof string) (vectorof string))
(define (make-codec-tables alist)
  (let ([encoding-table (build-vector ascii-size number->hex-string)]
        [decoding-table (build-vector ascii-size values)])
    (for-each (match-lambda
               [(cons orig enc)
                (vector-set! encoding-table
                             (char->integer orig)
                             (string enc))
                (vector-set! decoding-table
                             (char->integer enc)
                             (char->integer orig))])
              alist)
    (values encoding-table decoding-table)))

(define-values (uri-encoding-vector uri-decoding-vector)
  (make-codec-tables uri-mapping))

;; ------------------------------------------------------------------

;; vector string -> string
(define (decode table str)
  (define internal-decode
    (match-lambda [(list) (list)]
                  [(list* #\% (? hex-digit? char1) (? hex-digit? char2) rest)
                   ;; This used to consult the table again, but I think that's
                   ;;  wrong. For example %2b should produce +, not a space.
                   (cons (string->number (string char1 char2) 16)
                         (internal-decode rest))]
                  [(cons (? ascii-char? char) rest)
                   (cons (vector-ref table (char->integer char))
                         (internal-decode rest))]
                  [(cons char rest)
                   (append
                    (bytes->list (string->bytes/utf-8 (string char)))
                    (internal-decode rest))]))
  (bytes->string/latin-1 (apply bytes (internal-decode (string->list str)))))


(define (ascii-char? c)
  (< (char->integer c) ascii-size))

(define (hex-digit? c)
  (or (char<=? #\0 c #\9)
      (char<=? #\a c #\f)
      (char<=? #\A c #\F)))


;; string -> string
(define (uri-decode str)
  (decode uri-decoding-vector str))

;; ------------------------------------------------------------------

(define (force-utf-8/bytes bstr)
  (define c (bytes-open-converter "ISO-8859-1" "UTF-8"))
  (define-values (result-bstr src-read-amt status) (bytes-convert c bstr))
  (bytes-close-converter c)
  result-bstr)

(define (force-utf-8/string str)
  (define in (open-input-string str))
  (reencode-input-port in "ISO-8859-1")
  (port->string in))

(define current-url (string->url "http://www.ufrj.br"))
(define resource "http://www.ufrj.br/editais.php?tp=Acad%EAmicos&no=Cursos&idtp=4")
;(combine-url/relative current-url resource)
(uri-decode resource)