3
Copyright (c) 2008 by Genome Research Ltd (GRL).
4
2010 by Attractive Chaos <attractor@live.co.uk>
6
Permission is hereby granted, free of charge, to any person obtaining
7
a copy of this software and associated documentation files (the
8
"Software"), to deal in the Software without restriction, including
9
without limitation the rights to use, copy, modify, merge, publish,
10
distribute, sublicense, and/or sell copies of the Software, and to
11
permit persons to whom the Software is furnished to do so, subject to
12
the following conditions:
14
The above copyright notice and this permission notice shall be
15
included in all copies or substantial portions of the Software.
17
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
21
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
22
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
23
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27
/* Probably I will not do socket programming in the next few years and
28
therefore I decide to heavily annotate this file, for Linux and
29
Windows as well. -ac */
38
#include <sys/types.h>
42
#include <arpa/inet.h>
43
#include <sys/socket.h>
48
/* In winsock.h, the type of a socket is SOCKET, which is: "typedef
49
* u_int SOCKET". An invalid SOCKET is: "(SOCKET)(~0)", or signed
50
* integer -1. In knetfile.c, I use "int" for socket type
51
* throughout. This should be improved to avoid confusion.
53
* In Linux/Mac, recv() and read() do almost the same thing. You can see
54
* in the header file that netread() is simply an alias of read(). In
55
* Windows, however, they are different and using recv() is mandatory.
58
/* This function tests if the file descriptor is ready for reading (or
59
* writing if is_read==0). */
60
static int socket_wait(int fd, int is_read)
62
fd_set fds, *fdr = 0, *fdw = 0;
65
tv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out
68
if (is_read) fdr = &fds;
70
ret = select(fd+1, fdr, fdw, 0, &tv);
72
if (ret == -1) perror("select");
75
fprintf(stderr, "select time-out\n");
76
else if (ret == SOCKET_ERROR)
77
fprintf(stderr, "select: %d\n", WSAGetLastError());
83
/* This function does not work with Windows due to the lack of
84
* getaddrinfo() in winsock. It is adapted from an example in "Beej's
85
* Guide to Network Programming" (http://beej.us/guide/bgnet/). */
86
static int socket_connect(const char *host, const char *port)
88
#define __err_connect(func) do { perror(func); freeaddrinfo(res); return -1; } while (0)
91
struct linger lng = { 0, 0 };
92
struct addrinfo hints, *res = 0;
93
memset(&hints, 0, sizeof(struct addrinfo));
94
hints.ai_family = AF_UNSPEC;
95
hints.ai_socktype = SOCK_STREAM;
96
/* In Unix/Mac, getaddrinfo() is the most convenient way to get
97
* server information. */
98
if (getaddrinfo(host, port, &hints, &res) != 0) __err_connect("getaddrinfo");
99
if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket");
100
/* The following two setsockopt() are used by ftplib
101
* (http://nbpfaus.net/~pfau/ftplib/). I am not sure if they
103
if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt");
104
if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_connect("setsockopt");
105
if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) __err_connect("connect");
110
/* MinGW's printf has problems with "%lld" */
111
char *int64tostr(char *buf, int64_t x)
116
buf[i++] = '0' + x % 10;
120
for (cnt = i, i = 0; i < cnt/2; ++i) {
121
int c = buf[i]; buf[i] = buf[cnt-i-1]; buf[cnt-i-1] = c;
126
int64_t strtoint64(const char *buf)
129
for (x = 0; *buf != '\0'; ++buf)
130
x = x * 10 + ((int64_t) *buf - 48);
133
/* On Windows, Winsock must be initialized with WSAStartup() before any socket is used. */
134
int knet_win32_init()
137
return WSAStartup(MAKEWORD(2, 2), &wsaData);
139
void knet_win32_destroy()
143
/* A slightly modified version of the following function also works on
144
* Mac (and presumably Linux). However, this function is not stable on
145
* my Mac. It sometimes works fine but sometimes does not. Therefore for
146
* non-Windows OS, I do not use this one. */
147
static SOCKET socket_connect(const char *host, const char *port)
149
#define __err_connect(func) \
151
fprintf(stderr, "%s: %d\n", func, WSAGetLastError()); \
157
struct linger lng = { 0, 0 };
158
struct sockaddr_in server;
159
struct hostent *hp = 0;
161
if ((fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == INVALID_SOCKET) __err_connect("socket");
162
if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char*)&on, sizeof(on)) == -1) __err_connect("setsockopt");
163
if (setsockopt(fd, SOL_SOCKET, SO_LINGER, (char*)&lng, sizeof(lng)) == -1) __err_connect("setsockopt");
165
if (isalpha(host[0])) hp = gethostbyname(host);
168
addr.s_addr = inet_addr(host);
169
hp = gethostbyaddr((char*)&addr, 4, AF_INET);
171
if (hp == 0) __err_connect("gethost");
173
server.sin_addr.s_addr = *((unsigned long*)hp->h_addr);
174
server.sin_family= AF_INET;
175
server.sin_port = htons(atoi(port));
176
if (connect(fd, (struct sockaddr*)&server, sizeof(server)) != 0) __err_connect("connect");
177
// freehostent(hp); // strangely in MSDN, hp is NOT freed (memory leak?!)
182
static off_t my_netread(int fd, void *buf, off_t len)
184
off_t rest = len, curr, l = 0;
185
/* recv() and read() may not read the required length of data with
186
* one call. They have to be called repeatedly. */
188
if (socket_wait(fd, 1) <= 0) break; // socket is not ready for reading
189
curr = netread(fd, buf + l, rest);
190
/* According to the glibc manual, section 13.2, a zero returned
191
* value indicates end-of-file (EOF), which should mean that
192
* read() will not return zero if EOF has not been met but data
193
* are not immediately available. */
194
if (curr == 0) break;
195
l += curr; rest -= curr;
200
/*************************
201
* FTP specific routines *
202
*************************/
204
static int kftp_get_response(knetFile *ftp)
213
if (socket_wait(ftp->ctrl_fd, 1) <= 0) return 0;
214
while (netread(ftp->ctrl_fd, &c, 1)) { // FIXME: this is *VERY BAD* for unbuffered I/O
216
if (n >= ftp->max_response) {
217
ftp->max_response = ftp->max_response? ftp->max_response<<1 : 256;
218
ftp->response = realloc(ftp->response, ftp->max_response);
220
ftp->response[n++] = c;
222
if (n >= 4 && isdigit(ftp->response[0]) && isdigit(ftp->response[1]) && isdigit(ftp->response[2])
223
&& ftp->response[3] != '-') break;
228
if (n < 2) return -1;
229
ftp->response[n-2] = 0;
230
return strtol(ftp->response, &p, 0);
233
static int kftp_send_cmd(knetFile *ftp, const char *cmd, int is_get)
235
if (socket_wait(ftp->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing
236
netwrite(ftp->ctrl_fd, cmd, strlen(cmd));
237
return is_get? kftp_get_response(ftp) : 0;
240
static int kftp_pasv_prep(knetFile *ftp)
244
kftp_send_cmd(ftp, "PASV\r\n", 1);
245
for (p = ftp->response; *p && *p != '('; ++p);
246
if (*p != '(') return -1;
248
sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]);
249
memcpy(ftp->pasv_ip, v, 4 * sizeof(int));
250
ftp->pasv_port = (v[4]<<8&0xff00) + v[5];
255
static int kftp_pasv_connect(knetFile *ftp)
257
char host[80], port[10];
258
if (ftp->pasv_port == 0) {
259
fprintf(stderr, "[kftp_pasv_connect] kftp_pasv_prep() is not called before hand.\n");
262
sprintf(host, "%d.%d.%d.%d", ftp->pasv_ip[0], ftp->pasv_ip[1], ftp->pasv_ip[2], ftp->pasv_ip[3]);
263
sprintf(port, "%d", ftp->pasv_port);
264
ftp->fd = socket_connect(host, port);
265
if (ftp->fd == -1) return -1;
269
int kftp_connect(knetFile *ftp)
271
ftp->ctrl_fd = socket_connect(ftp->host, ftp->port);
272
if (ftp->ctrl_fd == -1) return -1;
273
kftp_get_response(ftp);
274
kftp_send_cmd(ftp, "USER anonymous\r\n", 1);
275
kftp_send_cmd(ftp, "PASS kftp@\r\n", 1);
276
kftp_send_cmd(ftp, "TYPE I\r\n", 1);
280
int kftp_reconnect(knetFile *ftp)
282
if (ftp->ctrl_fd != -1) {
283
netclose(ftp->ctrl_fd);
288
return kftp_connect(ftp);
291
// initialize ->type, ->host, ->port, ->retr and ->size_cmd
292
knetFile *kftp_parse_url(const char *fn, const char *mode)
297
if (strstr(fn, "ftp://") != fn) return 0;
298
for (p = (char*)fn + 6; *p && *p != '/'; ++p);
299
if (*p != '/') return 0;
301
fp = calloc(1, sizeof(knetFile));
302
fp->type = KNF_TYPE_FTP;
304
/* the Linux/Mac version of socket_connect() also recognizes a port
305
* like "ftp", but the Windows version does not. */
306
fp->port = strdup("21");
307
fp->host = calloc(l + 1, 1);
308
if (strchr(mode, 'c')) fp->no_reconnect = 1;
309
strncpy(fp->host, fn + 6, l);
310
fp->retr = calloc(strlen(p) + 8, 1);
311
sprintf(fp->retr, "RETR %s\r\n", p);
312
fp->size_cmd = calloc(strlen(p) + 8, 1);
313
sprintf(fp->size_cmd, "SIZE %s\r\n", p);
317
// place ->fd at offset off
318
int kftp_connect_file(knetFile *fp)
324
if (fp->no_reconnect) kftp_get_response(fp);
327
kftp_send_cmd(fp, fp->size_cmd, 1);
329
if ( sscanf(fp->response,"%*d %lld", &file_size) != 1 )
331
fprintf(stderr,"[kftp_connect_file] %s\n", fp->response);
335
const char *p = fp->response;
336
while (*p != ' ') ++p;
337
while (*p < '0' || *p > '9') ++p;
338
file_size = strtoint64(p);
340
fp->file_size = file_size;
344
sprintf(tmp, "REST %lld\r\n", (long long)fp->offset);
346
strcpy(tmp, "REST ");
347
int64tostr(tmp + 5, fp->offset);
350
kftp_send_cmd(fp, tmp, 1);
352
kftp_send_cmd(fp, fp->retr, 0);
353
kftp_pasv_connect(fp);
354
ret = kftp_get_response(fp);
356
fprintf(stderr, "[kftp_connect_file] %s\n", fp->response);
366
/**************************
367
* HTTP specific routines *
368
**************************/
370
knetFile *khttp_parse_url(const char *fn, const char *mode)
375
if (strstr(fn, "http://") != fn) return 0;
377
for (p = (char*)fn + 7; *p && *p != '/'; ++p);
379
fp = calloc(1, sizeof(knetFile));
380
fp->http_host = calloc(l + 1, 1);
381
strncpy(fp->http_host, fn + 7, l);
382
fp->http_host[l] = 0;
383
for (q = fp->http_host; *q && *q != ':'; ++q);
384
if (*q == ':') *q++ = 0;
386
proxy = getenv("http_proxy");
387
// set ->host, ->port and ->path
389
fp->host = strdup(fp->http_host); // when there is no proxy, server name is identical to http_host name.
390
fp->port = strdup(*q? q : "80");
391
fp->path = strdup(*p? p : "/");
393
fp->host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy);
394
for (q = fp->host; *q && *q != ':'; ++q);
395
if (*q == ':') *q++ = 0;
396
fp->port = strdup(*q? q : "80");
397
fp->path = strdup(fn);
399
fp->type = KNF_TYPE_HTTP;
400
fp->ctrl_fd = fp->fd = -1;
405
int khttp_connect_file(knetFile *fp)
409
if (fp->fd != -1) netclose(fp->fd);
410
fp->fd = socket_connect(fp->host, fp->port);
411
buf = calloc(0x10000, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough.
412
l += sprintf(buf + l, "GET %s HTTP/1.0\r\nHost: %s\r\n", fp->path, fp->http_host);
413
l += sprintf(buf + l, "Range: bytes=%lld-\r\n", (long long)fp->offset);
414
l += sprintf(buf + l, "\r\n");
415
netwrite(fp->fd, buf, l);
417
while (netread(fp->fd, buf + l, 1)) { // read HTTP header; FIXME: bad efficiency
418
if (buf[l] == '\n' && l >= 3)
419
if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break;
423
if (l < 14) { // prematured header
428
ret = strtol(buf + 8, &p, 0); // HTTP return code
429
if (ret == 200 && fp->offset>0) { // 200 (complete result); then skip beginning of the file
430
off_t rest = fp->offset;
432
off_t l = rest < 0x10000? rest : 0x10000;
433
rest -= my_netread(fp->fd, buf, l);
435
} else if (ret != 206 && ret != 200) {
437
fprintf(stderr, "[khttp_connect_file] fail to open file (HTTP code: %d).\n", ret);
447
/********************
449
********************/
451
knetFile *knet_open(const char *fn, const char *mode)
454
if (mode[0] != 'r') {
455
fprintf(stderr, "[kftp_open] only mode \"r\" is supported.\n");
458
if (strstr(fn, "ftp://") == fn) {
459
fp = kftp_parse_url(fn, mode);
460
if (fp == 0) return 0;
461
if (kftp_connect(fp) == -1) {
465
kftp_connect_file(fp);
466
} else if (strstr(fn, "http://") == fn) {
467
fp = khttp_parse_url(fn, mode);
468
if (fp == 0) return 0;
469
khttp_connect_file(fp);
470
} else { // local file
472
/* In windows, O_BINARY is necessary. In Linux/Mac, O_BINARY may
473
* be undefined on some systems, although it is defined on my
474
* Mac and the Linux I have tested on. */
475
int fd = open(fn, O_RDONLY | O_BINARY);
477
int fd = open(fn, O_RDONLY);
483
fp = (knetFile*)calloc(1, sizeof(knetFile));
484
fp->type = KNF_TYPE_LOCAL;
488
if (fp && fp->fd == -1) {
495
knetFile *knet_dopen(int fd, const char *mode)
497
knetFile *fp = (knetFile*)calloc(1, sizeof(knetFile));
498
fp->type = KNF_TYPE_LOCAL;
503
off_t knet_read(knetFile *fp, void *buf, off_t len)
506
if (fp->fd == -1) return 0;
507
if (fp->type == KNF_TYPE_FTP) {
508
if (fp->is_ready == 0) {
509
if (!fp->no_reconnect) kftp_reconnect(fp);
510
kftp_connect_file(fp);
512
} else if (fp->type == KNF_TYPE_HTTP) {
513
if (fp->is_ready == 0)
514
khttp_connect_file(fp);
516
if (fp->type == KNF_TYPE_LOCAL) { // on Windows, the following block is necessary; not on UNIX
517
off_t rest = len, curr;
520
curr = read(fp->fd, buf + l, rest);
521
} while (curr < 0 && EINTR == errno);
522
if (curr < 0) return -1;
523
if (curr == 0) break;
524
l += curr; rest -= curr;
526
} else l = my_netread(fp->fd, buf, len);
531
off_t knet_seek(knetFile *fp, int64_t off, int whence)
533
if (whence == SEEK_SET && off == fp->offset) return 0;
534
if (fp->type == KNF_TYPE_LOCAL) {
535
/* Be aware that lseek() returns the offset after seeking,
536
* while fseek() returns zero on success. */
537
off_t offset = lseek(fp->fd, off, whence);
539
// Be silent, it is OK for knet_seek to fail when the file is streamed
540
// fprintf(stderr,"[knet_seek] %s\n", strerror(errno));
546
else if (fp->type == KNF_TYPE_FTP)
548
if (whence==SEEK_CUR)
550
else if (whence==SEEK_SET)
552
else if ( whence==SEEK_END)
553
fp->offset = fp->file_size+off;
557
else if (fp->type == KNF_TYPE_HTTP)
559
if (whence == SEEK_END) { // FIXME: can we allow SEEK_END in future?
560
fprintf(stderr, "[knet_seek] SEEK_END is not supported for HTTP. Offset is unchanged.\n");
564
if (whence==SEEK_CUR)
566
else if (whence==SEEK_SET)
572
fprintf(stderr,"[knet_seek] %s\n", strerror(errno));
576
int knet_close(knetFile *fp)
578
if (fp == 0) return 0;
579
if (fp->ctrl_fd != -1) netclose(fp->ctrl_fd); // FTP specific
581
/* On Linux/Mac, netclose() is an alias of close(), but on
582
* Windows, it is an alias of closesocket(). */
583
if (fp->type == KNF_TYPE_LOCAL) close(fp->fd);
584
else netclose(fp->fd);
586
free(fp->host); free(fp->port);
587
free(fp->response); free(fp->retr); // FTP specific
588
free(fp->path); free(fp->http_host); // HTTP specific
602
buf = calloc(0x100000, 1);
604
fp = knet_open("knetfile.c", "r");
605
knet_seek(fp, 1000, SEEK_SET);
606
} else if (type == 1) { // NCBI FTP, large file
607
fp = knet_open("ftp://ftp.ncbi.nih.gov/1000genomes/ftp/data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam", "r");
608
knet_seek(fp, 2500000000ll, SEEK_SET);
609
l = knet_read(fp, buf, 255);
610
} else if (type == 2) {
611
fp = knet_open("ftp://ftp.sanger.ac.uk/pub4/treefam/tmp/index.shtml", "r");
612
knet_seek(fp, 1000, SEEK_SET);
613
} else if (type == 3) {
614
fp = knet_open("http://www.sanger.ac.uk/Users/lh3/index.shtml", "r");
615
knet_seek(fp, 1000, SEEK_SET);
616
} else if (type == 4) {
617
fp = knet_open("http://www.sanger.ac.uk/Users/lh3/ex1.bam", "r");
618
knet_read(fp, buf, 10000);
619
knet_seek(fp, 20000, SEEK_SET);
620
knet_seek(fp, 10000, SEEK_SET);
621
l = knet_read(fp, buf+10000, 10000000) + 10000;
623
if (type != 4 && type != 1) {
624
knet_read(fp, buf, 255);
627
} else write(fileno(stdout), buf, l);