6
from interPopula import Config
9
hapMapDir = Config.dataDir + os.sep + 'HapMap'
14
pass #Probably OK, dir already exists
15
dbConn = sqlite3.connect(hapMapDir + os.sep + 'db')
18
dbConn.execute('''CREATE TABLE individual (
26
except sqlite3.OperationalError, e:
28
pass #tables already exist...OK
31
dbConn.execute('''CREATE TABLE freq_snp (
33
population VARCHAR(20),
34
chromosome VARCHAR(2),
43
dbConn.execute('CREATE INDEX fq_snp_rs ON freq_snp(rs_id)')
44
dbConn.execute('CREATE INDEX fq_snp_pop ON freq_snp(population)')
45
dbConn.execute('CREATE INDEX fq_snp_chr ON freq_snp(chromosome)')
46
except sqlite3.OperationalError, e:
48
pass #tables already exist...OK
51
dbConn.execute('''CREATE TABLE geno_snp (
53
population VARCHAR(20),
54
chromosome VARCHAR(2),
59
dbConn.execute('CREATE INDEX gn_snp_rs ON geno_snp(rs_id)')
60
dbConn.execute('CREATE INDEX gn_snp_pop ON geno_snp(population)')
61
dbConn.execute('CREATE INDEX gn_snp_chr ON geno_snp(chromosome)')
62
dbConn.execute('CREATE INDEX gn_snp_cs_pop ON geno_snp(chromosome, population)')
63
except sqlite3.OperationalError, e:
65
pass #tables already exist...OK
71
"""Database of map between indivs and pops and pedigree
75
self.dbConn = prepareDataDir()
77
def get(self, forceLoad = False):
80
This will retrieve data from HapMap and load it on the database.
81
Load might fail, repeating might be a possibility.
83
hapMapDir = Config.dataDir + os.sep + 'HapMap'
84
if forceLoad or (len(self.getPops()) == 0):
85
tempHMFile = hapMapDir + os.sep + 'tmp_i.txt'
89
pass #Doesn't exist -> OK
90
ftp = FTP('ftp.ncbi.nlm.nih.gov')
91
ftpDir = 'hapmap/samples_individuals'
93
flist = ftp.nlst(ftpDir)
96
#XXX: if we dont fetch nothing we should halt and report
97
if fname.find("relation") > -1:
98
print 'Fetching' , fname
99
ftp.retrbinary('RETR ' + fname,
100
open(tempHMFile, 'wb').write)
104
self.dbConn.execute("DELETE FROM individual")
105
f = open(tempHMFile, 'r')
106
l = f.readline() #we jump the first
110
toks = l.rstrip().split('\t')
116
gender = int(toks[4])
119
self.dbConn.execute('''
120
INSERT INTO individual (
121
fid, iid, dad, mom, gender, pop)
122
VALUES (?,?,?,?,?,?)''',
123
(fid, iid, dad, mom, gender, pop))
129
'''Cleans the database. Caution...
131
self.dbConn.execute('DELETE FROM individual')
134
'''Removes tables from the database. Caution...
136
self.dbConn.execute('DROP TABLE individual')
139
"""Get the list of available pops."""
140
c = self.dbConn.cursor()
149
def getIndivsForPop(self, pop):
150
'''Gets a list of individuals for a pop
151
format: fid, iid, dad, mom, gender
153
c = self.dbConn.cursor()
155
SELECT fid, iid, dad, mom, gender
157
WHERE pop = ?''', (pop))
163
def getOfsForPop(self, pop):
164
'''Gets a list of offspring for a pop
167
c = self.dbConn.cursor()
171
WHERE dad IS NOT NULL
172
AND pop = ?''', (pop))
180
'''Closes the database.
182
self.dbConn.execute('VACUUM')
188
"""Database of frequency/genotype HapMap data.
193
self.dbConn = prepareDataDir()
195
def getPopsForChr(self, chr):
196
'''Check if database has a certain chromosome. Returns list of pops.
199
c = self.dbConn.cursor()
201
SELECT DISTINCT population
203
WHERE chromosome = ?''', chr)
208
def getChrsForPop(self, pop):
209
'''Check if database has a certain population. Returns list of chrs.
212
c = self.dbConn.cursor()
216
WHERE population = ?''', pop)
221
def hasChrPop(self, chr, pop):
222
'''Check if database has a certain chromosome and population.
224
c = self.dbConn.cursor()
225
#c.execute('select * from freq_snp')
232
AND population = ?''', (chr, pop))
240
def requireChrPop(self, chr, pop, forceLoad = False):
241
"""Require a chromosome and population
243
self.requireChrPopFreq(chr, pop, forceLoad)
244
self.requireChrPopGeno(chr, pop, forceLoad)
246
def requireChrPopFreq(self, chr, pop, forceLoad = False):
247
'''Requires a chromosome and population freq.
249
This will retrieve data from HapMap and load it on the database.
250
Hogs network and takes time and disk space!
251
Load might fail, repeating might be a possibility.
253
hapMapDir = Config.dataDir + os.sep + 'HapMap'
254
if forceLoad or (not self.hasChrPop(chr, pop)):
255
tempHMFile = hapMapDir + os.sep + 'tmp_hp.gz'
257
os.remove(tempHMFile)
259
pass #Doesn't exist -> OK
260
print 'Will load', chr, pop, 'NOTE: This will take a lot of time'
261
ftp = FTP('ftp.hapmap.org')
262
ftpDir = 'hapmap/frequencies/latest/forward/non-redundant'
264
flist = ftp.nlst(ftpDir)
265
#XXX: We are loading fwd_strand, non redundant... report on paper
268
#XXX: if we dont fetch nothing we should halt and report
269
if fname.find('_chr' + chr + '_' + pop) > -1:
270
print 'Fetching' , fname
271
ftp.retrbinary('RETR ' + fname,
272
open(tempHMFile, 'wb').write)
274
gz = gzip.open(tempHMFile, 'rb')
275
l = gz.readline() #we jump the first
277
print 'Injecting data'
278
self.dbConn.execute('''
282
AND population = ?''', (chr, pop))
284
toks = l.rstrip().split(' ')
286
rs_id = int(toks[0][2:])
291
homo1 = int(toks[12])
292
hetero = int(toks[15])
293
homo2 = int(toks[18])
294
self.dbConn.execute('''
295
INSERT INTO freq_snp (
296
rs_id, population, chromosome, position, strand,
297
nuc1, nuc2, homo1, hetero, homo2)
298
VALUES (?,?,?,?,?,?,?,?,?,?)''',
299
(rs_id, pop, chr, pos, strand,
300
nuc1, nuc2, homo1, hetero, homo2))
305
def requireChrPopGeno(self, chr, pop, forceLoad = False):
306
'''Requires a chromosome and population geno.
308
This will retrieve data from HapMap and load it on the database.
309
Hogs network and takes time and disk space!
310
Load might fail, repeating might be a possibility.
312
hapMapDir = Config.dataDir + os.sep + 'HapMap'
313
if forceLoad or (not self.hasChrPopGeno(chr, pop)):
314
tempHMFile = hapMapDir + os.sep + 'tmp_gn.gz'
316
os.remove(tempHMFile)
318
pass #Doesn't exist -> OK
319
print 'Will load geno ', chr, pop, 'NOTE: will take a lot of time'
320
ftp = FTP('ftp.hapmap.org')
321
ftpDir = 'hapmap/genotypes/latest/forward/non-redundant'
323
flist = ftp.nlst(ftpDir)
324
#XXX: We are loading fwd_strand, non redundant... report on paper
327
#XXX: if we dont fetch nothing we should halt and report
328
if fname.find('_chr' + chr + '_' + pop) > -1:
329
print 'Fetching' , fname
330
ftp.retrbinary('RETR ' + fname,
331
open(tempHMFile, 'wb').write)
333
gz = gzip.open(tempHMFile, 'rb')
334
l = gz.readline() #we get indivs
335
indivs = l.rstrip().split(' ')[11:]
337
print 'Injecting data'
338
self.dbConn.execute('''
342
AND population = ?''', (chr, pop))
344
toks = l.rstrip().split(' ')
346
rs_id = int(toks[0][2:])
348
for i in range(len(indivs)):
351
self.dbConn.execute('''
352
INSERT INTO geno_snp (
353
rs_id, population, chromosome, iid, nuc1, nuc2)
354
VALUES (?,?,?,?,?,?)''',
355
(rs_id, pop, chr, indivs[i], nuc1, nuc2))
362
def requirePop(self, pop):
363
'''Requires a population, all chromosomes.
365
This will retrieve data from HapMap and load it on the database.
366
Hogs network and takes time and disk space!
367
Load might fail, repeating might be a possibility.
369
for chr in range(1, 23):
370
self.requireChrPop(str(chr), pop)
371
self.requireChrPop('X', pop)
372
self.requireChrPop('Y', pop)
374
def requireChr(self, chr, pop_list):
375
'''Requires a chromosome for specified populations.
377
This will retrieve data from HapMap and load it on the database.
379
We let the user decide which populations to load, mainly because
380
we don't know between (JPT and CHB) or JPT+CHB.
382
Hogs network and takes time and disk space!
384
Load might fail, repeating might be a possibility.
387
self.requireChrPop(chr, pop)
390
'''Cleans the database. Caution...
392
self.dbConn.execute('DELETE FROM freq_snp')
393
self.dbConn.execute('DELETE FROM geno_snp')
396
'''Removes tables from the database. Caution...
398
self.dbConn.execute('DROP TABLE freq_snp')
399
self.dbConn.execute('DROP TABLE geno_snp')
401
def getRSsForInterval(self, chr, begin, end):
402
'''Gets a list of rs_ids for an interval (limits included)
404
c = self.dbConn.cursor()
406
SELECT DISTINCT rs_id
410
AND position <= ?''', (chr, begin, end))
413
rs_list.append(rs[0])
416
def getSNPs(self, rs_id):
417
'''Returns a list of SNPs for an rs_id (1 per pop with that SNP).
420
@return: List of SNPs.
422
c = self.dbConn.execute('''
425
WHERE rs_id = ?''', (rs_id,))
432
'''Closes the database.
434
self.dbConn.execute('VACUUM')
437
def hasChrPopGeno(self, chr, pop):
438
'''Check if database has a certain chromosome and population (geno).
440
c = self.dbConn.cursor()
445
AND population = ?''', (chr, pop))
453
def getGenepop(self, f, rsIds, pops, allowOfs = True, doMale = True):
454
"""Generate genepop file
455
allowOfs - allow offspring
456
doMale - if chr X, do Male or Female. Male is haploid
460
f.write("Data for " + str(pops) + "\n")
461
c = self.dbConn.cursor()
467
SELECT chromosome, position
469
WHERE rs_id = ?''', (rsId,))
470
chro, pos = iter(c).next()
472
f.write("%s/%s/%d\n" % (rsId, chro, pos))
475
c = self.dbConn.cursor()
477
c.execute("select iid, gender from individual where pop=?", (pop))
479
c.execute("select iid, gender from individual where pop=? and dad=0 and mom=0",
481
for iid, gender in c:
482
chro = rsType[rsIds[0]] #wrong
483
if chro != "X" or (gender==1 and doMale) or (gender==2 and not doMale): f.write("%s," % (iid))
485
c = self.dbConn.cursor()
486
c.execute("select nuc1, nuc2 from geno_snp where iid=? and rs_id=?",(iid, rsId))
488
n1, n2 = iter(c).next()
489
except StopIteration, e:
493
f.write(" %s%s" % (self.gp(n1), self.gp(n2)))
494
elif doMale and gender==1:
495
f.write(" %s" % (self.gp(n1)))
496
elif (not doMale) and gender==2:
497
f.write(" %s%s" % (self.gp(n1), self.gp(n2)))
498
if chro != "X" or (gender==1 and doMale) or (gender==2 and not doMale): f.write("\n")
501
if st =="N": return "00"
502
if st =="A": return "01"
503
if st =="C": return "02"
504
if st =="T": return "03"
505
if st =="G": return "04"