~ubuntu-branches/ubuntu/utopic/r-bioc-cummerbund/utopic

« back to all changes in this revision

Viewing changes to R/database-setup.R

  • Committer: Package Import Robot
  • Author(s): Andreas Tille
  • Date: 2013-12-28 17:17:25 UTC
  • mfrom: (1.1.2)
  • Revision ID: package-import@ubuntu.com-20131228171725-polmzo8go4m371c6
Tags: 2.4.1-1
* New upstream version
* debian/rules: Remove useless creation of ${R-Depends}
* debian/control: Versioned Build-Depends: r-base-dev (>= 3.0)
* debian/README.test: add hint how to test the package

Show diffs side-by-side

added added

removed removed

Lines of Context:
7
7
#####################
8
8
#File Archetype parsing
9
9
#####################
 
10
 
 
11
#RunInfo
 
12
loadRunInfo<-function(runInfoFile,
 
13
                dbConn,
 
14
                path,
 
15
                fileArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
 
16
                sep="\t",
 
17
                na.string = "-",
 
18
                header = TRUE,
 
19
                quote = "",
 
20
                stringsAsFactors=FALSE,
 
21
                row.names=NULL,
 
22
                ...) {
 
23
        
 
24
        #Setup and reporting
 
25
        write(paste("Reading Run Info File ",runInfoFile,sep=""),stderr())
 
26
        fileArgs$file = runInfoFile
 
27
        
 
28
        #Read Run Info file
 
29
        runInfo = as.data.frame(do.call(read.table,fileArgs))
 
30
        
 
31
        #Parsing
 
32
        #not needed...
 
33
        
 
34
        #Load into database (runInfo table)
 
35
        write("Writing runInfo Table",stderr())
 
36
        insert_SQL<-'INSERT INTO runInfo VALUES(:param, :value)'
 
37
        bulk_insert(dbConn,insert_SQL,runInfo)
 
38
        
 
39
}
 
40
 
 
41
#ReplicateTable
 
42
loadRepTable<-function(repTableFile,
 
43
                dbConn,
 
44
                path,
 
45
                fileArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
 
46
                sep="\t",
 
47
                na.string = "-",
 
48
                header = TRUE,
 
49
                quote = "",
 
50
                stringsAsFactors=FALSE,
 
51
                row.names=NULL,
 
52
                ...) {
 
53
                
 
54
        #Setup and reporting
 
55
        write(paste("Reading Read Group Info  ",repTableFile,sep=""),stderr())
 
56
        fileArgs$file = repTableFile
 
57
        
 
58
        #Read Run Info file
 
59
        full = as.data.frame(do.call(read.table,fileArgs))
 
60
        #print(head(full))
 
61
        
 
62
        #Fix sample_names
 
63
        full$condition<-make.db.names(dbConn,as.character(full$condition),unique=FALSE)
 
64
        
 
65
        #Parsing
 
66
        #For now, I need to concatenate condition and replicate number
 
67
        full$rep_name<-paste(full$condition,full$replicate_num,sep="_")
 
68
        
 
69
        #Load into database (replicates table)
 
70
        write("Writing replicates Table",stderr())
 
71
        insert_SQL<-'INSERT INTO replicates VALUES(:file, :condition, :replicate_num, :rep_name, :total_mass, :norm_mass, :internal_scale, :external_scale)'
 
72
        bulk_insert(dbConn,insert_SQL,full)
 
73
}
 
74
 
10
75
#Genes
11
76
loadGenes<-function(fpkmFile,
12
77
                diffFile,
13
78
                promoterFile,
 
79
                countFile,
 
80
                replicateFile,
14
81
                dbConn,
15
82
                path,
16
83
                #Arguments to read.* methods
17
84
                fpkmArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
18
85
                diffArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
19
86
                promoterArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
 
87
                countArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
 
88
                replicateArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
20
89
                sep="\t",
21
90
                na.string = "-",
22
91
                header = TRUE,
161
230
        #Handle Feature Data (this will actually be done on CuffData objects instead...but I may include something here as well)
162
231
        #########
163
232
        
 
233
        ###########
 
234
        #Handle Counts .count_tracking
 
235
        ###########
 
236
        if(file.exists(countFile)){
 
237
                
 
238
                idCols = c(1)
 
239
                
 
240
                #Read countFile
 
241
                write(paste("Reading ", countFile,sep=""),stderr())
 
242
                countArgs$file = countFile
 
243
                counts<-as.data.frame(do.call(read.table,countArgs))
 
244
                
 
245
                if(dim(counts)[1]>0){
 
246
                        #Reshape geneCount table
 
247
                        write("Reshaping geneCount table",stderr())
 
248
                        countmelt<-melt(counts,id.vars=c("tracking_id"),measure.vars=-idCols)
 
249
                        colnames(countmelt)[colnames(countmelt)=='variable']<-'sample_name'
 
250
                        
 
251
                        countmelt$measurement = ""
 
252
                        
 
253
                        countmelt$measurement[grepl("_count$",countmelt$sample_name)] = "count"
 
254
                        countmelt$measurement[grepl("_count_variance$",countmelt$sample_name)] = "variance"
 
255
                        countmelt$measurement[grepl("_count_uncertainty_var$",countmelt$sample_name)] = "uncertainty"
 
256
                        countmelt$measurement[grepl("_count_dispersion_var$",countmelt$sample_name)] = "dispersion"
 
257
                        countmelt$measurement[grepl("_status$",countmelt$sample_name)] = "status"
 
258
                        
 
259
                        countmelt$sample_name<-gsub("_count$","",countmelt$sample_name)
 
260
                        countmelt$sample_name<-gsub("_count_variance$","",countmelt$sample_name)
 
261
                        countmelt$sample_name<-gsub("_count_uncertainty_var$","",countmelt$sample_name)
 
262
                        countmelt$sample_name<-gsub("_count_dispersion_var$","",countmelt$sample_name)
 
263
                        countmelt$sample_name<-gsub("_status$","",countmelt$sample_name)
 
264
                        
 
265
                        #Adjust sample names with make.db.names
 
266
                        countmelt$sample_name <- make.db.names(dbConn,as.vector(countmelt$sample_name),unique=FALSE)
 
267
                        
 
268
                        #Recast
 
269
                        write("Recasting",stderr())
 
270
                        countmelt<-as.data.frame(dcast(countmelt,...~measurement))
 
271
                        
 
272
                        #debugging
 
273
                        #write(colnames(countmelt),stderr())
 
274
                        
 
275
        
 
276
                        #Write geneCount table
 
277
                        write("Writing geneCount table",stderr())
 
278
                        insert_SQL<-'INSERT INTO geneCount VALUES(:tracking_id,:sample_name,:count,:variance,:uncertainty,:dispersion,:status)'
 
279
                        bulk_insert(dbConn,insert_SQL,countmelt)
 
280
                }else{
 
281
                        write(paste("No records found in", countFile),stderr())
 
282
                }
 
283
                
 
284
        }
 
285
                
 
286
                
 
287
        ###########
 
288
        #Handle Replicates .rep_tracking
 
289
        ###########
 
290
        if(file.exists(replicateFile)){
 
291
 
 
292
                idCols = 1
 
293
                #Read countFile
 
294
                write(paste("Reading read group info in ", replicateFile,sep=""),stderr())
 
295
                replicateArgs$file = replicateFile
 
296
                reps<-as.data.frame(do.call(read.table,replicateArgs))
 
297
                #print(head(reps))
 
298
                
 
299
                if(dim(reps)[1]>0){
 
300
                
 
301
                        #Adjust sample names with make.db.names
 
302
                        reps$condition <- make.db.names(dbConn,as.character(reps$condition),unique=FALSE)
 
303
                
 
304
                        #Create unique rep name
 
305
                        reps$rep_name<-paste(reps$condition,reps$replicate,sep="_")
 
306
                        colnames(reps)[colnames(reps)=="condition"]<-"sample_name"
 
307
                        
 
308
                        #Write geneReplicateData table
 
309
                        write("Writing geneReplicateData table",stderr())
 
310
                        insert_SQL<-'INSERT INTO geneReplicateData VALUES(:tracking_id,:sample_name,:replicate,:rep_name,:raw_frags,:internal_scaled_frags,:external_scaled_frags,:FPKM,:effective_length,:status)'
 
311
                        bulk_insert(dbConn,insert_SQL,reps)
 
312
                }else{
 
313
                        write(paste("No records found in", replicateFile),stderr())
 
314
                }
 
315
                
 
316
        }
 
317
        
164
318
}
165
319
        
166
320
#Isoforms
167
321
loadIsoforms<-function(fpkmFile,
168
322
                diffFile,
 
323
                countFile,
 
324
                replicateFile,
169
325
                dbConn,
170
326
                path,
171
327
                #Arguments to read.* methods
172
328
                fpkmArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
173
329
                diffArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
 
330
                countArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
 
331
                replicateArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
174
332
                sep="\t",
175
333
                na.string = "-",
176
334
                header = TRUE,
226
384
        #This is a temporary fix until p_id is added to the 'isoforms.fpkm_tracking' file
227
385
        isoformsTable<-cbind(isoformsTable[,1:2],data.frame(CDS_id=rep("NA",dim(isoformsTable)[1])),isoformsTable[,-c(1:2)])
228
386
        #print (head(isoformsTable))
 
387
        
229
388
        write("Writing isoforms table",stderr())
230
389
        #dbWriteTable(dbConn,'isoforms',as.data.frame(isoformsTable),row.names=F,append=T)
231
390
        insert_SQL<-'INSERT INTO isoforms VALUES(?,?,?,?,?,?,?,?,?,?)'
287
446
                }
288
447
        }
289
448
        
 
449
        ###########
 
450
        #Handle Counts .count_tracking
 
451
        ###########
 
452
        if(file.exists(countFile)){
 
453
                
 
454
                idCols = c(1)
 
455
                
 
456
                #Read countFile
 
457
                write(paste("Reading ", countFile,sep=""),stderr())
 
458
                countArgs$file = countFile
 
459
                counts<-as.data.frame(do.call(read.table,countArgs))
 
460
                
 
461
                if(dim(counts)[1]>0){
 
462
                
 
463
                        #Reshape isoformCount table
 
464
                        write("Reshaping isoformCount table",stderr())
 
465
                        countmelt<-melt(counts,id.vars=c("tracking_id"),measure.vars=-idCols)
 
466
                        colnames(countmelt)[colnames(countmelt)=='variable']<-'sample_name'
 
467
                        
 
468
                        countmelt$measurement = ""
 
469
                        
 
470
                        countmelt$measurement[grepl("_count$",countmelt$sample_name)] = "count"
 
471
                        countmelt$measurement[grepl("_count_variance$",countmelt$sample_name)] = "variance"
 
472
                        countmelt$measurement[grepl("_count_uncertainty_var$",countmelt$sample_name)] = "uncertainty"
 
473
                        countmelt$measurement[grepl("_count_dispersion_var$",countmelt$sample_name)] = "dispersion"
 
474
                        countmelt$measurement[grepl("_status$",countmelt$sample_name)] = "status"
 
475
                        
 
476
                        countmelt$sample_name<-gsub("_count$","",countmelt$sample_name)
 
477
                        countmelt$sample_name<-gsub("_count_variance$","",countmelt$sample_name)
 
478
                        countmelt$sample_name<-gsub("_count_uncertainty_var$","",countmelt$sample_name)
 
479
                        countmelt$sample_name<-gsub("_count_dispersion_var$","",countmelt$sample_name)
 
480
                        countmelt$sample_name<-gsub("_status$","",countmelt$sample_name)
 
481
                        
 
482
                        #Adjust sample names with make.db.names
 
483
                        countmelt$sample_name <- make.db.names(dbConn,as.vector(countmelt$sample_name),unique=FALSE)
 
484
                        
 
485
                        
 
486
                        #Recast
 
487
                        write("Recasting",stderr())
 
488
                        countmelt<-as.data.frame(dcast(countmelt,...~measurement))
 
489
                        
 
490
                        #debugging
 
491
                        #write(colnames(countmelt),stderr())
 
492
                        
 
493
                        
 
494
                        #Write isoformCount table
 
495
                        write("Writing isoformCount table",stderr())
 
496
                        insert_SQL<-'INSERT INTO isoformCount VALUES(:tracking_id,:sample_name,:count,:variance,:uncertainty,:dispersion,:status)'
 
497
                        bulk_insert(dbConn,insert_SQL,countmelt)
 
498
                }else{
 
499
                        write(paste("No records found in",countFile),stderr())
 
500
                }
 
501
        }
 
502
        
 
503
        
 
504
        ###########
 
505
        #Handle Replicates .rep_tracking
 
506
        ###########
 
507
        if(file.exists(replicateFile)){
 
508
                
 
509
                idCols = 1
 
510
                #Read countFile
 
511
                write(paste("Reading read group info in ", replicateFile,sep=""),stderr())
 
512
                replicateArgs$file = replicateFile
 
513
                reps<-as.data.frame(do.call(read.table,replicateArgs))
 
514
                #print(head(reps))
 
515
                
 
516
                if(dim(reps)[1]>0){
 
517
                        
 
518
                        #Adjust sample names with make.db.names
 
519
                        reps$condition <- make.db.names(dbConn,as.character(reps$condition),unique=FALSE)
 
520
                
 
521
                        #Create unique rep name
 
522
                        reps$rep_name<-paste(reps$condition,reps$replicate,sep="_")
 
523
                        colnames(reps)[colnames(reps)=="condition"]<-"sample_name"
 
524
                        
 
525
                        #Write isoformReplicateData table
 
526
                        write("Writing isoformReplicateData table",stderr())
 
527
                        insert_SQL<-'INSERT INTO isoformReplicateData VALUES(:tracking_id,:sample_name,:replicate,:rep_name,:raw_frags,:internal_scaled_frags,:external_scaled_frags,:FPKM,:effective_length,:status)'
 
528
                        bulk_insert(dbConn,insert_SQL,reps)
 
529
                }else{
 
530
                        write(paste("No records found in",replicateFile),stderr())
 
531
                }       
 
532
        }
 
533
        
290
534
}
291
535
 
292
536
#TSS groups
293
537
loadTSS<-function(fpkmFile,
294
538
                diffFile,
295
539
                splicingFile,
 
540
                countFile,
 
541
                replicateFile,
296
542
                dbConn,
297
543
                path,
298
544
                #Arguments to read.* methods
299
545
                fpkmArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
300
546
                diffArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
301
547
                splicingArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
 
548
                countArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
 
549
                replicateArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
302
550
                sep="\t",
303
551
                na.string = "-",
304
552
                header = TRUE,
345
593
        }
346
594
        
347
595
        ######
348
 
        #Populate genes table
 
596
        #Populate TSS table
349
597
        ######
350
598
        tssTable<-full[,c(1:5,7:9)]
351
599
        write("Writing TSS table",stderr())
435
683
                }
436
684
        }
437
685
        
 
686
        ###########
 
687
        #Handle Counts .count_tracking
 
688
        ###########
 
689
        if(file.exists(countFile)){
 
690
                
 
691
                idCols = c(1)
 
692
                
 
693
                #Read countFile
 
694
                write(paste("Reading ", countFile,sep=""),stderr())
 
695
                countArgs$file = countFile
 
696
                counts<-as.data.frame(do.call(read.table,countArgs))
 
697
                
 
698
                if(dim(counts)[1]>0){
 
699
                
 
700
                        #Reshape TSSCount table
 
701
                        write("Reshaping TSSCount table",stderr())
 
702
                        countmelt<-melt(counts,id.vars=c("tracking_id"),measure.vars=-idCols)
 
703
                        colnames(countmelt)[colnames(countmelt)=='variable']<-'sample_name'
 
704
                        
 
705
                        countmelt$measurement = ""
 
706
                        
 
707
                        countmelt$measurement[grepl("_count$",countmelt$sample_name)] = "count"
 
708
                        countmelt$measurement[grepl("_count_variance$",countmelt$sample_name)] = "variance"
 
709
                        countmelt$measurement[grepl("_count_uncertainty_var$",countmelt$sample_name)] = "uncertainty"
 
710
                        countmelt$measurement[grepl("_count_dispersion_var$",countmelt$sample_name)] = "dispersion"
 
711
                        countmelt$measurement[grepl("_status$",countmelt$sample_name)] = "status"
 
712
                        
 
713
                        countmelt$sample_name<-gsub("_count$","",countmelt$sample_name)
 
714
                        countmelt$sample_name<-gsub("_count_variance$","",countmelt$sample_name)
 
715
                        countmelt$sample_name<-gsub("_count_uncertainty_var$","",countmelt$sample_name)
 
716
                        countmelt$sample_name<-gsub("_count_dispersion_var$","",countmelt$sample_name)
 
717
                        countmelt$sample_name<-gsub("_status$","",countmelt$sample_name)
 
718
                        
 
719
                        #Adjust sample names with make.db.names
 
720
                        countmelt$sample_name <- make.db.names(dbConn,as.vector(countmelt$sample_name),unique=FALSE)
 
721
                        
 
722
                        
 
723
                        #Recast
 
724
                        write("Recasting",stderr())
 
725
                        countmelt<-as.data.frame(dcast(countmelt,...~measurement))
 
726
                        
 
727
                        #debugging
 
728
                        #write(colnames(countmelt),stderr())
 
729
                        
 
730
                        
 
731
                        #Write TSSCount table
 
732
                        write("Writing TSSCount table",stderr())
 
733
                        insert_SQL<-'INSERT INTO TSSCount VALUES(:tracking_id,:sample_name,:count,:variance,:uncertainty,:dispersion,:status)'
 
734
                        bulk_insert(dbConn,insert_SQL,countmelt)
 
735
                }else{
 
736
                        write(paste("No records found in",countFile),stderr())
 
737
                }
 
738
        }
 
739
        
 
740
        
 
741
        ###########
 
742
        #Handle Replicates .rep_tracking
 
743
        ###########
 
744
        if(file.exists(replicateFile)){
 
745
                
 
746
                idCols = 1
 
747
                #Read countFile
 
748
                write(paste("Reading read group info in ", replicateFile,sep=""),stderr())
 
749
                replicateArgs$file = replicateFile
 
750
                reps<-as.data.frame(do.call(read.table,replicateArgs))
 
751
                #print(head(reps))
 
752
                
 
753
                if(dim(reps)[1]>0){
 
754
                                
 
755
                        #Adjust sample names with make.db.names
 
756
                        reps$condition <- make.db.names(dbConn,as.character(reps$condition),unique=FALSE)
 
757
                
 
758
                        #Create unique rep name
 
759
                        reps$rep_name<-paste(reps$condition,reps$replicate,sep="_")
 
760
                        colnames(reps)[colnames(reps)=="condition"]<-"sample_name"
 
761
                        
 
762
                        #Write TSSReplicateData table
 
763
                        write("Writing TSSReplicateData table",stderr())
 
764
                        insert_SQL<-'INSERT INTO TSSReplicateData VALUES(:tracking_id,:sample_name,:replicate,:rep_name,:raw_frags,:internal_scaled_frags,:external_scaled_frags,:FPKM,:effective_length,:status)'
 
765
                        bulk_insert(dbConn,insert_SQL,reps)
 
766
                }else{
 
767
                        write(paste("No records found in",replicateFile),stderr())
 
768
                }
 
769
                
 
770
        }
 
771
        
438
772
}
439
773
 
440
774
#CDS
441
775
loadCDS<-function(fpkmFile,
442
776
                diffFile,
443
777
                CDSDiff,
 
778
                countFile,
 
779
                replicateFile,
444
780
                dbConn,
445
781
                path,
446
782
                #Arguments to read.* methods
447
783
                fpkmArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
448
784
                diffArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
449
785
                CDSDiffArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
 
786
                countArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
 
787
                replicateArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
450
788
                sep="\t",
451
789
                na.string = "-",
452
790
                header = TRUE,
494
832
        }
495
833
        
496
834
        ######
497
 
        #Populate genes table
 
835
        #Populate CDS table
498
836
        ######
499
837
        cdsTable<-full[,c(1:5,6:9)]
500
838
        write("Writing CDS table",stderr())
585
923
                }
586
924
        }
587
925
        
 
926
        ###########
 
927
        #Handle Counts .count_tracking
 
928
        ###########
 
929
        if(file.exists(countFile)){
 
930
                
 
931
                idCols = c(1)
 
932
                
 
933
                #Read countFile
 
934
                write(paste("Reading ", countFile,sep=""),stderr())
 
935
                countArgs$file = countFile
 
936
                counts<-as.data.frame(do.call(read.table,countArgs))
 
937
                
 
938
                if(dim(counts)[1]>0){
 
939
                
 
940
                        #Reshape CDSCount table
 
941
                        write("Reshaping CDSCount table",stderr())
 
942
                        countmelt<-melt(counts,id.vars=c("tracking_id"),measure.vars=-idCols)
 
943
                        colnames(countmelt)[colnames(countmelt)=='variable']<-'sample_name'
 
944
                        
 
945
                        countmelt$measurement = ""
 
946
                        
 
947
                        countmelt$measurement[grepl("_count$",countmelt$sample_name)] = "count"
 
948
                        countmelt$measurement[grepl("_count_variance$",countmelt$sample_name)] = "variance"
 
949
                        countmelt$measurement[grepl("_count_uncertainty_var$",countmelt$sample_name)] = "uncertainty"
 
950
                        countmelt$measurement[grepl("_count_dispersion_var$",countmelt$sample_name)] = "dispersion"
 
951
                        countmelt$measurement[grepl("_status$",countmelt$sample_name)] = "status"
 
952
                        
 
953
                        countmelt$sample_name<-gsub("_count$","",countmelt$sample_name)
 
954
                        countmelt$sample_name<-gsub("_count_variance$","",countmelt$sample_name)
 
955
                        countmelt$sample_name<-gsub("_count_uncertainty_var$","",countmelt$sample_name)
 
956
                        countmelt$sample_name<-gsub("_count_dispersion_var$","",countmelt$sample_name)
 
957
                        countmelt$sample_name<-gsub("_status$","",countmelt$sample_name)
 
958
                        
 
959
                        #Adjust sample names with make.db.names
 
960
                        countmelt$sample_name <- make.db.names(dbConn,as.vector(countmelt$sample_name),unique=FALSE)
 
961
                        
 
962
                        
 
963
                        #Recast
 
964
                        write("Recasting",stderr())
 
965
                        countmelt<-as.data.frame(dcast(countmelt,...~measurement))
 
966
                        
 
967
                        #debugging
 
968
                        #write(colnames(countmelt),stderr())
 
969
                        
 
970
                        
 
971
                        #Write CDSCount table
 
972
                        write("Writing CDSCount table",stderr())
 
973
                        insert_SQL<-'INSERT INTO CDSCount VALUES(:tracking_id,:sample_name,:count,:variance,:uncertainty,:dispersion,:status)'
 
974
                        bulk_insert(dbConn,insert_SQL,countmelt)
 
975
                }else{
 
976
                        write(paste("No records found in",countFile),stderr())
 
977
                }
 
978
        }
 
979
        
 
980
        
 
981
        ###########
 
982
        #Handle Replicates .rep_tracking
 
983
        ###########
 
984
        if(file.exists(replicateFile)){
 
985
                
 
986
                idCols = 1
 
987
                #Read countFile
 
988
                write(paste("Reading read group info in ", replicateFile,sep=""),stderr())
 
989
                replicateArgs$file = replicateFile
 
990
                reps<-as.data.frame(do.call(read.table,replicateArgs))
 
991
                #print(head(reps))
 
992
                
 
993
                if(dim(reps)[1]>0){
 
994
                                
 
995
                        #Adjust sample names with make.db.names
 
996
                        reps$condition <- make.db.names(dbConn,as.character(reps$condition),unique=FALSE)
 
997
                
 
998
                        #Create unique rep name
 
999
                        reps$rep_name<-paste(reps$condition,reps$replicate,sep="_")
 
1000
                        colnames(reps)[colnames(reps)=="condition"]<-"sample_name"
 
1001
                        
 
1002
                        #Write CDSReplicateData table
 
1003
                        write("Writing CDSReplicateData table",stderr())
 
1004
                        insert_SQL<-'INSERT INTO CDSReplicateData VALUES(:tracking_id,:sample_name,:replicate,:rep_name,:raw_frags,:internal_scaled_frags,:external_scaled_frags,:FPKM,:effective_length,:status)'
 
1005
                        bulk_insert(dbConn,insert_SQL,reps)
 
1006
                }else{
 
1007
                        write(paste("No records found in",replicateFile),stderr())
 
1008
                }
 
1009
                
 
1010
        }
 
1011
        
588
1012
}
589
1013
 
590
1014
########################
596
1020
#Database Setup Functions
597
1021
#####################
598
1022
 
599
 
createDB<-function(dbFname="cuffData.db",driver="SQLite") {
600
 
        #Builds sqlite db at 'dbFname' and returns a dbConnect object pointing to newly created database.
601
 
        #May be deprecated if I can load first and index later...
602
 
        
603
 
        drv<-dbDriver(driver)
604
 
        db <- dbConnect(drv,dbname=dbFname)
605
 
        
606
 
        schema.text<-'
607
 
PRAGMA foreign_keys = OFF;
608
 
PRAGMA synchronous = OFF;
609
 
PRAGMA journal_mode = MEMORY;
610
 
BEGIN;
611
 
DROP TABLE IF EXISTS "genes";
612
 
CREATE TABLE "genes"(
613
 
  "gene_id" VARCHAR(45) PRIMARY KEY NOT NULL,
614
 
  "class_code" VARCHAR(45),
615
 
  "nearest_ref_id" VARCHAR(45),
616
 
  "gene_short_name" VARCHAR(45),
617
 
  "locus" VARCHAR(45),
618
 
  "length" INTEGER,
619
 
  "coverage" FLOAT
620
 
);
621
 
DROP TABLE IF EXISTS "biasData";
622
 
CREATE TABLE "biasData"(
623
 
  "biasData_id" INTEGER PRIMARY KEY NOT NULL
624
 
);
625
 
DROP TABLE IF EXISTS "samples";
626
 
CREATE TABLE "samples"(
627
 
  "sample_index" INTEGER PRIMARY KEY NOT NULL,
628
 
  "sample_name" VARCHAR(45) NOT NULL
629
 
);
630
 
DROP TABLE IF EXISTS "TSS";
631
 
CREATE TABLE "TSS"(
632
 
  "TSS_group_id" VARCHAR(45) PRIMARY KEY NOT NULL,
633
 
  "class_code" VARCHAR(45),
634
 
  "nearest_ref_id" VARCHAR(45),
635
 
  "gene_id" VARCHAR(45) NOT NULL,
636
 
  "gene_short_name" VARCHAR(45),
637
 
  "locus" VARCHAR(45),
638
 
  "length" INTEGER,
639
 
  "coverage" FLOAT,
640
 
  CONSTRAINT "fk_TSS_genes1"
641
 
    FOREIGN KEY("gene_id")
642
 
    REFERENCES "genes"("gene_id")
643
 
);
644
 
CREATE INDEX "TSS.fk_TSS_genes1" ON "TSS"("gene_id");
645
 
CREATE INDEX "TSS.fk_TSS_genes2" ON "TSS"("gene_short_name");
646
 
DROP TABLE IF EXISTS "TSSData";
647
 
CREATE TABLE "TSSData"(
648
 
  "TSS_group_id" VARCHAR(45) NOT NULL,
649
 
  "sample_name" VARCHAR(45) NOT NULL,
650
 
  "fpkm" FLOAT,
651
 
  "conf_hi" FLOAT,
652
 
  "conf_lo" FLOAT,
653
 
  "quant_status" VARCHAR(45),
654
 
  CONSTRAINT "fk_TSSData_TSS1"
655
 
    FOREIGN KEY("TSS_group_id")
656
 
    REFERENCES "TSS"("TSS_group_id"),
657
 
  CONSTRAINT "fk_TSSData_samples1"
658
 
    FOREIGN KEY("sample_name")
659
 
    REFERENCES "samples"("sample_name")
660
 
);
661
 
CREATE INDEX "TSSData.fk_TSSData_TSS1" ON "TSSData"("TSS_group_id");
662
 
CREATE INDEX "TSSData.fk_TSSData_samples1" ON "TSSData"("sample_name");
663
 
DROP TABLE IF EXISTS "CDS";
664
 
CREATE TABLE "CDS"(
665
 
  "CDS_id" VARCHAR(45) PRIMARY KEY NOT NULL,
666
 
  "class_code" VARCHAR(45),
667
 
  "nearest_ref_id" VARCHAR(45),
668
 
  "gene_id" VARCHAR(45),
669
 
  "gene_short_name" VARCHAR(45),
670
 
  "TSS_group_id" VARCHAR(45),
671
 
  "locus" VARCHAR(45),
672
 
  "length" INTEGER,
673
 
  "coverage" FLOAT,
674
 
  CONSTRAINT "fk_CDS_genes1"
675
 
    FOREIGN KEY("gene_id")
676
 
    REFERENCES "genes"("gene_id"),
677
 
  CONSTRAINT "fk_CDS_TSS1"
678
 
    FOREIGN KEY("TSS_group_id")
679
 
    REFERENCES "TSS"("TSS_group_id")
680
 
);
681
 
CREATE INDEX "CDS.fk_CDS_genes1" ON "CDS"("gene_id");
682
 
CREATE INDEX "CDS.fk_CDS_genes2" ON "CDS"("gene_short_name");
683
 
CREATE INDEX "CDS.fk_CDS_TSS1" ON "CDS"("TSS_group_id");
684
 
DROP TABLE IF EXISTS "CDSData";
685
 
CREATE TABLE "CDSData"(
686
 
  "CDS_id" VARCHAR(45) NOT NULL,
687
 
  "sample_name" VARCHAR(45) NOT NULL,
688
 
  "fpkm" FLOAT,
689
 
  "conf_hi" FLOAT,
690
 
  "conf_lo" FLOAT,
691
 
  "quant_status" VARCHAR(45),
692
 
  CONSTRAINT "fk_CDSData_CDS1"
693
 
    FOREIGN KEY("CDS_id")
694
 
    REFERENCES "CDS"("CDS_id"),
695
 
  CONSTRAINT "fk_CDSData_samples1"
696
 
    FOREIGN KEY("sample_name")
697
 
    REFERENCES "samples"("sample_name")
698
 
);
699
 
CREATE INDEX "CDSData.fk_CDSData_CDS1" ON "CDSData"("CDS_id");
700
 
CREATE INDEX "CDSData.fk_CDSData_samples1" ON "CDSData"("sample_name");
701
 
DROP TABLE IF EXISTS "splicingDiffData";
702
 
CREATE TABLE "splicingDiffData"(
703
 
  "TSS_group_id" VARCHAR(45) NOT NULL,
704
 
  "gene_id" VARCHAR(45) NOT NULL,
705
 
  "sample_1" VARCHAR(45) NOT NULL,
706
 
  "sample_2" VARCHAR(45) NOT NULL,
707
 
  "status" VARCHAR(45),
708
 
  "value_1" FLOAT,
709
 
  "value_2" FLOAT,
710
 
  "JS_dist" FLOAT,
711
 
  "test_stat" FLOAT,
712
 
  "p_value" FLOAT,
713
 
  "q_value" FLOAT,
714
 
  "significant" VARCHAR(45),
715
 
  CONSTRAINT "fk_splicingDiffData_samples1"
716
 
    FOREIGN KEY("sample_1")
717
 
    REFERENCES "samples"("sample_name"),
718
 
  CONSTRAINT "fk_splicingDiffData_samples2"
719
 
    FOREIGN KEY("sample_2")
720
 
    REFERENCES "samples"("sample_name"),
721
 
  CONSTRAINT "fk_splicingDiffData_TSS1"
722
 
    FOREIGN KEY("TSS_group_id")
723
 
    REFERENCES "TSS"("TSS_group_id"),
724
 
  CONSTRAINT "fk_splicingDiffData_genes1"
725
 
    FOREIGN KEY("gene_id")
726
 
    REFERENCES "genes"("gene_id")
727
 
);
728
 
CREATE INDEX "splicingDiffData.fk_splicingDiffData_samples1" ON "splicingDiffData"("sample_1");
729
 
CREATE INDEX "splicingDiffData.fk_splicingDiffData_samples2" ON "splicingDiffData"("sample_2");
730
 
CREATE INDEX "splicingDiffData.fk_splicingDiffData_TSS1" ON "splicingDiffData"("TSS_group_id");
731
 
CREATE INDEX "splicingDiffData.fk_splicingDiffData_genes1" ON "splicingDiffData"("gene_id");
732
 
DROP TABLE IF EXISTS "TSSExpDiffData";
733
 
CREATE TABLE "TSSExpDiffData"(
734
 
  "TSS_group_id" VARCHAR(45) NOT NULL,
735
 
  "sample_1" VARCHAR(45) NOT NULL,
736
 
  "sample_2" VARCHAR(45) NOT NULL,
737
 
  "status" VARCHAR(45),
738
 
  "value_1" FLOAT,
739
 
  "value_2" FLOAT,
740
 
  "log2_fold_change" FLOAT,
741
 
  "test_stat" FLOAT,
742
 
  "p_value" FLOAT,
743
 
  "q_value" FLOAT,
744
 
  "significant" VARCHAR(45),
745
 
  CONSTRAINT "fk_TSSExpDiffData_TSS1"
746
 
    FOREIGN KEY("TSS_group_id")
747
 
    REFERENCES "TSS"("TSS_group_id"),
748
 
  CONSTRAINT "fk_TSSExpDiffData_samples1"
749
 
    FOREIGN KEY("sample_1")
750
 
    REFERENCES "samples"("sample_name"),
751
 
  CONSTRAINT "fk_TSSExpDiffData_samples2"
752
 
    FOREIGN KEY("sample_2")
753
 
    REFERENCES "samples"("sample_name")
754
 
);
755
 
CREATE INDEX "TSSExpDiffData.fk_TSSExpDiffData_TSS1" ON "TSSExpDiffData"("TSS_group_id");
756
 
CREATE INDEX "TSSExpDiffData.fk_TSSExpDiffData_samples1" ON "TSSExpDiffData"("sample_1");
757
 
CREATE INDEX "TSSExpDiffData.fk_TSSExpDiffData_samples2" ON "TSSExpDiffData"("sample_2");
758
 
DROP TABLE IF EXISTS "CDSDiffData";
759
 
CREATE TABLE "CDSDiffData"(
760
 
  "gene_id" VARCHAR(45) NOT NULL,
761
 
  "sample_1" VARCHAR(45) NOT NULL,
762
 
  "sample_2" VARCHAR(45) NOT NULL,
763
 
  "status" VARCHAR(45),
764
 
  "value_1" FLOAT,
765
 
  "value_2" FLOAT,
766
 
  "JS_dist" FLOAT,
767
 
  "test_stat" FLOAT,
768
 
  "p_value" FLOAT,
769
 
  "q_value" FLOAT,
770
 
  "significant" VARCHAR(45),
771
 
  CONSTRAINT "fk_CDSDiffData_samples1"
772
 
    FOREIGN KEY("sample_1")
773
 
    REFERENCES "samples"("sample_name"),
774
 
  CONSTRAINT "fk_CDSDiffData_samples2"
775
 
    FOREIGN KEY("sample_2")
776
 
    REFERENCES "samples"("sample_name"),
777
 
  CONSTRAINT "fk_CDSDiffData_genes1"
778
 
    FOREIGN KEY("gene_id")
779
 
    REFERENCES "genes"("gene_id")
780
 
);
781
 
CREATE INDEX "CDSDiffData.fk_CDSDiffData_samples1" ON "CDSDiffData"("sample_1");
782
 
CREATE INDEX "CDSDiffData.fk_CDSDiffData_samples2" ON "CDSDiffData"("sample_2");
783
 
CREATE INDEX "CDSDiffData.fk_CDSDiffData_genes1" ON "CDSDiffData"("gene_id");
784
 
DROP TABLE IF EXISTS "CDSExpDiffData";
785
 
CREATE TABLE "CDSExpDiffData"(
786
 
  "CDS_id" VARCHAR(45) NOT NULL,
787
 
  "sample_1" VARCHAR(45) NOT NULL,
788
 
  "sample_2" VARCHAR(45) NOT NULL,
789
 
  "status" VARCHAR(45),
790
 
  "value_1" FLOAT,
791
 
  "value_2" FLOAT,
792
 
  "log2_fold_change" FLOAT,
793
 
  "test_stat" FLOAT,
794
 
  "p_value" FLOAT,
795
 
  "q_value" FLOAT,
796
 
  "significant" VARCHAR(45),
797
 
  CONSTRAINT "fk_CDSExpDiffData_CDS1"
798
 
    FOREIGN KEY("CDS_id")
799
 
    REFERENCES "CDS"("CDS_id"),
800
 
  CONSTRAINT "fk_CDSExpDiffData_samples1"
801
 
    FOREIGN KEY("sample_1")
802
 
    REFERENCES "samples"("sample_name"),
803
 
  CONSTRAINT "fk_CDSExpDiffData_samples2"
804
 
    FOREIGN KEY("sample_2")
805
 
    REFERENCES "samples"("sample_name")
806
 
);
807
 
CREATE INDEX "CDSExpDiffData.fk_CDSExpDiffData_CDS1" ON "CDSExpDiffData"("CDS_id");
808
 
CREATE INDEX "CDSExpDiffData.fk_CDSExpDiffData_samples1" ON "CDSExpDiffData"("sample_1");
809
 
CREATE INDEX "CDSExpDiffData.fk_CDSExpDiffData_samples2" ON "CDSExpDiffData"("sample_2");
810
 
DROP TABLE IF EXISTS "promoterDiffData";
811
 
CREATE TABLE "promoterDiffData"(
812
 
  "gene_id" VARCHAR(45) NOT NULL,
813
 
  "sample_1" VARCHAR(45) NOT NULL,
814
 
  "sample_2" VARCHAR(45) NOT NULL,
815
 
  "status" VARCHAR(45),
816
 
  "value_1" FLOAT,
817
 
  "value_2" FLOAT,
818
 
  "JS_dist" FLOAT,
819
 
  "test_stat" FLOAT,
820
 
  "p_value" FLOAT,
821
 
  "q_value" FLOAT,
822
 
  "significant" VARCHAR(45),
823
 
  CONSTRAINT "fk_promoterDiffData_genes1"
824
 
    FOREIGN KEY("gene_id")
825
 
    REFERENCES "genes"("gene_id"),
826
 
  CONSTRAINT "fk_promoterDiffData_samples1"
827
 
    FOREIGN KEY("sample_1")
828
 
    REFERENCES "samples"("sample_name"),
829
 
  CONSTRAINT "fk_promoterDiffData_samples2"
830
 
    FOREIGN KEY("sample_2")
831
 
    REFERENCES "samples"("sample_name")
832
 
);
833
 
CREATE INDEX "promoterDiffData.fk_promoterDiffData_genes1" ON "promoterDiffData"("gene_id");
834
 
CREATE INDEX "promoterDiffData.fk_promoterDiffData_samples1" ON "promoterDiffData"("sample_1");
835
 
CREATE INDEX "promoterDiffData.fk_promoterDiffData_samples2" ON "promoterDiffData"("sample_2");
836
 
DROP TABLE IF EXISTS "geneFeatures";
837
 
CREATE TABLE "geneFeatures"(
838
 
  "gene_id" VARCHAR(45) NOT NULL,
839
 
  CONSTRAINT "fk_geneFeatures_genes1"
840
 
    FOREIGN KEY("gene_id")
841
 
    REFERENCES "genes"("gene_id")
842
 
);
843
 
CREATE INDEX "geneFeatures.fk_geneFeatures_genes1" ON "geneFeatures"("gene_id");
844
 
DROP TABLE IF EXISTS "TSSFeatures";
845
 
CREATE TABLE "TSSFeatures"(
846
 
  "TSS_group_id" VARCHAR(45) NOT NULL,
847
 
  CONSTRAINT "fk_TSSFeatures_TSS1"
848
 
    FOREIGN KEY("TSS_group_id")
849
 
    REFERENCES "TSS"("TSS_group_id")
850
 
);
851
 
CREATE INDEX "TSSFeatures.fk_TSSFeatures_TSS1" ON "TSSFeatures"("TSS_group_id");
852
 
DROP TABLE IF EXISTS "CDSFeatures";
853
 
CREATE TABLE "CDSFeatures"(
854
 
  "CDS_id" VARCHAR(45) NOT NULL,
855
 
  CONSTRAINT "fk_CDSFeatures_CDS1"
856
 
    FOREIGN KEY("CDS_id")
857
 
    REFERENCES "CDS"("CDS_id")
858
 
);
859
 
CREATE INDEX "CDSFeatures.fk_CDSFeatures_CDS1" ON "CDSFeatures"("CDS_id");
860
 
DROP TABLE IF EXISTS "geneData";
861
 
CREATE TABLE "geneData"(
862
 
  "gene_id" VARCHAR(45) NOT NULL,
863
 
  "sample_name" VARCHAR(45) NOT NULL,
864
 
  "fpkm" FLOAT,
865
 
  "conf_hi" FLOAT,
866
 
  "conf_lo" FLOAT,
867
 
  "quant_status" VARCHAR(45),
868
 
  CONSTRAINT "fk_geneData_genes1"
869
 
    FOREIGN KEY("gene_id")
870
 
    REFERENCES "genes"("gene_id"),
871
 
  CONSTRAINT "fk_geneData_samples1"
872
 
    FOREIGN KEY("sample_name")
873
 
    REFERENCES "samples"("sample_name")
874
 
);
875
 
CREATE INDEX "geneData.fk_geneData_genes1" ON "geneData"("gene_id");
876
 
CREATE INDEX "geneData.fk_geneData_samples1" ON "geneData"("sample_name");
877
 
DROP TABLE IF EXISTS "phenoData";
878
 
CREATE TABLE "phenoData"(
879
 
  "sample_name" VARCHAR(45) NOT NULL,
880
 
  "parameter" VARCHAR(45) NOT NULL,
881
 
  "value" VARCHAR(45),
882
 
  CONSTRAINT "fk_phenoData_samples"
883
 
    FOREIGN KEY("sample_name")
884
 
    REFERENCES "samples"("sample_name")
885
 
);
886
 
CREATE INDEX "phenoData.fk_phenoData_samples" ON "phenoData"("sample_name");
887
 
DROP TABLE IF EXISTS "geneExpDiffData";
888
 
CREATE TABLE "geneExpDiffData"(
889
 
  "gene_id" VARCHAR(45) NOT NULL,
890
 
  "sample_1" VARCHAR(45) NOT NULL,
891
 
  "sample_2" VARCHAR(45) NOT NULL,
892
 
  "status" VARCHAR(45),
893
 
  "value_1" FLOAT,
894
 
  "value_2" FLOAT,
895
 
  "log2_fold_change" FLOAT,
896
 
  "test_stat" FLOAT,
897
 
  "p_value" FLOAT,
898
 
  "q_value" FLOAT,
899
 
  "significant" VARCHAR(45),
900
 
  CONSTRAINT "fk_geneExpDiffData_genes1"
901
 
    FOREIGN KEY("gene_id")
902
 
    REFERENCES "genes"("gene_id"),
903
 
  CONSTRAINT "fk_geneExpDiffData_samples1"
904
 
    FOREIGN KEY("sample_1")
905
 
    REFERENCES "samples"("sample_name"),
906
 
  CONSTRAINT "fk_geneExpDiffData_samples2"
907
 
    FOREIGN KEY("sample_2")
908
 
    REFERENCES "samples"("sample_name")
909
 
);
910
 
CREATE INDEX "geneExpDiffData.fk_geneExpDiffData_genes1" ON "geneExpDiffData"("gene_id");
911
 
CREATE INDEX "geneExpDiffData.fk_geneExpDiffData_samples1" ON "geneExpDiffData"("sample_1");
912
 
CREATE INDEX "geneExpDiffData.fk_geneExpDiffData_samples2" ON "geneExpDiffData"("sample_2");
913
 
DROP TABLE IF EXISTS "isoforms";
914
 
CREATE TABLE "isoforms"(
915
 
  "isoform_id" VARCHAR(45) PRIMARY KEY NOT NULL,
916
 
  "gene_id" VARCHAR(45),
917
 
  "CDS_id" VARCHAR(45),
918
 
  "gene_short_name" VARCHAR(45),
919
 
  "TSS_group_id" VARCHAR(45),
920
 
  "class_code" VARCHAR(45),
921
 
  "nearest_ref_id" VARCHAR(45),
922
 
  "locus" VARCHAR(45),
923
 
  "length" INTEGER,
924
 
  "coverage" FLOAT,
925
 
  CONSTRAINT "fk_isoforms_TSS1"
926
 
    FOREIGN KEY("TSS_group_id")
927
 
    REFERENCES "TSS"("TSS_group_id"),
928
 
  CONSTRAINT "fk_isoforms_CDS1"
929
 
    FOREIGN KEY("CDS_id")
930
 
    REFERENCES "CDS"("CDS_id"),
931
 
  CONSTRAINT "fk_isoforms_genes1"
932
 
    FOREIGN KEY("gene_id")
933
 
    REFERENCES "genes"("gene_id")
934
 
);
935
 
CREATE INDEX "isoforms.fk_isoforms_TSS1" ON "isoforms"("TSS_group_id");
936
 
CREATE INDEX "isoforms.fk_isoforms_CDS1" ON "isoforms"("CDS_id");
937
 
CREATE INDEX "isoforms.fk_isoforms_genes1" ON "isoforms"("gene_id");
938
 
CREATE INDEX "isoforms.fk_isoforms_genes2" ON "isoforms"("gene_short_name");
939
 
DROP TABLE IF EXISTS "isoformData";
940
 
CREATE TABLE "isoformData"(
941
 
  "isoform_id" VARCHAR(45) NOT NULL,
942
 
  "sample_name" VARCHAR(45) NOT NULL,
943
 
  "fpkm" FLOAT NOT NULL,
944
 
  "conf_hi" FLOAT,
945
 
  "conf_lo" FLOAT,
946
 
  "quant_status" VARCHAR(45),
947
 
  CONSTRAINT "fk_isoformData_samples1"
948
 
    FOREIGN KEY("sample_name")
949
 
    REFERENCES "samples"("sample_name"),
950
 
  CONSTRAINT "fk_isoformData_isoforms1"
951
 
    FOREIGN KEY("isoform_id")
952
 
    REFERENCES "isoforms"("isoform_id")
953
 
);
954
 
CREATE INDEX "isoformData.fk_isoformData_samples1" ON "isoformData"("sample_name");
955
 
CREATE INDEX "isoformData.fk_isoformData_isoforms1" ON "isoformData"("isoform_id");
956
 
DROP TABLE IF EXISTS "isoformExpDiffData";
957
 
CREATE TABLE "isoformExpDiffData"(
958
 
  "isoform_id" VARCHAR(45) NOT NULL,
959
 
  "sample_1" VARCHAR(45) NOT NULL,
960
 
  "sample_2" VARCHAR(45) NOT NULL,
961
 
  "status" VARCHAR(45),
962
 
  "value_1" FLOAT,
963
 
  "value_2" FLOAT,
964
 
  "log2_fold_change" FLOAT,
965
 
  "test_stat" FLOAT,
966
 
  "p_value" FLOAT,
967
 
  "q_value" FLOAT,
968
 
  "significant" VARCHAR(45),
969
 
  CONSTRAINT "fk_isoformExpDiffData_isoforms1"
970
 
    FOREIGN KEY("isoform_id")
971
 
    REFERENCES "isoforms"("isoform_id"),
972
 
  CONSTRAINT "fk_isoformExpDiffData_samples1"
973
 
    FOREIGN KEY("sample_1")
974
 
    REFERENCES "samples"("sample_name"),
975
 
  CONSTRAINT "fk_isoformExpDiffData_samples2"
976
 
    FOREIGN KEY("sample_2")
977
 
    REFERENCES "samples"("sample_name")
978
 
);
979
 
CREATE INDEX "isoformExpDiffData.fk_isoformExpDiffData_isoforms1" ON "isoformExpDiffData"("isoform_id");
980
 
CREATE INDEX "isoformExpDiffData.fk_isoformExpDiffData_samples1" ON "isoformExpDiffData"("sample_1");
981
 
CREATE INDEX "isoformExpDiffData.fk_isoformExpDiffData_samples2" ON "isoformExpDiffData"("sample_2");
982
 
DROP TABLE IF EXISTS "isoformFeatures";
983
 
CREATE TABLE "isoformFeatures"(
984
 
  "isoform_id" VARCHAR(45) NOT NULL,
985
 
  CONSTRAINT "fk_isoformFeatures_isoforms1"
986
 
    FOREIGN KEY("isoform_id")
987
 
    REFERENCES "isoforms"("isoform_id")
988
 
);
989
 
CREATE INDEX "isoformFeatures.fk_isoformFeatures_isoforms1" ON "isoformFeatures"("isoform_id");
990
 
COMMIT;
991
 
 
992
 
'
993
 
                create.sql <- strsplit(schema.text, "\n")[[1]]
994
 
                create.sql <- paste(collapse="\n", create.sql)
995
 
                create.sql <- strsplit(create.sql, ";")[[1]]
996
 
                create.sql <- create.sql[-length(create.sql)] #nothing to run here
997
 
                                
998
 
                tmp <- sapply(create.sql,function(x) sqliteQuickSQL(db,x))
999
 
                db
1000
 
}
1001
 
 
1002
1023
createDB_noIndex<-function(dbFname="cuffData.db",driver="SQLite") {
1003
1024
        #Builds sqlite db at 'dbFname' and returns a dbConnect object pointing to newly created database.
1004
1025
        #No indexes are present
1016
1030
        schema.text<-'
1017
1031
-- Creator:       MySQL Workbench 5.2.33/ExportSQLite plugin 2009.12.02
1018
1032
-- Author:        Loyal Goff
 
1033
-- Caption:       New Model
 
1034
-- Project:       Name of the project
 
1035
-- Changed:       2012-04-30 22:21
1019
1036
-- Created:       2011-05-02 12:52
1020
1037
PRAGMA foreign_keys = OFF;
1021
 
PRAGMA synchronous = OFF;
 
1038
 
1022
1039
-- Schema: cuffData
1023
1040
BEGIN;
1024
1041
DROP TABLE IF EXISTS "genes";
1040
1054
);
1041
1055
DROP TABLE IF EXISTS "samples";
1042
1056
CREATE TABLE "samples"(
1043
 
  "sample_index" INTEGER PRIMARY KEY NOT NULL,
1044
 
  "sample_name" VARCHAR(45) NOT NULL
 
1057
  "sample_index" INTEGER NOT NULL,
 
1058
  "sample_name" VARCHAR(45) PRIMARY KEY NOT NULL
1045
1059
);
1046
1060
DROP TABLE IF EXISTS "TSS";
1047
1061
CREATE TABLE "TSS"(
1245
1259
    FOREIGN KEY("CDS_id")
1246
1260
    REFERENCES "CDS"("CDS_id")
1247
1261
);
 
1262
DROP TABLE IF EXISTS "model_transcripts";
 
1263
CREATE TABLE "model_transcripts"(
 
1264
  "model_transcript_id" INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL
 
1265
);
 
1266
DROP TABLE IF EXISTS "geneCount";
 
1267
CREATE TABLE "geneCount"(
 
1268
  "gene_id" VARCHAR(45) NOT NULL,
 
1269
  "sample_name" VARCHAR(45) NOT NULL,
 
1270
  "count" FLOAT,
 
1271
  "variance" FLOAT,
 
1272
  "uncertainty" FLOAT,
 
1273
  "dispersion" FLOAT,
 
1274
  "status" VARCHAR(45),
 
1275
  CONSTRAINT "fk_geneCount_samples1"
 
1276
    FOREIGN KEY("sample_name")
 
1277
    REFERENCES "samples"("sample_name"),
 
1278
  CONSTRAINT "fk_geneCount_genes1"
 
1279
    FOREIGN KEY("gene_id")
 
1280
    REFERENCES "genes"("gene_id")
 
1281
);
 
1282
DROP TABLE IF EXISTS "CDSCount";
 
1283
CREATE TABLE "CDSCount"(
 
1284
  "CDS_id" VARCHAR(45) NOT NULL,
 
1285
  "sample_name" VARCHAR(45) NOT NULL,
 
1286
  "count" FLOAT,
 
1287
  "variance" FLOAT,
 
1288
  "uncertainty" FLOAT,
 
1289
  "dispersion" FLOAT,
 
1290
  "status" VARCHAR(45),
 
1291
  CONSTRAINT "fk_CDSCount_CDS1"
 
1292
    FOREIGN KEY("CDS_id")
 
1293
    REFERENCES "CDS"("CDS_id"),
 
1294
  CONSTRAINT "fk_CDSCount_samples1"
 
1295
    FOREIGN KEY("sample_name")
 
1296
    REFERENCES "samples"("sample_name")
 
1297
);
 
1298
DROP TABLE IF EXISTS "TSSCount";
 
1299
CREATE TABLE "TSSCount"(
 
1300
  "TSS_group_id" VARCHAR(45) NOT NULL,
 
1301
  "sample_name" VARCHAR(45) NOT NULL,
 
1302
  "count" FLOAT,
 
1303
  "variance" FLOAT,
 
1304
  "uncertainty" FLOAT,
 
1305
  "dispersion" FLOAT,
 
1306
  "status" VARCHAR(45),
 
1307
  CONSTRAINT "fk_TSSCount_TSS1"
 
1308
    FOREIGN KEY("TSS_group_id")
 
1309
    REFERENCES "TSS"("TSS_group_id"),
 
1310
  CONSTRAINT "fk_TSSCount_samples1"
 
1311
    FOREIGN KEY("sample_name")
 
1312
    REFERENCES "samples"("sample_name")
 
1313
);
 
1314
DROP TABLE IF EXISTS "replicates";
 
1315
CREATE TABLE "replicates"(
 
1316
  "file" INTEGER NOT NULL,
 
1317
  "sample_name" VARCHAR(45) NOT NULL,
 
1318
  "replicate" INT NOT NULL,
 
1319
  "rep_name" VARCHAR(45) PRIMARY KEY NOT NULL,
 
1320
  "total_mass" FLOAT,
 
1321
  "norm_mass" FLOAT,
 
1322
  "internal_scale" FLOAT,
 
1323
  "external_scale" FLOAT,
 
1324
  CONSTRAINT "fk_replicates_samples1"
 
1325
    FOREIGN KEY("sample_name")
 
1326
    REFERENCES "samples"("sample_name")
 
1327
);
 
1328
DROP TABLE IF EXISTS "geneReplicateData";
 
1329
CREATE TABLE "geneReplicateData"(
 
1330
  "gene_id" VARCHAR(45) NOT NULL,
 
1331
  "sample_name" VARCHAR(45) NOT NULL,
 
1332
  "replicate" INTEGER,
 
1333
  "rep_name" VARCHAR(45) NOT NULL,
 
1334
  "raw_frags" FLOAT,
 
1335
  "internal_scaled_frags" FLOAT,
 
1336
  "external_scaled_frags" FLOAT,
 
1337
  "fpkm" FLOAT,
 
1338
  "effective_length" FLOAT,
 
1339
  "status" VARCHAR(45),
 
1340
  CONSTRAINT "fk_geneData_genes10"
 
1341
    FOREIGN KEY("gene_id")
 
1342
    REFERENCES "genes"("gene_id"),
 
1343
  CONSTRAINT "fk_geneReplicateData_replicates1"
 
1344
    FOREIGN KEY("rep_name")
 
1345
    REFERENCES "replicates"("rep_name"),
 
1346
  CONSTRAINT "fk_geneReplicateData_samples1"
 
1347
    FOREIGN KEY("sample_name")
 
1348
    REFERENCES "samples"("sample_name")
 
1349
);
 
1350
DROP TABLE IF EXISTS "CDSReplicateData";
 
1351
CREATE TABLE "CDSReplicateData"(
 
1352
  "CDS_id" VARCHAR(45) NOT NULL,
 
1353
  "sample_name" VARCHAR(45) NOT NULL,
 
1354
  "replicate" INTEGER,
 
1355
  "rep_name" VARCHAR(45) NOT NULL,
 
1356
  "raw_frags" FLOAT,
 
1357
  "internal_scaled_frags" FLOAT,
 
1358
  "external_scaled_frags" FLOAT,
 
1359
  "fpkm" FLOAT,
 
1360
  "effective_length" FLOAT,
 
1361
  "status" VARCHAR(45),
 
1362
  CONSTRAINT "fk_geneReplicateData_replicates100"
 
1363
    FOREIGN KEY("rep_name")
 
1364
    REFERENCES "replicates"("rep_name"),
 
1365
  CONSTRAINT "fk_CDSReplicateData_CDS1"
 
1366
    FOREIGN KEY("CDS_id")
 
1367
    REFERENCES "CDS"("CDS_id"),
 
1368
  CONSTRAINT "fk_CDSReplicateData_samples1"
 
1369
    FOREIGN KEY("sample_name")
 
1370
    REFERENCES "samples"("sample_name")
 
1371
);
 
1372
DROP TABLE IF EXISTS "TSSReplicateData";
 
1373
CREATE TABLE "TSSReplicateData"(
 
1374
  "TSS_group_id" VARCHAR(45) NOT NULL,
 
1375
  "sample_name" VARCHAR(45) NOT NULL,
 
1376
  "replicate" VARCHAR(45),
 
1377
  "rep_name" VARCHAR(45) NOT NULL,
 
1378
  "raw_frags" FLOAT,
 
1379
  "internal_scaled_frags" FLOAT,
 
1380
  "external_scaled_frags" FLOAT,
 
1381
  "fpkm" FLOAT,
 
1382
  "effective_length" FLOAT,
 
1383
  "status" VARCHAR(45),
 
1384
  CONSTRAINT "fk_geneReplicateData_replicates10000"
 
1385
    FOREIGN KEY("rep_name")
 
1386
    REFERENCES "replicates"("rep_name"),
 
1387
  CONSTRAINT "fk_TSSReplicateData_TSS1"
 
1388
    FOREIGN KEY("TSS_group_id")
 
1389
    REFERENCES "TSS"("TSS_group_id"),
 
1390
  CONSTRAINT "fk_TSSReplicateData_samples1"
 
1391
    FOREIGN KEY("sample_name")
 
1392
    REFERENCES "samples"("sample_name")
 
1393
);
 
1394
DROP TABLE IF EXISTS "runInfo";
 
1395
CREATE TABLE "runInfo"(
 
1396
  "param" VARCHAR(45),
 
1397
  "value" TEXT
 
1398
);
1248
1399
DROP TABLE IF EXISTS "geneData";
1249
1400
CREATE TABLE "geneData"(
1250
1401
  "gene_id" VARCHAR(45) NOT NULL,
1359
1510
    FOREIGN KEY("isoform_id")
1360
1511
    REFERENCES "isoforms"("isoform_id")
1361
1512
);
 
1513
DROP TABLE IF EXISTS "features";
 
1514
CREATE TABLE "features"(
 
1515
--   GTF Features (all lines/records from reference .gtf file)
 
1516
  "feature_id" INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
 
1517
  "gene_id" VARCHAR(45) NOT NULL,
 
1518
  "isoform_id" VARCHAR(45) NOT NULL,
 
1519
  "seqnames" VARCHAR(45) NOT NULL,
 
1520
  "source" VARCHAR(45) NOT NULL,
 
1521
  "type" INTEGER,
 
1522
  "start" INTEGER,
 
1523
  "end" INTEGER,
 
1524
  "score" FLOAT,
 
1525
  "strand" VARCHAR(45),
 
1526
  "frame" VARCHAR(45),
 
1527
  CONSTRAINT "fk_features_genes1"
 
1528
    FOREIGN KEY("gene_id")
 
1529
    REFERENCES "genes"("gene_id"),
 
1530
  CONSTRAINT "fk_features_isoforms1"
 
1531
    FOREIGN KEY("isoform_id")
 
1532
    REFERENCES "isoforms"("isoform_id")
 
1533
);
 
1534
 
 
1535
DROP TABLE IF EXISTS "attributes";
 
1536
CREATE TABLE "attributes"(
 
1537
  "attribute_lookup_id" INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
 
1538
  "feature_id" INTEGER NOT NULL,
 
1539
  "attribute" VARCHAR(45) NOT NULL,
 
1540
  "value" VARCHAR(45) NOT NULL,
 
1541
  CONSTRAINT "fk_attribute_lookup_features1"
 
1542
    FOREIGN KEY("feature_id")
 
1543
    REFERENCES "features"("feature_id")
 
1544
);
 
1545
DROP TABLE IF EXISTS "isoformCount";
 
1546
CREATE TABLE "isoformCount"(
 
1547
  "isoform_id" VARCHAR(45) NOT NULL,
 
1548
  "sample_name" VARCHAR(45) NOT NULL,
 
1549
  "count" FLOAT,
 
1550
  "variance" FLOAT,
 
1551
  "uncertainty" FLOAT,
 
1552
  "dispersion" FLOAT,
 
1553
  "status" VARCHAR(45),
 
1554
  CONSTRAINT "fk_isoformCount_isoforms1"
 
1555
    FOREIGN KEY("isoform_id")
 
1556
    REFERENCES "isoforms"("isoform_id"),
 
1557
  CONSTRAINT "fk_isoformCount_samples1"
 
1558
    FOREIGN KEY("sample_name")
 
1559
    REFERENCES "samples"("sample_name")
 
1560
);
 
1561
DROP TABLE IF EXISTS "isoformReplicateData";
 
1562
CREATE TABLE "isoformReplicateData"(
 
1563
  "isoform_id" VARCHAR(45) NOT NULL,
 
1564
  "sample_name" VARCHAR(45) NOT NULL,
 
1565
  "replicate" INTEGER,
 
1566
  "rep_name" VARCHAR(45) NOT NULL,
 
1567
  "raw_frags" FLOAT,
 
1568
  "internal_scaled_frags" FLOAT,
 
1569
  "external_scaled_frags" FLOAT,
 
1570
  "fpkm" FLOAT,
 
1571
  "effective_length" FLOAT,
 
1572
  "status" VARCHAR(45),
 
1573
  CONSTRAINT "fk_geneReplicateData_replicates10"
 
1574
    FOREIGN KEY("rep_name")
 
1575
    REFERENCES "replicates"("rep_name"),
 
1576
  CONSTRAINT "fk_isoformReplicateData_isoforms1"
 
1577
    FOREIGN KEY("isoform_id")
 
1578
    REFERENCES "isoforms"("isoform_id"),
 
1579
  CONSTRAINT "fk_isoformReplicateData_samples1"
 
1580
    FOREIGN KEY("sample_name")
 
1581
    REFERENCES "samples"("sample_name")
 
1582
);
1362
1583
COMMIT;
1363
1584
 
 
1585
 
1364
1586
                        '
1365
1587
        create.sql <- strsplit(schema.text, "\n")[[1]]
1366
1588
        create.sql <- paste(collapse="\n", create.sql)
1378
1600
        db <- dbConnect(drv,dbname=dbFname)
1379
1601
        
1380
1602
        index.text<-
1381
 
'CREATE INDEX "genes.gene_id_index" ON "genes"("gene_id");
1382
 
CREATE INDEX "genes.gsn_index" ON "genes"("gene_short_name");
 
1603
'CREATE INDEX "genes.gsn_index" ON "genes"("gene_short_name");
1383
1604
CREATE INDEX "genes.cc_index" ON "genes"("class_code");
1384
1605
CREATE INDEX "TSS.fk_TSS_genes1" ON "TSS"("gene_id");
1385
 
CREATE INDEX "TSS.fk_TSS_genes2" ON "TSS"("gene_short_name");
1386
1606
CREATE INDEX "TSSData.fk_TSSData_TSS1" ON "TSSData"("TSS_group_id");
1387
1607
CREATE INDEX "TSSData.fk_TSSData_samples1" ON "TSSData"("sample_name");
1388
 
CREATE INDEX "TSS.PRIMARY" ON "TSS"("TSS_group_id");
1389
 
CREATE INDEX "CDS.PRIMARY" ON "CDS"("CDS_id");
1390
1608
CREATE INDEX "CDS.fk_CDS_genes1" ON "CDS"("gene_id");
1391
 
CREATE INDEX "CDS.fk_CDS_genes2" ON "CDS"("gene_short_name");
1392
1609
CREATE INDEX "CDS.fk_CDS_TSS1" ON "CDS"("TSS_group_id");
1393
1610
CREATE INDEX "CDSData.fk_CDSData_CDS1" ON "CDSData"("CDS_id");
1394
1611
CREATE INDEX "CDSData.fk_CDSData_samples1" ON "CDSData"("sample_name");
1413
1630
CREATE INDEX "geneFeatures.fk_geneFeatures_genes1" ON "geneFeatures"("gene_id");
1414
1631
CREATE INDEX "TSSFeatures.fk_TSSFeatures_TSS1" ON "TSSFeatures"("TSS_group_id");
1415
1632
CREATE INDEX "CDSFeatures.fk_CDSFeatures_CDS1" ON "CDSFeatures"("CDS_id");
 
1633
CREATE INDEX "geneCount.fk_geneCount_samples1" ON "geneCount"("sample_name");
 
1634
CREATE INDEX "geneCount.fk_geneCount_genes1" ON "geneCount"("gene_id");
 
1635
CREATE INDEX "CDSCount.fk_CDSCount_CDS1" ON "CDSCount"("CDS_id");
 
1636
CREATE INDEX "CDSCount.fk_CDSCount_samples1" ON "CDSCount"("sample_name");
 
1637
CREATE INDEX "TSSCount.fk_TSSCount_TSS1" ON "TSSCount"("TSS_group_id");
 
1638
CREATE INDEX "TSSCount.fk_TSSCount_samples1" ON "TSSCount"("sample_name");
 
1639
CREATE INDEX "replicates.fk_replicates_samples1" ON "replicates"("sample_name");
 
1640
CREATE INDEX "geneReplicateData.fk_geneReplicateData_genes1" ON "geneReplicateData"("gene_id");
 
1641
CREATE INDEX "geneReplicateData.fk_geneReplicateData_replicates1" ON "geneReplicateData"("rep_name");
 
1642
CREATE INDEX "geneReplicateData.fk_geneReplicateData_samples1" ON "geneReplicateData"("sample_name");
 
1643
CREATE INDEX "CDSReplicateData.fk_CDSReplicateData_replicates1" ON "CDSReplicateData"("rep_name");
 
1644
CREATE INDEX "CDSReplicateData.fk_CDSReplicateData_CDS1" ON "CDSReplicateData"("CDS_id");
 
1645
CREATE INDEX "CDSReplicateData.fk_CDSReplicateData_samples1" ON "CDSReplicateData"("sample_name");
 
1646
CREATE INDEX "TSSReplicateData.fk_TSSReplicateData_replicates1" ON "TSSReplicateData"("rep_name");
 
1647
CREATE INDEX "TSSReplicateData.fk_TSSReplicateData_TSS1" ON "TSSReplicateData"("TSS_group_id");
 
1648
CREATE INDEX "TSSReplicateData.fk_TSSReplicateData_samples1" ON "TSSReplicateData"("sample_name");
1416
1649
CREATE INDEX "geneData.fk_geneData_genes1" ON "geneData"("gene_id");
1417
1650
CREATE INDEX "geneData.fk_geneData_samples1" ON "geneData"("sample_name");
1418
1651
CREATE INDEX "phenoData.fk_phenoData_samples" ON "phenoData"("sample_name");
1421
1654
CREATE INDEX "geneExpDiffData.fk_geneExpDiffData_samples2" ON "geneExpDiffData"("sample_2");
1422
1655
CREATE INDEX "geneExpDiffData.geneExpDiff_status_index" ON "geneExpDiffData"("status");
1423
1656
CREATE INDEX "geneExpDiffData.geneExpDiff_sig_index" ON "geneExpDiffData"("significant","p_value","q_value","test_stat");
1424
 
CREATE INDEX "isoforms.PRIMARY" ON "isoforms"("isoform_id");
1425
1657
CREATE INDEX "isoforms.fk_isoforms_TSS1" ON "isoforms"("TSS_group_id");
1426
1658
CREATE INDEX "isoforms.fk_isoforms_CDS1" ON "isoforms"("CDS_id");
1427
1659
CREATE INDEX "isoforms.fk_isoforms_genes1" ON "isoforms"("gene_id");
1428
 
CREATE INDEX "isoforms.fk_isoforms_genes2" ON "isoforms"("gene_short_name");
1429
1660
CREATE INDEX "isoformData.fk_isoformData_samples1" ON "isoformData"("sample_name");
1430
1661
CREATE INDEX "isoformData.fk_isoformData_isoforms1" ON "isoformData"("isoform_id");
1431
1662
CREATE INDEX "isoformExpDiffData.fk_isoformExpDiffData_isoforms1" ON "isoformExpDiffData"("isoform_id");
1433
1664
CREATE INDEX "isoformExpDiffData.fk_isoformExpDiffData_samples2" ON "isoformExpDiffData"("sample_2");
1434
1665
CREATE INDEX "isoformExpDiffData.isoformExpDiffData_sig_index" ON "isoformExpDiffData"("test_stat","p_value","q_value","significant");
1435
1666
CREATE INDEX "isoformFeatures.fk_isoformFeatures_isoforms1" ON "isoformFeatures"("isoform_id");
 
1667
CREATE INDEX "attributes.fk_attributes_feature_id" ON "attributes"("feature_id");
 
1668
CREATE INDEX "attributes.attributes_attribute_index" ON "attributes"("attribute");
 
1669
CREATE INDEX "attributes.attributes_value_index" ON "attributes"("value");
 
1670
CREATE INDEX "isoformCount.fk_isoformCount_isoforms1" ON "isoformCount"("isoform_id");
 
1671
CREATE INDEX "isoformCount.fk_isoformCount_samples1" ON "isoformCount"("sample_name");
 
1672
CREATE INDEX "isoformReplicateData.fk_isoformReplicateData_replicates1" ON "isoformReplicateData"("rep_name");
 
1673
CREATE INDEX "isoformReplicateData.fk_isoformReplicateData_isoforms1" ON "isoformReplicateData"("isoform_id");
 
1674
CREATE INDEX "isoformReplicateData.fk_isoformReplicateData_samples1" ON "isoformReplicateData"("sample_name");
 
1675
CREATE INDEX "features.features_seqnames_index" ON "features"("seqnames");
 
1676
CREATE INDEX "features.features_type_index" ON "features"("type");
 
1677
CREATE INDEX "features.features_strand_index" ON "features"("strand");
 
1678
CREATE INDEX "features.features_start_end_index" ON "features"("start","end");
 
1679
CREATE INDEX "features.fk_features_genes1" ON "features"("gene_id");
 
1680
CREATE INDEX "features.fk_features_isoforms1" ON "features"("isoform_id");
1436
1681
'
 
1682
 
1437
1683
        create.sql <- strsplit(index.text,"\n")[[1]]
1438
1684
        
1439
1685
        tmp <- sapply(create.sql,function(x){
1470
1716
#############
1471
1717
#readCufflinks
1472
1718
#############
1473
 
#TODO: Add directory pointer
 
1719
#TODO: Add count and replicate files
1474
1720
readCufflinks<-function(dir = getwd(),
1475
1721
                                                dbFile="cuffData.db",
 
1722
                                                gtfFile=NULL,
 
1723
                                                runInfoFile="run.info",
 
1724
                                                repTableFile="read_groups.info",
1476
1725
                                                geneFPKM="genes.fpkm_tracking",
1477
1726
                                                geneDiff="gene_exp.diff",
 
1727
                                                geneCount="genes.count_tracking",
 
1728
                                                geneRep="genes.read_group_tracking",
1478
1729
                                                isoformFPKM="isoforms.fpkm_tracking",
1479
1730
                                                isoformDiff="isoform_exp.diff",
 
1731
                                                isoformCount="isoforms.count_tracking",
 
1732
                                                isoformRep="isoforms.read_group_tracking",
1480
1733
                                                TSSFPKM="tss_groups.fpkm_tracking",
1481
1734
                                                TSSDiff="tss_group_exp.diff",
 
1735
                                                TSSCount="tss_groups.count_tracking",
 
1736
                                                TSSRep="tss_groups.read_group_tracking",
1482
1737
                                                CDSFPKM="cds.fpkm_tracking",
1483
1738
                                                CDSExpDiff="cds_exp.diff",
 
1739
                                                CDSCount="cds.count_tracking",
 
1740
                                                CDSRep="cds.read_group_tracking",
1484
1741
                                                CDSDiff="cds.diff",
1485
1742
                                                promoterFile="promoters.diff",
1486
1743
                                                splicingFile="splicing.diff",
1487
1744
                                                driver = "SQLite",
 
1745
                                                genome = NULL,
1488
1746
                                                rebuild = FALSE,
 
1747
                                                verbose = FALSE,
1489
1748
                                                ...){
1490
1749
        
1491
1750
        #Set file locations with directory
1492
1751
        dbFile=file.path(dir,dbFile)
 
1752
        runInfoFile=file.path(dir,runInfoFile)
 
1753
        repTableFile=file.path(dir,repTableFile)
1493
1754
        geneFPKM=file.path(dir,geneFPKM)
1494
1755
        geneDiff=file.path(dir,geneDiff)
 
1756
        geneCount=file.path(dir,geneCount)
 
1757
        geneRep=file.path(dir,geneRep)
1495
1758
        isoformFPKM=file.path(dir,isoformFPKM)
1496
1759
        isoformDiff=file.path(dir,isoformDiff)
 
1760
        isoformCount=file.path(dir,isoformCount)
 
1761
        isoformRep=file.path(dir,isoformRep)
1497
1762
        TSSFPKM=file.path(dir,TSSFPKM)
1498
1763
        TSSDiff=file.path(dir,TSSDiff)
 
1764
        TSSCount=file.path(dir,TSSCount)
 
1765
        TSSRep=file.path(dir,TSSRep)
1499
1766
        CDSFPKM=file.path(dir,CDSFPKM)
1500
1767
        CDSExpDiff=file.path(dir,CDSExpDiff)
 
1768
        CDSCount=file.path(dir,CDSCount)
 
1769
        CDSRep=file.path(dir,CDSRep)
1501
1770
        CDSDiff=file.path(dir,CDSDiff)
1502
1771
        promoterFile=file.path(dir,promoterFile)
1503
1772
        splicingFile=file.path(dir,splicingFile)
1510
1779
                dbConn<-createDB_noIndex(dbFile)
1511
1780
                
1512
1781
                #populate DB
1513
 
                                
1514
 
                loadGenes(geneFPKM,geneDiff,promoterFile,dbConn)
1515
 
                loadIsoforms(isoformFPKM,isoformDiff,dbConn)
1516
 
                loadTSS(TSSFPKM,TSSDiff,splicingFile,dbConn)
1517
 
                loadCDS(CDSFPKM,CDSExpDiff,CDSDiff,dbConn)
 
1782
                if(file.exists(runInfoFile)){
 
1783
                        loadRunInfo(runInfoFile,dbConn)
 
1784
                }
 
1785
                
 
1786
                if(file.exists(repTableFile)){
 
1787
                        loadRepTable(repTableFile,dbConn)
 
1788
                }
 
1789
                if(!is.null(gtfFile)){
 
1790
                        if(!is.null(genome)){
 
1791
                                .loadGTF(gtfFile,genome,dbConn)
 
1792
                        }else{
 
1793
                                stop("'genome' cannot be NULL if you are supplying a .gtf file!")       
 
1794
                        }
 
1795
                }
 
1796
                
 
1797
                loadGenes(geneFPKM,geneDiff,promoterFile,countFile=geneCount,replicateFile=geneRep,dbConn)
 
1798
                loadIsoforms(isoformFPKM,isoformDiff,isoformCount,isoformRep,dbConn)
 
1799
                loadTSS(TSSFPKM,TSSDiff,splicingFile,TSSCount,TSSRep,dbConn)
 
1800
                loadCDS(CDSFPKM,CDSExpDiff,CDSDiff,CDSCount,CDSRep,dbConn)
1518
1801
                
1519
1802
                #Create Indexes on DB
1520
1803
                write("Indexing Tables...",stderr())
1521
 
                createIndices(dbFile)
 
1804
                createIndices(dbFile,verbose=verbose)
1522
1805
                
1523
1806
                #load Distribution Tests
1524
1807
                #loadDistTests(promoterFile,splicingFile,CDSDiff)
1527
1810
        dbConn<-dbConnect(dbDriver(driver),dbFile)
1528
1811
        return (
1529
1812
                        new("CuffSet",DB = dbConn,
1530
 
                                        genes = new("CuffData",DB = dbConn, tables = list(mainTable = "genes",dataTable = "geneData",expDiffTable = "geneExpDiffData",featureTable = "geneFeatures"), filters = list(),type = "genes",idField = "gene_id"),
1531
 
                                        isoforms = new("CuffData", DB = dbConn, tables = list(mainTable = "isoforms",dataTable = "isoformData",expDiffTable = "isoformExpDiffData",featureTable = "isoformFeatures"), filters = list(),type="isoforms",idField = "isoform_id"),
1532
 
                                        TSS = new("CuffData", DB = dbConn, tables = list(mainTable = "TSS",dataTable = "TSSData",expDiffTable = "TSSExpDiffData",featureTable = "TSSFeatures"), filters = list(),type = "TSS",idField = "TSS_group_id"),
1533
 
                                        CDS = new("CuffData", DB = dbConn, tables = list(mainTable = "CDS",dataTable = "CDSData",expDiffTable = "CDSExpDiffData",featureTable = "CDSFeatures"), filters = list(),type = "CDS",idField = "CDS_id"),
 
1813
                                        #TODO: need to add replicate and count tables here and in AllClasses.R
 
1814
                                        
 
1815
                                        genes = new("CuffData",DB = dbConn, tables = list(mainTable = "genes",dataTable = "geneData",expDiffTable = "geneExpDiffData",featureTable = "geneFeatures",countTable="geneCount",replicateTable="geneReplicateData"), filters = list(),type = "genes",idField = "gene_id"),
 
1816
                                        isoforms = new("CuffData", DB = dbConn, tables = list(mainTable = "isoforms",dataTable = "isoformData",expDiffTable = "isoformExpDiffData",featureTable = "isoformFeatures",countTable="isoformCount",replicateTable="isoformReplicateData"), filters = list(),type="isoforms",idField = "isoform_id"),
 
1817
                                        TSS = new("CuffData", DB = dbConn, tables = list(mainTable = "TSS",dataTable = "TSSData",expDiffTable = "TSSExpDiffData",featureTable = "TSSFeatures",countTable="TSSCount",replicateTable="TSSReplicateData"), filters = list(),type = "TSS",idField = "TSS_group_id"),
 
1818
                                        CDS = new("CuffData", DB = dbConn, tables = list(mainTable = "CDS",dataTable = "CDSData",expDiffTable = "CDSExpDiffData",featureTable = "CDSFeatures",countTable="CDSCount",replicateTable="CDSReplicateData"), filters = list(),type = "CDS",idField = "CDS_id"),
1534
1819
                                        promoters = new("CuffDist", DB = dbConn, table = "promoterDiffData",type="promoter",idField="gene_id"),
1535
1820
                                        splicing = new("CuffDist", DB = dbConn, table = "splicingDiffData",type="splicing",idField="TSS_group_id"),
1536
1821
                                        relCDS = new("CuffDist", DB = dbConn, table = "CDSDiffData",type="relCDS",idField="gene_id")
1542
1827
############
1543
1828
# Handle GTF file
1544
1829
############
1545
 
loadGTF<-function(gtfFile,dbConn) {
1546
 
        
 
1830
#loadGTF<-function(gtfFile,dbConn) {
 
1831
#       
 
1832
#       #Error Trapping
 
1833
#       if (missing(gtfFile))
 
1834
#               stop("GTF file cannot be missing!")
 
1835
#       
 
1836
#       if (missing(dbConn))
 
1837
#               stop("Must provide a dbConn connection")
 
1838
#       
 
1839
#       write("Reading GTF file")
 
1840
#       gtf<-read.table(gtfFile,sep="\t",header=F)
 
1841
#       
 
1842
#       write("Melting attributes")
 
1843
#       attributes<-melt(strsplit(as.character(gtf$V9),"; "))
 
1844
#       colnames(attributes)<-c("attribute","featureID")
 
1845
#       attributes<-paste(attributes$attribute,attributes$featureID)
 
1846
#       attributes<-strsplit(as.character(attributes)," ")
 
1847
#       attributes<-as.data.frame(do.call("rbind",attributes))
 
1848
#       
 
1849
#       colnames(attributes)<-c("attribute","value","featureID")
 
1850
#       attributes<-attributes[,c(3,1,2)]
 
1851
#       
 
1852
#       #Grab only gene_ID and transcript_ID to add to features table
 
1853
#       id.attributes<-attributes[attributes$attribute %in% c("gene_id","transcript_id"),]
 
1854
#       id.attributes$featureID<-as.numeric(as.character(id.attributes$featureID))
 
1855
#       id.attributes<-dcast(id.attributes,...~attribute)
 
1856
#       
 
1857
#       #Main features table
 
1858
#       features<-gtf[,c(1:8)]
 
1859
#       colnames(features)<-c("seqname","source","type","start","end","score","strand","frame")
 
1860
#       features$featureID<-as.numeric(as.character(rownames(features)))
 
1861
#       
 
1862
#       #Merge features and id.attributes
 
1863
#       features<-merge(features,id.attributes,by.x='featureID',by.y='featureID')
 
1864
#       features<-features[,c(1,10:11,2:9)]
 
1865
#       
 
1866
#       #strip gene_id and transcript_id from attributes
 
1867
#       attributes<-attributes[!(attributes$attribute %in% c("gene_id","transcript_id")),]
 
1868
#       
 
1869
#       #Write features table
 
1870
#       write("Writing features table",stderr())
 
1871
#       #dbWriteTable(dbConn,'geneData',as.data.frame(genemelt[,c(1:2,5,3,4,6)]),row.names=F,append=T)
 
1872
#       dbWriteTable(dbConn,'features',as.data.frame(features),append=T)
 
1873
#       
 
1874
#       #Write features attribtues table
 
1875
#       #write("Writing feature attributes table",stderr())
 
1876
#       dbWriteTable(dbConn,'attributes',as.data.frame(attributes),append=T)
 
1877
#       
 
1878
#}
 
1879
 
 
1880
.loadGTF<-function(gtfFile,genomebuild,dbConn){
1547
1881
        #Error Trapping
1548
1882
        if (missing(gtfFile))
1549
1883
                stop("GTF file cannot be missing!")
1551
1885
        if (missing(dbConn))
1552
1886
                stop("Must provide a dbConn connection")
1553
1887
        
1554
 
        gtf<-read.table(gtfFile,sep="\t",header=F)
1555
 
        
1556
 
        
1557
 
        attributes<-melt(strsplit(as.character(gtf$V9),"; "))
1558
 
        colnames(attributes)<-c("attribute","featureID")
1559
 
        attributes<-paste(attributes$attribute,attributes$featureID)
1560
 
        attributes<-strsplit(as.character(attributes)," ")
1561
 
        attributes<-as.data.frame(do.call("rbind",attributes))
1562
 
        
1563
 
        colnames(attributes)<-c("attribute","value","featureID")
1564
 
        attributes<-attributes[,c(3,1,2)]
1565
 
        
1566
 
        #Grab only gene_ID and transcript_ID to add to features table
1567
 
        id.attributes<-attributes[attributes$attribute %in% c("gene_id","transcript_id"),]
1568
 
        id.attributes$featureID<-as.numeric(as.character(id.attributes$featureID))
1569
 
        id.attributes<-dcast(id.attributes,...~attribute)
1570
 
        
1571
 
        #Main features table
1572
 
        features<-gtf[,c(1:8)]
1573
 
        colnames(features)<-c("seqname","source","type","start","end","score","strand","frame")
1574
 
        features$featureID<-as.numeric(as.character(rownames(features)))
1575
 
        
1576
 
        #Merge features and id.attributes
1577
 
        features<-merge(features,id.attributes,by.x='featureID',by.y='featureID')
1578
 
        features<-features[,c(1,10:11,2:9)]
1579
 
        
1580
 
        #strip gene_id and transcript_id from attributes
1581
 
        attributes<-attributes[!(attributes$attribute %in% c("gene_id","transcript_id")),]
1582
 
        
1583
 
        #Write features table
1584
 
        write("Writing features table",stderr())
1585
 
        #dbWriteTable(dbConn,'geneData',as.data.frame(genemelt[,c(1:2,5,3,4,6)]),row.names=F,append=T)
1586
 
        dbWriteTable(dbConn,'features',as.data.frame(features),append=F)
1587
 
        
1588
 
        #Write features table
1589
 
        write("Writing feature attributes table",stderr())
1590
 
        dbWriteTable(dbConn,'attributes',as.data.frame(attributes),append=F)
1591
 
        
1592
 
}
1593
 
        
 
1888
        write("Reading GTF file",stderr())
 
1889
        gr<-import(gtfFile,asRangedData=FALSE)
 
1890
        gr<-as(gr,"data.frame")
 
1891
        #gr$genome<-genomebuild
 
1892
        colnames(gr)[grepl('^transcript_id$',colnames(gr))]<-'isoform_id'
 
1893
        colnames(gr)[grepl('^tss_id$',colnames(gr))]<-'TSS_group_id'
 
1894
        colnames(gr)[grepl('^p_id$',colnames(gr))]<-'CDS_id'
 
1895
        write("Writing GTF features to 'features' table...",stderr())
 
1896
        #dbSendQuery(dbConn,"DROP TABLE IF EXISTS 'features'")
 
1897
        #dbBeginTransaction(dbConn)
 
1898
        dbWriteTable(dbConn,'features',gr,row.names=F,overwrite=T)
 
1899
        #record Genome build
 
1900
        .recordGenome(genomebuild,dbConn)
 
1901
        #dbCommit(dbConn)
 
1902
        return()
 
1903
}
 
1904
 
 
1905
.recordGenome<-function(genome,dbConn){
 
1906
        genomeInsertQuery<-paste("INSERT INTO runInfo VALUES('genome', '",genome,"')",sep="")
 
1907
        #print(genomeInsertQuery)
 
1908
        dbSendQuery(dbConn,genomeInsertQuery)
 
1909
}
 
1910
 
 
1911
#library(Gviz)
 
1912
#myGeneId<-'XLOC_000071'
 
1913
#geneQuery<-paste("SELECT start,end,source AS feature,gene_id as gene,exon_number AS exon,transcript_id as transcript,gene_name as symbol, exon_number as rank, strand FROM features WHERE gene_id ='",myGeneId,"'",sep="")
 
1914
#geneFeatures<-dbGetQuery(cuff@DB,geneQuery)
 
1915
#geneFeatures$symbol[is.na(geneFeatures$symbol)]<-"NA"
 
1916
#grtrack<-GeneRegionTrack(geneFeatures,genome="hg19",chromosome="chr1",name="CuffDiff",showId=T,stacking="pack")
 
1917
#biomTrack<-BiomartGeneRegionTrack(genome="hg19",chromosome="chr1",start=min(start(grtrack)),end=max(end(grtrack)),name="ENSEMBL",showId=T,stacking="pack")
 
1918
#ideoTrack <- IdeogramTrack(genome = "hg19", chromosome = "chr1")
 
1919
#axTrack <- GenomeAxisTrack()
 
1920
#conservation <- UcscTrack(genome = "hg19", chromosome = "chr1",
 
1921
#               track = "Conservation", table = "phyloP46wayAll",
 
1922
#               from = min(start(grtrack)), to = max(end(grtrack)), trackType = "DataTrack",
 
1923
#               start = "start", end = "end", data = "score",
 
1924
#               type = "hist", window = "auto", col.histogram = "darkblue",
 
1925
#               fill.histogram = "darkblue", ylim = c(-3.7, 4),
 
1926
#               name = "Conservation")
 
1927
#
 
1928
#
 
1929
#plotTracks(list(ideoTrack,axTrack,grtrack,biomTrack,conservation),from=min(start(grtrack))-1000,to=max(end(grtrack))+1000)
 
1930
#plotTracks(list(axTrack,grtrack),from=min(start(grtrack))-1000,to=max(end(grtrack))+1000)
 
1931
 
 
1932
 
1594
1933
 
1595
1934
#######
1596
1935
#Unit Test