7
7
#####################
8
8
#File Archetype parsing
9
9
#####################
12
loadRunInfo<-function(runInfoFile,
15
fileArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
20
stringsAsFactors=FALSE,
25
write(paste("Reading Run Info File ",runInfoFile,sep=""),stderr())
26
fileArgs$file = runInfoFile
29
runInfo = as.data.frame(do.call(read.table,fileArgs))
34
#Load into database (runInfo table)
35
write("Writing runInfo Table",stderr())
36
insert_SQL<-'INSERT INTO runInfo VALUES(:param, :value)'
37
bulk_insert(dbConn,insert_SQL,runInfo)
42
loadRepTable<-function(repTableFile,
45
fileArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
50
stringsAsFactors=FALSE,
55
write(paste("Reading Read Group Info ",repTableFile,sep=""),stderr())
56
fileArgs$file = repTableFile
59
full = as.data.frame(do.call(read.table,fileArgs))
63
full$condition<-make.db.names(dbConn,as.character(full$condition),unique=FALSE)
66
#For now, I need to concatenate condition and replicate number
67
full$rep_name<-paste(full$condition,full$replicate_num,sep="_")
69
#Load into database (replicates table)
70
write("Writing replicates Table",stderr())
71
insert_SQL<-'INSERT INTO replicates VALUES(:file, :condition, :replicate_num, :rep_name, :total_mass, :norm_mass, :internal_scale, :external_scale)'
72
bulk_insert(dbConn,insert_SQL,full)
11
76
loadGenes<-function(fpkmFile,
16
83
#Arguments to read.* methods
17
84
fpkmArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
18
85
diffArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
19
86
promoterArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
87
countArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
88
replicateArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
161
230
#Handle Feature Data (this will actually be done on CuffData objects instead...but I may include something here as well)
234
#Handle Counts .count_tracking
236
if(file.exists(countFile)){
241
write(paste("Reading ", countFile,sep=""),stderr())
242
countArgs$file = countFile
243
counts<-as.data.frame(do.call(read.table,countArgs))
245
if(dim(counts)[1]>0){
246
#Reshape geneCount table
247
write("Reshaping geneCount table",stderr())
248
countmelt<-melt(counts,id.vars=c("tracking_id"),measure.vars=-idCols)
249
colnames(countmelt)[colnames(countmelt)=='variable']<-'sample_name'
251
countmelt$measurement = ""
253
countmelt$measurement[grepl("_count$",countmelt$sample_name)] = "count"
254
countmelt$measurement[grepl("_count_variance$",countmelt$sample_name)] = "variance"
255
countmelt$measurement[grepl("_count_uncertainty_var$",countmelt$sample_name)] = "uncertainty"
256
countmelt$measurement[grepl("_count_dispersion_var$",countmelt$sample_name)] = "dispersion"
257
countmelt$measurement[grepl("_status$",countmelt$sample_name)] = "status"
259
countmelt$sample_name<-gsub("_count$","",countmelt$sample_name)
260
countmelt$sample_name<-gsub("_count_variance$","",countmelt$sample_name)
261
countmelt$sample_name<-gsub("_count_uncertainty_var$","",countmelt$sample_name)
262
countmelt$sample_name<-gsub("_count_dispersion_var$","",countmelt$sample_name)
263
countmelt$sample_name<-gsub("_status$","",countmelt$sample_name)
265
#Adjust sample names with make.db.names
266
countmelt$sample_name <- make.db.names(dbConn,as.vector(countmelt$sample_name),unique=FALSE)
269
write("Recasting",stderr())
270
countmelt<-as.data.frame(dcast(countmelt,...~measurement))
273
#write(colnames(countmelt),stderr())
276
#Write geneCount table
277
write("Writing geneCount table",stderr())
278
insert_SQL<-'INSERT INTO geneCount VALUES(:tracking_id,:sample_name,:count,:variance,:uncertainty,:dispersion,:status)'
279
bulk_insert(dbConn,insert_SQL,countmelt)
281
write(paste("No records found in", countFile),stderr())
288
#Handle Replicates .rep_tracking
290
if(file.exists(replicateFile)){
294
write(paste("Reading read group info in ", replicateFile,sep=""),stderr())
295
replicateArgs$file = replicateFile
296
reps<-as.data.frame(do.call(read.table,replicateArgs))
301
#Adjust sample names with make.db.names
302
reps$condition <- make.db.names(dbConn,as.character(reps$condition),unique=FALSE)
304
#Create unique rep name
305
reps$rep_name<-paste(reps$condition,reps$replicate,sep="_")
306
colnames(reps)[colnames(reps)=="condition"]<-"sample_name"
308
#Write geneReplicateData table
309
write("Writing geneReplicateData table",stderr())
310
insert_SQL<-'INSERT INTO geneReplicateData VALUES(:tracking_id,:sample_name,:replicate,:rep_name,:raw_frags,:internal_scaled_frags,:external_scaled_frags,:FPKM,:effective_length,:status)'
311
bulk_insert(dbConn,insert_SQL,reps)
313
write(paste("No records found in", replicateFile),stderr())
167
321
loadIsoforms<-function(fpkmFile,
171
327
#Arguments to read.* methods
172
328
fpkmArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
173
329
diffArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
330
countArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
331
replicateArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
450
#Handle Counts .count_tracking
452
if(file.exists(countFile)){
457
write(paste("Reading ", countFile,sep=""),stderr())
458
countArgs$file = countFile
459
counts<-as.data.frame(do.call(read.table,countArgs))
461
if(dim(counts)[1]>0){
463
#Reshape isoformCount table
464
write("Reshaping isoformCount table",stderr())
465
countmelt<-melt(counts,id.vars=c("tracking_id"),measure.vars=-idCols)
466
colnames(countmelt)[colnames(countmelt)=='variable']<-'sample_name'
468
countmelt$measurement = ""
470
countmelt$measurement[grepl("_count$",countmelt$sample_name)] = "count"
471
countmelt$measurement[grepl("_count_variance$",countmelt$sample_name)] = "variance"
472
countmelt$measurement[grepl("_count_uncertainty_var$",countmelt$sample_name)] = "uncertainty"
473
countmelt$measurement[grepl("_count_dispersion_var$",countmelt$sample_name)] = "dispersion"
474
countmelt$measurement[grepl("_status$",countmelt$sample_name)] = "status"
476
countmelt$sample_name<-gsub("_count$","",countmelt$sample_name)
477
countmelt$sample_name<-gsub("_count_variance$","",countmelt$sample_name)
478
countmelt$sample_name<-gsub("_count_uncertainty_var$","",countmelt$sample_name)
479
countmelt$sample_name<-gsub("_count_dispersion_var$","",countmelt$sample_name)
480
countmelt$sample_name<-gsub("_status$","",countmelt$sample_name)
482
#Adjust sample names with make.db.names
483
countmelt$sample_name <- make.db.names(dbConn,as.vector(countmelt$sample_name),unique=FALSE)
487
write("Recasting",stderr())
488
countmelt<-as.data.frame(dcast(countmelt,...~measurement))
491
#write(colnames(countmelt),stderr())
494
#Write isoformCount table
495
write("Writing isoformCount table",stderr())
496
insert_SQL<-'INSERT INTO isoformCount VALUES(:tracking_id,:sample_name,:count,:variance,:uncertainty,:dispersion,:status)'
497
bulk_insert(dbConn,insert_SQL,countmelt)
499
write(paste("No records found in",countFile),stderr())
505
#Handle Replicates .rep_tracking
507
if(file.exists(replicateFile)){
511
write(paste("Reading read group info in ", replicateFile,sep=""),stderr())
512
replicateArgs$file = replicateFile
513
reps<-as.data.frame(do.call(read.table,replicateArgs))
518
#Adjust sample names with make.db.names
519
reps$condition <- make.db.names(dbConn,as.character(reps$condition),unique=FALSE)
521
#Create unique rep name
522
reps$rep_name<-paste(reps$condition,reps$replicate,sep="_")
523
colnames(reps)[colnames(reps)=="condition"]<-"sample_name"
525
#Write isoformReplicateData table
526
write("Writing isoformReplicateData table",stderr())
527
insert_SQL<-'INSERT INTO isoformReplicateData VALUES(:tracking_id,:sample_name,:replicate,:rep_name,:raw_frags,:internal_scaled_frags,:external_scaled_frags,:FPKM,:effective_length,:status)'
528
bulk_insert(dbConn,insert_SQL,reps)
530
write(paste("No records found in",replicateFile),stderr())
293
537
loadTSS<-function(fpkmFile,
298
544
#Arguments to read.* methods
299
545
fpkmArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
300
546
diffArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
301
547
splicingArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
548
countArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
549
replicateArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
687
#Handle Counts .count_tracking
689
if(file.exists(countFile)){
694
write(paste("Reading ", countFile,sep=""),stderr())
695
countArgs$file = countFile
696
counts<-as.data.frame(do.call(read.table,countArgs))
698
if(dim(counts)[1]>0){
700
#Reshape TSSCount table
701
write("Reshaping TSSCount table",stderr())
702
countmelt<-melt(counts,id.vars=c("tracking_id"),measure.vars=-idCols)
703
colnames(countmelt)[colnames(countmelt)=='variable']<-'sample_name'
705
countmelt$measurement = ""
707
countmelt$measurement[grepl("_count$",countmelt$sample_name)] = "count"
708
countmelt$measurement[grepl("_count_variance$",countmelt$sample_name)] = "variance"
709
countmelt$measurement[grepl("_count_uncertainty_var$",countmelt$sample_name)] = "uncertainty"
710
countmelt$measurement[grepl("_count_dispersion_var$",countmelt$sample_name)] = "dispersion"
711
countmelt$measurement[grepl("_status$",countmelt$sample_name)] = "status"
713
countmelt$sample_name<-gsub("_count$","",countmelt$sample_name)
714
countmelt$sample_name<-gsub("_count_variance$","",countmelt$sample_name)
715
countmelt$sample_name<-gsub("_count_uncertainty_var$","",countmelt$sample_name)
716
countmelt$sample_name<-gsub("_count_dispersion_var$","",countmelt$sample_name)
717
countmelt$sample_name<-gsub("_status$","",countmelt$sample_name)
719
#Adjust sample names with make.db.names
720
countmelt$sample_name <- make.db.names(dbConn,as.vector(countmelt$sample_name),unique=FALSE)
724
write("Recasting",stderr())
725
countmelt<-as.data.frame(dcast(countmelt,...~measurement))
728
#write(colnames(countmelt),stderr())
731
#Write TSSCount table
732
write("Writing TSSCount table",stderr())
733
insert_SQL<-'INSERT INTO TSSCount VALUES(:tracking_id,:sample_name,:count,:variance,:uncertainty,:dispersion,:status)'
734
bulk_insert(dbConn,insert_SQL,countmelt)
736
write(paste("No records found in",countFile),stderr())
742
#Handle Replicates .rep_tracking
744
if(file.exists(replicateFile)){
748
write(paste("Reading read group info in ", replicateFile,sep=""),stderr())
749
replicateArgs$file = replicateFile
750
reps<-as.data.frame(do.call(read.table,replicateArgs))
755
#Adjust sample names with make.db.names
756
reps$condition <- make.db.names(dbConn,as.character(reps$condition),unique=FALSE)
758
#Create unique rep name
759
reps$rep_name<-paste(reps$condition,reps$replicate,sep="_")
760
colnames(reps)[colnames(reps)=="condition"]<-"sample_name"
762
#Write TSSReplicateData table
763
write("Writing TSSReplicateData table",stderr())
764
insert_SQL<-'INSERT INTO TSSReplicateData VALUES(:tracking_id,:sample_name,:replicate,:rep_name,:raw_frags,:internal_scaled_frags,:external_scaled_frags,:FPKM,:effective_length,:status)'
765
bulk_insert(dbConn,insert_SQL,reps)
767
write(paste("No records found in",replicateFile),stderr())
441
775
loadCDS<-function(fpkmFile,
446
782
#Arguments to read.* methods
447
783
fpkmArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
448
784
diffArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
449
785
CDSDiffArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
786
countArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
787
replicateArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
927
#Handle Counts .count_tracking
929
if(file.exists(countFile)){
934
write(paste("Reading ", countFile,sep=""),stderr())
935
countArgs$file = countFile
936
counts<-as.data.frame(do.call(read.table,countArgs))
938
if(dim(counts)[1]>0){
940
#Reshape CDSCount table
941
write("Reshaping CDSCount table",stderr())
942
countmelt<-melt(counts,id.vars=c("tracking_id"),measure.vars=-idCols)
943
colnames(countmelt)[colnames(countmelt)=='variable']<-'sample_name'
945
countmelt$measurement = ""
947
countmelt$measurement[grepl("_count$",countmelt$sample_name)] = "count"
948
countmelt$measurement[grepl("_count_variance$",countmelt$sample_name)] = "variance"
949
countmelt$measurement[grepl("_count_uncertainty_var$",countmelt$sample_name)] = "uncertainty"
950
countmelt$measurement[grepl("_count_dispersion_var$",countmelt$sample_name)] = "dispersion"
951
countmelt$measurement[grepl("_status$",countmelt$sample_name)] = "status"
953
countmelt$sample_name<-gsub("_count$","",countmelt$sample_name)
954
countmelt$sample_name<-gsub("_count_variance$","",countmelt$sample_name)
955
countmelt$sample_name<-gsub("_count_uncertainty_var$","",countmelt$sample_name)
956
countmelt$sample_name<-gsub("_count_dispersion_var$","",countmelt$sample_name)
957
countmelt$sample_name<-gsub("_status$","",countmelt$sample_name)
959
#Adjust sample names with make.db.names
960
countmelt$sample_name <- make.db.names(dbConn,as.vector(countmelt$sample_name),unique=FALSE)
964
write("Recasting",stderr())
965
countmelt<-as.data.frame(dcast(countmelt,...~measurement))
968
#write(colnames(countmelt),stderr())
971
#Write CDSCount table
972
write("Writing CDSCount table",stderr())
973
insert_SQL<-'INSERT INTO CDSCount VALUES(:tracking_id,:sample_name,:count,:variance,:uncertainty,:dispersion,:status)'
974
bulk_insert(dbConn,insert_SQL,countmelt)
976
write(paste("No records found in",countFile),stderr())
982
#Handle Replicates .rep_tracking
984
if(file.exists(replicateFile)){
988
write(paste("Reading read group info in ", replicateFile,sep=""),stderr())
989
replicateArgs$file = replicateFile
990
reps<-as.data.frame(do.call(read.table,replicateArgs))
995
#Adjust sample names with make.db.names
996
reps$condition <- make.db.names(dbConn,as.character(reps$condition),unique=FALSE)
998
#Create unique rep name
999
reps$rep_name<-paste(reps$condition,reps$replicate,sep="_")
1000
colnames(reps)[colnames(reps)=="condition"]<-"sample_name"
1002
#Write CDSReplicateData table
1003
write("Writing CDSReplicateData table",stderr())
1004
insert_SQL<-'INSERT INTO CDSReplicateData VALUES(:tracking_id,:sample_name,:replicate,:rep_name,:raw_frags,:internal_scaled_frags,:external_scaled_frags,:FPKM,:effective_length,:status)'
1005
bulk_insert(dbConn,insert_SQL,reps)
1007
write(paste("No records found in",replicateFile),stderr())
590
1014
########################
596
1020
#Database Setup Functions
597
1021
#####################
599
createDB<-function(dbFname="cuffData.db",driver="SQLite") {
600
#Builds sqlite db at 'dbFname' and returns a dbConnect object pointing to newly created database.
601
#May be deprecated if I can load first and index later...
603
drv<-dbDriver(driver)
604
db <- dbConnect(drv,dbname=dbFname)
607
PRAGMA foreign_keys = OFF;
608
PRAGMA synchronous = OFF;
609
PRAGMA journal_mode = MEMORY;
611
DROP TABLE IF EXISTS "genes";
612
CREATE TABLE "genes"(
613
"gene_id" VARCHAR(45) PRIMARY KEY NOT NULL,
614
"class_code" VARCHAR(45),
615
"nearest_ref_id" VARCHAR(45),
616
"gene_short_name" VARCHAR(45),
621
DROP TABLE IF EXISTS "biasData";
622
CREATE TABLE "biasData"(
623
"biasData_id" INTEGER PRIMARY KEY NOT NULL
625
DROP TABLE IF EXISTS "samples";
626
CREATE TABLE "samples"(
627
"sample_index" INTEGER PRIMARY KEY NOT NULL,
628
"sample_name" VARCHAR(45) NOT NULL
630
DROP TABLE IF EXISTS "TSS";
632
"TSS_group_id" VARCHAR(45) PRIMARY KEY NOT NULL,
633
"class_code" VARCHAR(45),
634
"nearest_ref_id" VARCHAR(45),
635
"gene_id" VARCHAR(45) NOT NULL,
636
"gene_short_name" VARCHAR(45),
640
CONSTRAINT "fk_TSS_genes1"
641
FOREIGN KEY("gene_id")
642
REFERENCES "genes"("gene_id")
644
CREATE INDEX "TSS.fk_TSS_genes1" ON "TSS"("gene_id");
645
CREATE INDEX "TSS.fk_TSS_genes2" ON "TSS"("gene_short_name");
646
DROP TABLE IF EXISTS "TSSData";
647
CREATE TABLE "TSSData"(
648
"TSS_group_id" VARCHAR(45) NOT NULL,
649
"sample_name" VARCHAR(45) NOT NULL,
653
"quant_status" VARCHAR(45),
654
CONSTRAINT "fk_TSSData_TSS1"
655
FOREIGN KEY("TSS_group_id")
656
REFERENCES "TSS"("TSS_group_id"),
657
CONSTRAINT "fk_TSSData_samples1"
658
FOREIGN KEY("sample_name")
659
REFERENCES "samples"("sample_name")
661
CREATE INDEX "TSSData.fk_TSSData_TSS1" ON "TSSData"("TSS_group_id");
662
CREATE INDEX "TSSData.fk_TSSData_samples1" ON "TSSData"("sample_name");
663
DROP TABLE IF EXISTS "CDS";
665
"CDS_id" VARCHAR(45) PRIMARY KEY NOT NULL,
666
"class_code" VARCHAR(45),
667
"nearest_ref_id" VARCHAR(45),
668
"gene_id" VARCHAR(45),
669
"gene_short_name" VARCHAR(45),
670
"TSS_group_id" VARCHAR(45),
674
CONSTRAINT "fk_CDS_genes1"
675
FOREIGN KEY("gene_id")
676
REFERENCES "genes"("gene_id"),
677
CONSTRAINT "fk_CDS_TSS1"
678
FOREIGN KEY("TSS_group_id")
679
REFERENCES "TSS"("TSS_group_id")
681
CREATE INDEX "CDS.fk_CDS_genes1" ON "CDS"("gene_id");
682
CREATE INDEX "CDS.fk_CDS_genes2" ON "CDS"("gene_short_name");
683
CREATE INDEX "CDS.fk_CDS_TSS1" ON "CDS"("TSS_group_id");
684
DROP TABLE IF EXISTS "CDSData";
685
CREATE TABLE "CDSData"(
686
"CDS_id" VARCHAR(45) NOT NULL,
687
"sample_name" VARCHAR(45) NOT NULL,
691
"quant_status" VARCHAR(45),
692
CONSTRAINT "fk_CDSData_CDS1"
693
FOREIGN KEY("CDS_id")
694
REFERENCES "CDS"("CDS_id"),
695
CONSTRAINT "fk_CDSData_samples1"
696
FOREIGN KEY("sample_name")
697
REFERENCES "samples"("sample_name")
699
CREATE INDEX "CDSData.fk_CDSData_CDS1" ON "CDSData"("CDS_id");
700
CREATE INDEX "CDSData.fk_CDSData_samples1" ON "CDSData"("sample_name");
701
DROP TABLE IF EXISTS "splicingDiffData";
702
CREATE TABLE "splicingDiffData"(
703
"TSS_group_id" VARCHAR(45) NOT NULL,
704
"gene_id" VARCHAR(45) NOT NULL,
705
"sample_1" VARCHAR(45) NOT NULL,
706
"sample_2" VARCHAR(45) NOT NULL,
707
"status" VARCHAR(45),
714
"significant" VARCHAR(45),
715
CONSTRAINT "fk_splicingDiffData_samples1"
716
FOREIGN KEY("sample_1")
717
REFERENCES "samples"("sample_name"),
718
CONSTRAINT "fk_splicingDiffData_samples2"
719
FOREIGN KEY("sample_2")
720
REFERENCES "samples"("sample_name"),
721
CONSTRAINT "fk_splicingDiffData_TSS1"
722
FOREIGN KEY("TSS_group_id")
723
REFERENCES "TSS"("TSS_group_id"),
724
CONSTRAINT "fk_splicingDiffData_genes1"
725
FOREIGN KEY("gene_id")
726
REFERENCES "genes"("gene_id")
728
CREATE INDEX "splicingDiffData.fk_splicingDiffData_samples1" ON "splicingDiffData"("sample_1");
729
CREATE INDEX "splicingDiffData.fk_splicingDiffData_samples2" ON "splicingDiffData"("sample_2");
730
CREATE INDEX "splicingDiffData.fk_splicingDiffData_TSS1" ON "splicingDiffData"("TSS_group_id");
731
CREATE INDEX "splicingDiffData.fk_splicingDiffData_genes1" ON "splicingDiffData"("gene_id");
732
DROP TABLE IF EXISTS "TSSExpDiffData";
733
CREATE TABLE "TSSExpDiffData"(
734
"TSS_group_id" VARCHAR(45) NOT NULL,
735
"sample_1" VARCHAR(45) NOT NULL,
736
"sample_2" VARCHAR(45) NOT NULL,
737
"status" VARCHAR(45),
740
"log2_fold_change" FLOAT,
744
"significant" VARCHAR(45),
745
CONSTRAINT "fk_TSSExpDiffData_TSS1"
746
FOREIGN KEY("TSS_group_id")
747
REFERENCES "TSS"("TSS_group_id"),
748
CONSTRAINT "fk_TSSExpDiffData_samples1"
749
FOREIGN KEY("sample_1")
750
REFERENCES "samples"("sample_name"),
751
CONSTRAINT "fk_TSSExpDiffData_samples2"
752
FOREIGN KEY("sample_2")
753
REFERENCES "samples"("sample_name")
755
CREATE INDEX "TSSExpDiffData.fk_TSSExpDiffData_TSS1" ON "TSSExpDiffData"("TSS_group_id");
756
CREATE INDEX "TSSExpDiffData.fk_TSSExpDiffData_samples1" ON "TSSExpDiffData"("sample_1");
757
CREATE INDEX "TSSExpDiffData.fk_TSSExpDiffData_samples2" ON "TSSExpDiffData"("sample_2");
758
DROP TABLE IF EXISTS "CDSDiffData";
759
CREATE TABLE "CDSDiffData"(
760
"gene_id" VARCHAR(45) NOT NULL,
761
"sample_1" VARCHAR(45) NOT NULL,
762
"sample_2" VARCHAR(45) NOT NULL,
763
"status" VARCHAR(45),
770
"significant" VARCHAR(45),
771
CONSTRAINT "fk_CDSDiffData_samples1"
772
FOREIGN KEY("sample_1")
773
REFERENCES "samples"("sample_name"),
774
CONSTRAINT "fk_CDSDiffData_samples2"
775
FOREIGN KEY("sample_2")
776
REFERENCES "samples"("sample_name"),
777
CONSTRAINT "fk_CDSDiffData_genes1"
778
FOREIGN KEY("gene_id")
779
REFERENCES "genes"("gene_id")
781
CREATE INDEX "CDSDiffData.fk_CDSDiffData_samples1" ON "CDSDiffData"("sample_1");
782
CREATE INDEX "CDSDiffData.fk_CDSDiffData_samples2" ON "CDSDiffData"("sample_2");
783
CREATE INDEX "CDSDiffData.fk_CDSDiffData_genes1" ON "CDSDiffData"("gene_id");
784
DROP TABLE IF EXISTS "CDSExpDiffData";
785
CREATE TABLE "CDSExpDiffData"(
786
"CDS_id" VARCHAR(45) NOT NULL,
787
"sample_1" VARCHAR(45) NOT NULL,
788
"sample_2" VARCHAR(45) NOT NULL,
789
"status" VARCHAR(45),
792
"log2_fold_change" FLOAT,
796
"significant" VARCHAR(45),
797
CONSTRAINT "fk_CDSExpDiffData_CDS1"
798
FOREIGN KEY("CDS_id")
799
REFERENCES "CDS"("CDS_id"),
800
CONSTRAINT "fk_CDSExpDiffData_samples1"
801
FOREIGN KEY("sample_1")
802
REFERENCES "samples"("sample_name"),
803
CONSTRAINT "fk_CDSExpDiffData_samples2"
804
FOREIGN KEY("sample_2")
805
REFERENCES "samples"("sample_name")
807
CREATE INDEX "CDSExpDiffData.fk_CDSExpDiffData_CDS1" ON "CDSExpDiffData"("CDS_id");
808
CREATE INDEX "CDSExpDiffData.fk_CDSExpDiffData_samples1" ON "CDSExpDiffData"("sample_1");
809
CREATE INDEX "CDSExpDiffData.fk_CDSExpDiffData_samples2" ON "CDSExpDiffData"("sample_2");
810
DROP TABLE IF EXISTS "promoterDiffData";
811
CREATE TABLE "promoterDiffData"(
812
"gene_id" VARCHAR(45) NOT NULL,
813
"sample_1" VARCHAR(45) NOT NULL,
814
"sample_2" VARCHAR(45) NOT NULL,
815
"status" VARCHAR(45),
822
"significant" VARCHAR(45),
823
CONSTRAINT "fk_promoterDiffData_genes1"
824
FOREIGN KEY("gene_id")
825
REFERENCES "genes"("gene_id"),
826
CONSTRAINT "fk_promoterDiffData_samples1"
827
FOREIGN KEY("sample_1")
828
REFERENCES "samples"("sample_name"),
829
CONSTRAINT "fk_promoterDiffData_samples2"
830
FOREIGN KEY("sample_2")
831
REFERENCES "samples"("sample_name")
833
CREATE INDEX "promoterDiffData.fk_promoterDiffData_genes1" ON "promoterDiffData"("gene_id");
834
CREATE INDEX "promoterDiffData.fk_promoterDiffData_samples1" ON "promoterDiffData"("sample_1");
835
CREATE INDEX "promoterDiffData.fk_promoterDiffData_samples2" ON "promoterDiffData"("sample_2");
836
DROP TABLE IF EXISTS "geneFeatures";
837
CREATE TABLE "geneFeatures"(
838
"gene_id" VARCHAR(45) NOT NULL,
839
CONSTRAINT "fk_geneFeatures_genes1"
840
FOREIGN KEY("gene_id")
841
REFERENCES "genes"("gene_id")
843
CREATE INDEX "geneFeatures.fk_geneFeatures_genes1" ON "geneFeatures"("gene_id");
844
DROP TABLE IF EXISTS "TSSFeatures";
845
CREATE TABLE "TSSFeatures"(
846
"TSS_group_id" VARCHAR(45) NOT NULL,
847
CONSTRAINT "fk_TSSFeatures_TSS1"
848
FOREIGN KEY("TSS_group_id")
849
REFERENCES "TSS"("TSS_group_id")
851
CREATE INDEX "TSSFeatures.fk_TSSFeatures_TSS1" ON "TSSFeatures"("TSS_group_id");
852
DROP TABLE IF EXISTS "CDSFeatures";
853
CREATE TABLE "CDSFeatures"(
854
"CDS_id" VARCHAR(45) NOT NULL,
855
CONSTRAINT "fk_CDSFeatures_CDS1"
856
FOREIGN KEY("CDS_id")
857
REFERENCES "CDS"("CDS_id")
859
CREATE INDEX "CDSFeatures.fk_CDSFeatures_CDS1" ON "CDSFeatures"("CDS_id");
860
DROP TABLE IF EXISTS "geneData";
861
CREATE TABLE "geneData"(
862
"gene_id" VARCHAR(45) NOT NULL,
863
"sample_name" VARCHAR(45) NOT NULL,
867
"quant_status" VARCHAR(45),
868
CONSTRAINT "fk_geneData_genes1"
869
FOREIGN KEY("gene_id")
870
REFERENCES "genes"("gene_id"),
871
CONSTRAINT "fk_geneData_samples1"
872
FOREIGN KEY("sample_name")
873
REFERENCES "samples"("sample_name")
875
CREATE INDEX "geneData.fk_geneData_genes1" ON "geneData"("gene_id");
876
CREATE INDEX "geneData.fk_geneData_samples1" ON "geneData"("sample_name");
877
DROP TABLE IF EXISTS "phenoData";
878
CREATE TABLE "phenoData"(
879
"sample_name" VARCHAR(45) NOT NULL,
880
"parameter" VARCHAR(45) NOT NULL,
882
CONSTRAINT "fk_phenoData_samples"
883
FOREIGN KEY("sample_name")
884
REFERENCES "samples"("sample_name")
886
CREATE INDEX "phenoData.fk_phenoData_samples" ON "phenoData"("sample_name");
887
DROP TABLE IF EXISTS "geneExpDiffData";
888
CREATE TABLE "geneExpDiffData"(
889
"gene_id" VARCHAR(45) NOT NULL,
890
"sample_1" VARCHAR(45) NOT NULL,
891
"sample_2" VARCHAR(45) NOT NULL,
892
"status" VARCHAR(45),
895
"log2_fold_change" FLOAT,
899
"significant" VARCHAR(45),
900
CONSTRAINT "fk_geneExpDiffData_genes1"
901
FOREIGN KEY("gene_id")
902
REFERENCES "genes"("gene_id"),
903
CONSTRAINT "fk_geneExpDiffData_samples1"
904
FOREIGN KEY("sample_1")
905
REFERENCES "samples"("sample_name"),
906
CONSTRAINT "fk_geneExpDiffData_samples2"
907
FOREIGN KEY("sample_2")
908
REFERENCES "samples"("sample_name")
910
CREATE INDEX "geneExpDiffData.fk_geneExpDiffData_genes1" ON "geneExpDiffData"("gene_id");
911
CREATE INDEX "geneExpDiffData.fk_geneExpDiffData_samples1" ON "geneExpDiffData"("sample_1");
912
CREATE INDEX "geneExpDiffData.fk_geneExpDiffData_samples2" ON "geneExpDiffData"("sample_2");
913
DROP TABLE IF EXISTS "isoforms";
914
CREATE TABLE "isoforms"(
915
"isoform_id" VARCHAR(45) PRIMARY KEY NOT NULL,
916
"gene_id" VARCHAR(45),
917
"CDS_id" VARCHAR(45),
918
"gene_short_name" VARCHAR(45),
919
"TSS_group_id" VARCHAR(45),
920
"class_code" VARCHAR(45),
921
"nearest_ref_id" VARCHAR(45),
925
CONSTRAINT "fk_isoforms_TSS1"
926
FOREIGN KEY("TSS_group_id")
927
REFERENCES "TSS"("TSS_group_id"),
928
CONSTRAINT "fk_isoforms_CDS1"
929
FOREIGN KEY("CDS_id")
930
REFERENCES "CDS"("CDS_id"),
931
CONSTRAINT "fk_isoforms_genes1"
932
FOREIGN KEY("gene_id")
933
REFERENCES "genes"("gene_id")
935
CREATE INDEX "isoforms.fk_isoforms_TSS1" ON "isoforms"("TSS_group_id");
936
CREATE INDEX "isoforms.fk_isoforms_CDS1" ON "isoforms"("CDS_id");
937
CREATE INDEX "isoforms.fk_isoforms_genes1" ON "isoforms"("gene_id");
938
CREATE INDEX "isoforms.fk_isoforms_genes2" ON "isoforms"("gene_short_name");
939
DROP TABLE IF EXISTS "isoformData";
940
CREATE TABLE "isoformData"(
941
"isoform_id" VARCHAR(45) NOT NULL,
942
"sample_name" VARCHAR(45) NOT NULL,
943
"fpkm" FLOAT NOT NULL,
946
"quant_status" VARCHAR(45),
947
CONSTRAINT "fk_isoformData_samples1"
948
FOREIGN KEY("sample_name")
949
REFERENCES "samples"("sample_name"),
950
CONSTRAINT "fk_isoformData_isoforms1"
951
FOREIGN KEY("isoform_id")
952
REFERENCES "isoforms"("isoform_id")
954
CREATE INDEX "isoformData.fk_isoformData_samples1" ON "isoformData"("sample_name");
955
CREATE INDEX "isoformData.fk_isoformData_isoforms1" ON "isoformData"("isoform_id");
956
DROP TABLE IF EXISTS "isoformExpDiffData";
957
CREATE TABLE "isoformExpDiffData"(
958
"isoform_id" VARCHAR(45) NOT NULL,
959
"sample_1" VARCHAR(45) NOT NULL,
960
"sample_2" VARCHAR(45) NOT NULL,
961
"status" VARCHAR(45),
964
"log2_fold_change" FLOAT,
968
"significant" VARCHAR(45),
969
CONSTRAINT "fk_isoformExpDiffData_isoforms1"
970
FOREIGN KEY("isoform_id")
971
REFERENCES "isoforms"("isoform_id"),
972
CONSTRAINT "fk_isoformExpDiffData_samples1"
973
FOREIGN KEY("sample_1")
974
REFERENCES "samples"("sample_name"),
975
CONSTRAINT "fk_isoformExpDiffData_samples2"
976
FOREIGN KEY("sample_2")
977
REFERENCES "samples"("sample_name")
979
CREATE INDEX "isoformExpDiffData.fk_isoformExpDiffData_isoforms1" ON "isoformExpDiffData"("isoform_id");
980
CREATE INDEX "isoformExpDiffData.fk_isoformExpDiffData_samples1" ON "isoformExpDiffData"("sample_1");
981
CREATE INDEX "isoformExpDiffData.fk_isoformExpDiffData_samples2" ON "isoformExpDiffData"("sample_2");
982
DROP TABLE IF EXISTS "isoformFeatures";
983
CREATE TABLE "isoformFeatures"(
984
"isoform_id" VARCHAR(45) NOT NULL,
985
CONSTRAINT "fk_isoformFeatures_isoforms1"
986
FOREIGN KEY("isoform_id")
987
REFERENCES "isoforms"("isoform_id")
989
CREATE INDEX "isoformFeatures.fk_isoformFeatures_isoforms1" ON "isoformFeatures"("isoform_id");
993
create.sql <- strsplit(schema.text, "\n")[[1]]
994
create.sql <- paste(collapse="\n", create.sql)
995
create.sql <- strsplit(create.sql, ";")[[1]]
996
create.sql <- create.sql[-length(create.sql)] #nothing to run here
998
tmp <- sapply(create.sql,function(x) sqliteQuickSQL(db,x))
1002
1023
createDB_noIndex<-function(dbFname="cuffData.db",driver="SQLite") {
1003
1024
#Builds sqlite db at 'dbFname' and returns a dbConnect object pointing to newly created database.
1004
1025
#No indexes are present
1245
1259
FOREIGN KEY("CDS_id")
1246
1260
REFERENCES "CDS"("CDS_id")
1262
DROP TABLE IF EXISTS "model_transcripts";
1263
CREATE TABLE "model_transcripts"(
1264
"model_transcript_id" INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL
1266
DROP TABLE IF EXISTS "geneCount";
1267
CREATE TABLE "geneCount"(
1268
"gene_id" VARCHAR(45) NOT NULL,
1269
"sample_name" VARCHAR(45) NOT NULL,
1272
"uncertainty" FLOAT,
1274
"status" VARCHAR(45),
1275
CONSTRAINT "fk_geneCount_samples1"
1276
FOREIGN KEY("sample_name")
1277
REFERENCES "samples"("sample_name"),
1278
CONSTRAINT "fk_geneCount_genes1"
1279
FOREIGN KEY("gene_id")
1280
REFERENCES "genes"("gene_id")
1282
DROP TABLE IF EXISTS "CDSCount";
1283
CREATE TABLE "CDSCount"(
1284
"CDS_id" VARCHAR(45) NOT NULL,
1285
"sample_name" VARCHAR(45) NOT NULL,
1288
"uncertainty" FLOAT,
1290
"status" VARCHAR(45),
1291
CONSTRAINT "fk_CDSCount_CDS1"
1292
FOREIGN KEY("CDS_id")
1293
REFERENCES "CDS"("CDS_id"),
1294
CONSTRAINT "fk_CDSCount_samples1"
1295
FOREIGN KEY("sample_name")
1296
REFERENCES "samples"("sample_name")
1298
DROP TABLE IF EXISTS "TSSCount";
1299
CREATE TABLE "TSSCount"(
1300
"TSS_group_id" VARCHAR(45) NOT NULL,
1301
"sample_name" VARCHAR(45) NOT NULL,
1304
"uncertainty" FLOAT,
1306
"status" VARCHAR(45),
1307
CONSTRAINT "fk_TSSCount_TSS1"
1308
FOREIGN KEY("TSS_group_id")
1309
REFERENCES "TSS"("TSS_group_id"),
1310
CONSTRAINT "fk_TSSCount_samples1"
1311
FOREIGN KEY("sample_name")
1312
REFERENCES "samples"("sample_name")
1314
DROP TABLE IF EXISTS "replicates";
1315
CREATE TABLE "replicates"(
1316
"file" INTEGER NOT NULL,
1317
"sample_name" VARCHAR(45) NOT NULL,
1318
"replicate" INT NOT NULL,
1319
"rep_name" VARCHAR(45) PRIMARY KEY NOT NULL,
1322
"internal_scale" FLOAT,
1323
"external_scale" FLOAT,
1324
CONSTRAINT "fk_replicates_samples1"
1325
FOREIGN KEY("sample_name")
1326
REFERENCES "samples"("sample_name")
1328
DROP TABLE IF EXISTS "geneReplicateData";
1329
CREATE TABLE "geneReplicateData"(
1330
"gene_id" VARCHAR(45) NOT NULL,
1331
"sample_name" VARCHAR(45) NOT NULL,
1332
"replicate" INTEGER,
1333
"rep_name" VARCHAR(45) NOT NULL,
1335
"internal_scaled_frags" FLOAT,
1336
"external_scaled_frags" FLOAT,
1338
"effective_length" FLOAT,
1339
"status" VARCHAR(45),
1340
CONSTRAINT "fk_geneData_genes10"
1341
FOREIGN KEY("gene_id")
1342
REFERENCES "genes"("gene_id"),
1343
CONSTRAINT "fk_geneReplicateData_replicates1"
1344
FOREIGN KEY("rep_name")
1345
REFERENCES "replicates"("rep_name"),
1346
CONSTRAINT "fk_geneReplicateData_samples1"
1347
FOREIGN KEY("sample_name")
1348
REFERENCES "samples"("sample_name")
1350
DROP TABLE IF EXISTS "CDSReplicateData";
1351
CREATE TABLE "CDSReplicateData"(
1352
"CDS_id" VARCHAR(45) NOT NULL,
1353
"sample_name" VARCHAR(45) NOT NULL,
1354
"replicate" INTEGER,
1355
"rep_name" VARCHAR(45) NOT NULL,
1357
"internal_scaled_frags" FLOAT,
1358
"external_scaled_frags" FLOAT,
1360
"effective_length" FLOAT,
1361
"status" VARCHAR(45),
1362
CONSTRAINT "fk_geneReplicateData_replicates100"
1363
FOREIGN KEY("rep_name")
1364
REFERENCES "replicates"("rep_name"),
1365
CONSTRAINT "fk_CDSReplicateData_CDS1"
1366
FOREIGN KEY("CDS_id")
1367
REFERENCES "CDS"("CDS_id"),
1368
CONSTRAINT "fk_CDSReplicateData_samples1"
1369
FOREIGN KEY("sample_name")
1370
REFERENCES "samples"("sample_name")
1372
DROP TABLE IF EXISTS "TSSReplicateData";
1373
CREATE TABLE "TSSReplicateData"(
1374
"TSS_group_id" VARCHAR(45) NOT NULL,
1375
"sample_name" VARCHAR(45) NOT NULL,
1376
"replicate" VARCHAR(45),
1377
"rep_name" VARCHAR(45) NOT NULL,
1379
"internal_scaled_frags" FLOAT,
1380
"external_scaled_frags" FLOAT,
1382
"effective_length" FLOAT,
1383
"status" VARCHAR(45),
1384
CONSTRAINT "fk_geneReplicateData_replicates10000"
1385
FOREIGN KEY("rep_name")
1386
REFERENCES "replicates"("rep_name"),
1387
CONSTRAINT "fk_TSSReplicateData_TSS1"
1388
FOREIGN KEY("TSS_group_id")
1389
REFERENCES "TSS"("TSS_group_id"),
1390
CONSTRAINT "fk_TSSReplicateData_samples1"
1391
FOREIGN KEY("sample_name")
1392
REFERENCES "samples"("sample_name")
1394
DROP TABLE IF EXISTS "runInfo";
1395
CREATE TABLE "runInfo"(
1396
"param" VARCHAR(45),
1248
1399
DROP TABLE IF EXISTS "geneData";
1249
1400
CREATE TABLE "geneData"(
1250
1401
"gene_id" VARCHAR(45) NOT NULL,
1359
1510
FOREIGN KEY("isoform_id")
1360
1511
REFERENCES "isoforms"("isoform_id")
1513
DROP TABLE IF EXISTS "features";
1514
CREATE TABLE "features"(
1515
-- GTF Features (all lines/records from reference .gtf file)
1516
"feature_id" INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
1517
"gene_id" VARCHAR(45) NOT NULL,
1518
"isoform_id" VARCHAR(45) NOT NULL,
1519
"seqnames" VARCHAR(45) NOT NULL,
1520
"source" VARCHAR(45) NOT NULL,
1525
"strand" VARCHAR(45),
1526
"frame" VARCHAR(45),
1527
CONSTRAINT "fk_features_genes1"
1528
FOREIGN KEY("gene_id")
1529
REFERENCES "genes"("gene_id"),
1530
CONSTRAINT "fk_features_isoforms1"
1531
FOREIGN KEY("isoform_id")
1532
REFERENCES "isoforms"("isoform_id")
1535
DROP TABLE IF EXISTS "attributes";
1536
CREATE TABLE "attributes"(
1537
"attribute_lookup_id" INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
1538
"feature_id" INTEGER NOT NULL,
1539
"attribute" VARCHAR(45) NOT NULL,
1540
"value" VARCHAR(45) NOT NULL,
1541
CONSTRAINT "fk_attribute_lookup_features1"
1542
FOREIGN KEY("feature_id")
1543
REFERENCES "features"("feature_id")
1545
DROP TABLE IF EXISTS "isoformCount";
1546
CREATE TABLE "isoformCount"(
1547
"isoform_id" VARCHAR(45) NOT NULL,
1548
"sample_name" VARCHAR(45) NOT NULL,
1551
"uncertainty" FLOAT,
1553
"status" VARCHAR(45),
1554
CONSTRAINT "fk_isoformCount_isoforms1"
1555
FOREIGN KEY("isoform_id")
1556
REFERENCES "isoforms"("isoform_id"),
1557
CONSTRAINT "fk_isoformCount_samples1"
1558
FOREIGN KEY("sample_name")
1559
REFERENCES "samples"("sample_name")
1561
DROP TABLE IF EXISTS "isoformReplicateData";
1562
CREATE TABLE "isoformReplicateData"(
1563
"isoform_id" VARCHAR(45) NOT NULL,
1564
"sample_name" VARCHAR(45) NOT NULL,
1565
"replicate" INTEGER,
1566
"rep_name" VARCHAR(45) NOT NULL,
1568
"internal_scaled_frags" FLOAT,
1569
"external_scaled_frags" FLOAT,
1571
"effective_length" FLOAT,
1572
"status" VARCHAR(45),
1573
CONSTRAINT "fk_geneReplicateData_replicates10"
1574
FOREIGN KEY("rep_name")
1575
REFERENCES "replicates"("rep_name"),
1576
CONSTRAINT "fk_isoformReplicateData_isoforms1"
1577
FOREIGN KEY("isoform_id")
1578
REFERENCES "isoforms"("isoform_id"),
1579
CONSTRAINT "fk_isoformReplicateData_samples1"
1580
FOREIGN KEY("sample_name")
1581
REFERENCES "samples"("sample_name")
1365
1587
create.sql <- strsplit(schema.text, "\n")[[1]]
1366
1588
create.sql <- paste(collapse="\n", create.sql)
1473
#TODO: Add directory pointer
1719
#TODO: Add count and replicate files
1474
1720
readCufflinks<-function(dir = getwd(),
1475
1721
dbFile="cuffData.db",
1723
runInfoFile="run.info",
1724
repTableFile="read_groups.info",
1476
1725
geneFPKM="genes.fpkm_tracking",
1477
1726
geneDiff="gene_exp.diff",
1727
geneCount="genes.count_tracking",
1728
geneRep="genes.read_group_tracking",
1478
1729
isoformFPKM="isoforms.fpkm_tracking",
1479
1730
isoformDiff="isoform_exp.diff",
1731
isoformCount="isoforms.count_tracking",
1732
isoformRep="isoforms.read_group_tracking",
1480
1733
TSSFPKM="tss_groups.fpkm_tracking",
1481
1734
TSSDiff="tss_group_exp.diff",
1735
TSSCount="tss_groups.count_tracking",
1736
TSSRep="tss_groups.read_group_tracking",
1482
1737
CDSFPKM="cds.fpkm_tracking",
1483
1738
CDSExpDiff="cds_exp.diff",
1739
CDSCount="cds.count_tracking",
1740
CDSRep="cds.read_group_tracking",
1484
1741
CDSDiff="cds.diff",
1485
1742
promoterFile="promoters.diff",
1486
1743
splicingFile="splicing.diff",
1487
1744
driver = "SQLite",
1488
1746
rebuild = FALSE,
1491
1750
#Set file locations with directory
1492
1751
dbFile=file.path(dir,dbFile)
1752
runInfoFile=file.path(dir,runInfoFile)
1753
repTableFile=file.path(dir,repTableFile)
1493
1754
geneFPKM=file.path(dir,geneFPKM)
1494
1755
geneDiff=file.path(dir,geneDiff)
1756
geneCount=file.path(dir,geneCount)
1757
geneRep=file.path(dir,geneRep)
1495
1758
isoformFPKM=file.path(dir,isoformFPKM)
1496
1759
isoformDiff=file.path(dir,isoformDiff)
1760
isoformCount=file.path(dir,isoformCount)
1761
isoformRep=file.path(dir,isoformRep)
1497
1762
TSSFPKM=file.path(dir,TSSFPKM)
1498
1763
TSSDiff=file.path(dir,TSSDiff)
1764
TSSCount=file.path(dir,TSSCount)
1765
TSSRep=file.path(dir,TSSRep)
1499
1766
CDSFPKM=file.path(dir,CDSFPKM)
1500
1767
CDSExpDiff=file.path(dir,CDSExpDiff)
1768
CDSCount=file.path(dir,CDSCount)
1769
CDSRep=file.path(dir,CDSRep)
1501
1770
CDSDiff=file.path(dir,CDSDiff)
1502
1771
promoterFile=file.path(dir,promoterFile)
1503
1772
splicingFile=file.path(dir,splicingFile)
1527
1810
dbConn<-dbConnect(dbDriver(driver),dbFile)
1529
1812
new("CuffSet",DB = dbConn,
1530
genes = new("CuffData",DB = dbConn, tables = list(mainTable = "genes",dataTable = "geneData",expDiffTable = "geneExpDiffData",featureTable = "geneFeatures"), filters = list(),type = "genes",idField = "gene_id"),
1531
isoforms = new("CuffData", DB = dbConn, tables = list(mainTable = "isoforms",dataTable = "isoformData",expDiffTable = "isoformExpDiffData",featureTable = "isoformFeatures"), filters = list(),type="isoforms",idField = "isoform_id"),
1532
TSS = new("CuffData", DB = dbConn, tables = list(mainTable = "TSS",dataTable = "TSSData",expDiffTable = "TSSExpDiffData",featureTable = "TSSFeatures"), filters = list(),type = "TSS",idField = "TSS_group_id"),
1533
CDS = new("CuffData", DB = dbConn, tables = list(mainTable = "CDS",dataTable = "CDSData",expDiffTable = "CDSExpDiffData",featureTable = "CDSFeatures"), filters = list(),type = "CDS",idField = "CDS_id"),
1813
#TODO: need to add replicate and count tables here and in AllClasses.R
1815
genes = new("CuffData",DB = dbConn, tables = list(mainTable = "genes",dataTable = "geneData",expDiffTable = "geneExpDiffData",featureTable = "geneFeatures",countTable="geneCount",replicateTable="geneReplicateData"), filters = list(),type = "genes",idField = "gene_id"),
1816
isoforms = new("CuffData", DB = dbConn, tables = list(mainTable = "isoforms",dataTable = "isoformData",expDiffTable = "isoformExpDiffData",featureTable = "isoformFeatures",countTable="isoformCount",replicateTable="isoformReplicateData"), filters = list(),type="isoforms",idField = "isoform_id"),
1817
TSS = new("CuffData", DB = dbConn, tables = list(mainTable = "TSS",dataTable = "TSSData",expDiffTable = "TSSExpDiffData",featureTable = "TSSFeatures",countTable="TSSCount",replicateTable="TSSReplicateData"), filters = list(),type = "TSS",idField = "TSS_group_id"),
1818
CDS = new("CuffData", DB = dbConn, tables = list(mainTable = "CDS",dataTable = "CDSData",expDiffTable = "CDSExpDiffData",featureTable = "CDSFeatures",countTable="CDSCount",replicateTable="CDSReplicateData"), filters = list(),type = "CDS",idField = "CDS_id"),
1534
1819
promoters = new("CuffDist", DB = dbConn, table = "promoterDiffData",type="promoter",idField="gene_id"),
1535
1820
splicing = new("CuffDist", DB = dbConn, table = "splicingDiffData",type="splicing",idField="TSS_group_id"),
1536
1821
relCDS = new("CuffDist", DB = dbConn, table = "CDSDiffData",type="relCDS",idField="gene_id")
1551
1885
if (missing(dbConn))
1552
1886
stop("Must provide a dbConn connection")
1554
gtf<-read.table(gtfFile,sep="\t",header=F)
1557
attributes<-melt(strsplit(as.character(gtf$V9),"; "))
1558
colnames(attributes)<-c("attribute","featureID")
1559
attributes<-paste(attributes$attribute,attributes$featureID)
1560
attributes<-strsplit(as.character(attributes)," ")
1561
attributes<-as.data.frame(do.call("rbind",attributes))
1563
colnames(attributes)<-c("attribute","value","featureID")
1564
attributes<-attributes[,c(3,1,2)]
1566
#Grab only gene_ID and transcript_ID to add to features table
1567
id.attributes<-attributes[attributes$attribute %in% c("gene_id","transcript_id"),]
1568
id.attributes$featureID<-as.numeric(as.character(id.attributes$featureID))
1569
id.attributes<-dcast(id.attributes,...~attribute)
1571
#Main features table
1572
features<-gtf[,c(1:8)]
1573
colnames(features)<-c("seqname","source","type","start","end","score","strand","frame")
1574
features$featureID<-as.numeric(as.character(rownames(features)))
1576
#Merge features and id.attributes
1577
features<-merge(features,id.attributes,by.x='featureID',by.y='featureID')
1578
features<-features[,c(1,10:11,2:9)]
1580
#strip gene_id and transcript_id from attributes
1581
attributes<-attributes[!(attributes$attribute %in% c("gene_id","transcript_id")),]
1583
#Write features table
1584
write("Writing features table",stderr())
1585
#dbWriteTable(dbConn,'geneData',as.data.frame(genemelt[,c(1:2,5,3,4,6)]),row.names=F,append=T)
1586
dbWriteTable(dbConn,'features',as.data.frame(features),append=F)
1588
#Write features table
1589
write("Writing feature attributes table",stderr())
1590
dbWriteTable(dbConn,'attributes',as.data.frame(attributes),append=F)
1888
write("Reading GTF file",stderr())
1889
gr<-import(gtfFile,asRangedData=FALSE)
1890
gr<-as(gr,"data.frame")
1891
#gr$genome<-genomebuild
1892
colnames(gr)[grepl('^transcript_id$',colnames(gr))]<-'isoform_id'
1893
colnames(gr)[grepl('^tss_id$',colnames(gr))]<-'TSS_group_id'
1894
colnames(gr)[grepl('^p_id$',colnames(gr))]<-'CDS_id'
1895
write("Writing GTF features to 'features' table...",stderr())
1896
#dbSendQuery(dbConn,"DROP TABLE IF EXISTS 'features'")
1897
#dbBeginTransaction(dbConn)
1898
dbWriteTable(dbConn,'features',gr,row.names=F,overwrite=T)
1899
#record Genome build
1900
.recordGenome(genomebuild,dbConn)
1905
.recordGenome<-function(genome,dbConn){
1906
genomeInsertQuery<-paste("INSERT INTO runInfo VALUES('genome', '",genome,"')",sep="")
1907
#print(genomeInsertQuery)
1908
dbSendQuery(dbConn,genomeInsertQuery)
1912
#myGeneId<-'XLOC_000071'
1913
#geneQuery<-paste("SELECT start,end,source AS feature,gene_id as gene,exon_number AS exon,transcript_id as transcript,gene_name as symbol, exon_number as rank, strand FROM features WHERE gene_id ='",myGeneId,"'",sep="")
1914
#geneFeatures<-dbGetQuery(cuff@DB,geneQuery)
1915
#geneFeatures$symbol[is.na(geneFeatures$symbol)]<-"NA"
1916
#grtrack<-GeneRegionTrack(geneFeatures,genome="hg19",chromosome="chr1",name="CuffDiff",showId=T,stacking="pack")
1917
#biomTrack<-BiomartGeneRegionTrack(genome="hg19",chromosome="chr1",start=min(start(grtrack)),end=max(end(grtrack)),name="ENSEMBL",showId=T,stacking="pack")
1918
#ideoTrack <- IdeogramTrack(genome = "hg19", chromosome = "chr1")
1919
#axTrack <- GenomeAxisTrack()
1920
#conservation <- UcscTrack(genome = "hg19", chromosome = "chr1",
1921
# track = "Conservation", table = "phyloP46wayAll",
1922
# from = min(start(grtrack)), to = max(end(grtrack)), trackType = "DataTrack",
1923
# start = "start", end = "end", data = "score",
1924
# type = "hist", window = "auto", col.histogram = "darkblue",
1925
# fill.histogram = "darkblue", ylim = c(-3.7, 4),
1926
# name = "Conservation")
1929
#plotTracks(list(ideoTrack,axTrack,grtrack,biomTrack,conservation),from=min(start(grtrack))-1000,to=max(end(grtrack))+1000)
1930
#plotTracks(list(axTrack,grtrack),from=min(start(grtrack))-1000,to=max(end(grtrack))+1000)