Reorganized the codebase beneath top-level public and private directories,
removing the playground and oneoffprojects directories in the process. Updated build.xml accordingly.
This commit is contained in:
parent
b46279d62e
commit
3c9497788e
461
R/ADPRpages.R
461
R/ADPRpages.R
|
|
@ -1,461 +0,0 @@
|
|||
#These functions each make a page for the ADPR. They assume a pdf with the following parameters for best formatting:
|
||||
#pdf(file=paste(sample_sets, ".pdf", sep=""), width=22, height=15, pagecentre=TRUE, pointsize=24)
|
||||
|
||||
|
||||
library(gplots)
|
||||
library(ReadImages)
|
||||
|
||||
##defaults<-par(no.readonly = TRUE)
|
||||
|
||||
|
||||
#Draws the ADPR "tear sheet" page: a title banner followed by four text tables
#(project summary, bases summary, sequencing summary and variant summary).
#
#Arguments:
#  lanetable   - per-lane metrics; columns read: HS_TARGET_TERRITORY, Lane.Type, AL_TOTAL_READS,
#                HS_ON_TARGET_BASES, HS_MEAN_TARGET_COVERAGE, HS_PCT_TARGET_BASES_10X/20X/30X
#  sampletable - per-sample metrics; columns read: Bait.Set, X..Lanes.included.in.aggregation,
#                Mean.Read.Length, Total.Reads, On.Target.Bases..HS., Mean.Target.Coverage..HS.,
#                PCT.Target.Bases.10x/20x/30x..HS.
#  variant     - variant metrics; columns read: filter_name, ti_count, tv_count, ti.tv_ratio
#  Protocol, Sequencer - values printed as-is in the summary tables
#
#Also relies on a variable `sample_sets` (used in the page title) being visible from the calling
#environment, on textplot() and read.jpeg() being available (the file loads gplots and ReadImages),
#and on "tearsheetdrop.jpg" being readable from the working directory.
tearsheet<-function(lanetable, sampletable, variant, Protocol, Sequencer){

    #rounded statistic `f` (mean or sd) for six metric columns of `tab`;
    #the first two metrics are reported in millions with two decimals, the rest as whole numbers
    six_stats<-function(tab, cols, f){
        mapply(function(col, scale, digits) round(f(tab[[col]], na.rm=TRUE)/scale, digits),
               cols, c(10^6, 10^6, 1, 1, 1, 1), c(2, 2, 0, 0, 0, 0), USE.NAMES=FALSE)
    }
    unit_suffix<-c("M", "M", "x", "%", "%", "%")

    #define layout: full-width title bar, then a 2x2 grid filled column by column
    layout(matrix(c(1,1,2,4,3,5), ncol=2, nrow=3, byrow=TRUE), heights=c(1, 2.5, 2.5), respect=FALSE)

    #prep for title bar
    page_title<-paste(sample_sets, ": TEAR SHEET", sep="")
    drop<-read.jpeg("tearsheetdrop.jpg")

    #plot title bar
    par(mar=c(0,0,0,0))
    plot(drop)
    text(100, 40, page_title, family="serif", adj=c(0,0), cex=3, col=gray(.25))

    #calc by lane stuff (columns are read explicitly instead of attach()ing the table,
    #so nothing is left on the search path if a step fails)
    lane_cols<-c("AL_TOTAL_READS", "HS_ON_TARGET_BASES", "HS_MEAN_TARGET_COVERAGE",
                 "HS_PCT_TARGET_BASES_10X", "HS_PCT_TARGET_BASES_20X", "HS_PCT_TARGET_BASES_30X")
    meanlane<-six_stats(lanetable, lane_cols, mean)
    sdlane<-six_stats(lanetable, lane_cols, sd)
    callable.target<-lanetable$HS_TARGET_TERRITORY[1]
    singlelanes<-length(which(lanetable$Lane.Type=="Single"))
    pairedlanes<-length(which(lanetable$Lane.Type=="Paired"))

    #calc by sample metrics (from the sampletable argument)
    samp_cols<-c("Total.Reads", "On.Target.Bases..HS.", "Mean.Target.Coverage..HS.",
                 "PCT.Target.Bases.10x..HS.", "PCT.Target.Bases.20x..HS.", "PCT.Target.Bases.30x..HS.")
    meansamp<-six_stats(sampletable, samp_cols, mean)
    sdsamp<-six_stats(sampletable, samp_cols, sd)
    #as.character() keeps the bait name (not a factor code) when it is combined into a vector below
    baits<-as.character(sampletable$Bait.Set[1])
    lanes_per_sample<-sampletable$X..Lanes.included.in.aggregation
    alllanes<-signif(sum(lanes_per_sample, na.rm=TRUE))
    mean.lanes.samp<-signif(mean(lanes_per_sample, na.rm=TRUE))
    sd.lanes.samp<-signif(sd(lanes_per_sample, na.rm=TRUE))
    mean.mrl.samp<-signif(mean(sampletable$Mean.Read.Length, na.rm=TRUE))
    sd.mrl.samp<-signif(sd(sampletable$Mean.Read.Length, na.rm=TRUE))

    #calc variant stuff: one entry per row of `variant` whose filter_name is "called"
    called<-which(variant$filter_name=="called")
    SNPS<-variant$ti_count[called]+variant$tv_count[called]
    titvs<-variant$ti.tv_ratio[called]

    #prep stuff.
    project_summary<-c(nrow(sampletable), Protocol, baits, paste(callable.target, "bases"))
    sequencing_summary<-c(Sequencer, alllanes, paste(mean.lanes.samp, "+/-", sd.lanes.samp), paste(singlelanes, "single lanes,", pairedlanes, "paired lanes"), paste(mean.mrl.samp, "+/-", sd.mrl.samp))
    samps<-paste(meansamp, unit_suffix, " +/- ", sdsamp, unit_suffix, sep="")
    lanes<-paste(meanlane, unit_suffix, " +/- ", sdlane, unit_suffix, sep="")

    #print out 4 tables in R
    table1<-cbind(project_summary)
    rownames(table1)<-c("Samples","Sequencing Protocol", "Bait Design","Callable Target")
    par(mar=c(4,4,4,4))
    textplot(table1, col.rownames="darkblue", show.colnames=FALSE, cex=1.75)
    title(main="Project Summary", family="sans", cex.main=2)

    table2<-cbind(lanes, samps)
    colnames(table2)<-c("per lane", "per sample")
    #sixth row holds the 30x metric
    rownames(table2)<-c("Reads", "Used bases", "Average target coverage", "% loci covered to 10x", "% loci covered to 20x","% loci covered to 30x")
    par(mar=c(4,4,4,4))
    textplot(table2, rmar=1, col.rownames="dark blue", cex=1.25)
    title(main="Bases Summary", family="sans", cex.main=1.75)

    table3<-cbind(sequencing_summary)
    rownames(table3)<-c("Sequencer", "Used lanes", "Used lanes per sample", "Lane pariteies", "Read legnths")
    par(mar=c(4,4,4,4))
    textplot(table3, rmar=1, col.rownames="dark blue", show.colnames=FALSE, cex=1.25)
    title(main="Sequencing Summary", family="sans", cex.main=1.75)

    #the row labels below assume exactly three "called" rows in `variant` - TODO confirm their order
    table4<-cbind(SNPS, titvs)
    rownames(table4)<-c("All SNPs", "Known SNPs", "Novel SNPs")
    colnames(table4)<-c("SNPs Found", "Ti/Tv")
    textplot(table4, rmar=1, col.rownames="dark blue", cex=1.25)
    title(main="Variant Summary", family="sans", cex.main=1.75)

}
|
||||
|
||||
#Draws the fingerprint status page: title banner, a per-lane plot of confident fingerprint
#calls and confident matching calls with alternating flowcells shaded, and a placeholder text panel.
#
#Arguments:
#  lanetable   - per-lane metrics; columns read: Flowcell, Lane, FP_CONFIDENT_CALLS,
#                FP_CONFIDENT_MATCHING_SNPS
#  sample_sets - label used as the prefix of the page title
#
#Needs textplot() and read.jpeg() (the file loads gplots and ReadImages) and "adprdrop.jpg"
#in the working directory.
fingerprints<-function(lanetable, sample_sets){

    #sort lanes by flowcell so the plotted points line up with the flowcell-sorted axis labels
    #(a no-op when the table is already sorted); columns are read directly rather than attach()ed
    lanetable<-lanetable[order(lanetable$Flowcell), ]
    flowcell<-lanetable$Flowcell
    lane<-lanetable$Lane
    calls<-lanetable$FP_CONFIDENT_CALLS
    matching<-lanetable$FP_CONFIDENT_MATCHING_SNPS
    lane_pos<-seq_along(matching)

    #define layout
    layout(matrix(c(1,2,3), ncol=1, nrow=3, byrow=TRUE), heights=c(1, 3,2), respect=FALSE)

    #prep for title bar
    page_title<-paste(sample_sets, ": Fingerprint Status", sep="")
    drop<-read.jpeg("adprdrop.jpg")

    #plot title bar
    par(mar=c(0,0,0,0))
    plot(drop)
    text(100, 40, page_title, family="serif", adj=c(0,0), cex=3, col=gray(.25))

    #prep for FP plot
    #a lane is flagged when it has fewer than 15 confident matching SNPs
    #NOTE(review): the earlier code took the union of this same condition with itself; possibly
    #FP_CONFIDENT_CALLS<15 was intended as the second condition - confirm
    badsnps<-which(matching<15)
    colors<-rep("Blue", length(calls))
    colors[badsnps]<-"Red"
    #position of the first lane of each flowcell
    ticks<-match(unique(flowcell), flowcell)
    #shading spans the plotted y range (ylim is 0-24) instead of depending on SNP_TOTAL_SNPS
    ys<-rep(c(0, 24*1.04, 24*1.04, 0, 0), ceiling(length(ticks)/2))
    #x coordinates of one rectangle per odd-numbered flowcell (five vertices each);
    #indices past the last flowcell yield NA and are dropped below
    shader<-ticks[c(rep(c(1,1,2,2,1), ceiling(length(ticks)/2))+sort(rep(seq(0, length(ticks),by=2), 5)))]-0.5
    if((length(ticks)%%2 > 0)){
        #odd number of flowcells: the last rectangle ends after the final lane
        shader[(length(shader)-2):(length(shader)-1)]<-length(flowcell)+0.5
    }
    shader<-na.omit(shader)

    #plot FP plot
    par(mar=c(10, 6, 8, 3))
    plot(lane_pos, matching, pch=NA, ylim=c(0,24), ylab="Fingerprint calls", xlab="", xaxt="n", col=colors, main="Fingerprint Calling and Matching Sorted by Flowcell", cex.main=2)
    axis(side=3, at=seq_along(flowcell), labels=lane, cex.axis=0.5, padj=1,tick=FALSE)
    axis(side=1, at=c(ticks), labels=unique(flowcell), tick=FALSE, las=2)
    mtext("Lane",side=3, cex=.75, line=1.5)
    mtext("Flowcell",side=1, cex=1.25, line=8)
    polygon(shader, ys, border="black", lty=0, col="gray")
    #matching calls are drawn as "x" (pch 4), calls as "+" (pch 3); where both coincide they overlay into a star
    points(lane_pos, matching, pch=4, col=colors)
    points(lane_pos, calls, pch=3, col=colors)
    if(length(badsnps)>0){
        #legend symbols follow the points() calls above: calls=3, matching calls=4
        legend("bottomright", legend=c("Confident calls at fingerprint sites by lane", "Confident matching calls at fingerprint sites by lane", "Confident calls in bad lanes", "Confident matching calls in bad lanes", "All Confident calls match fingerprint sites"), pch=c(3,4,3,4,8), col=c("Blue", "Blue", "Red", "Red", "Black" ), bg="White")
        mtext("Some problematic fingerprint sites", side=3)
    }else{
        legend("bottomright", legend=c("Confident calls at fingerprint sites by lane", "Confident matching calls at fingerprint sites by lane", "All Confident calls match fingerprint sites"), pch=c(3, 4, 8), col="Blue", bg="White")
    }

    #plot some summary of FP stuff
    textplot("Some summary of Fingerprint problems will go here ", valign="top", family="sans")

}
|
||||
|
||||
#Draws the "SNPs called by lane" page: title banner, a per-lane plot of SNP counts with
#alternating flowcells shaded and boxplot outliers in red, a boxplot of the same counts,
#and a placeholder text panel.
#
#Arguments:
#  lanetable   - per-lane metrics; columns read: Flowcell, Lane, SNP_TOTAL_SNPS
#  sample_sets - label used as the prefix of the page title
#
#Needs textplot() and read.jpeg() (the file loads gplots and ReadImages) and "adprdrop.jpg"
#in the working directory.
snps_called<-function(lanetable, sample_sets){

    #sort lanes by flowcell so the points, their colours and the axis labels all share one order
    #(a no-op when the table is already sorted); columns are read directly rather than attach()ed
    lanetable<-lanetable[order(lanetable$Flowcell), ]
    flowcell<-lanetable$Flowcell
    lane<-lanetable$Lane
    snps<-lanetable$SNP_TOTAL_SNPS
    lane_pos<-seq_along(snps)
    snp_range<-c(min(snps, na.rm=TRUE), max(snps, na.rm=TRUE))
    #outlying counts as reported by boxplot.stats(); computed once and reused below
    outliers<-boxplot.stats(snps)$out

    #define layout for this page
    layout(matrix(c(1,1,2, 3, 4,4), ncol=2, nrow=3, byrow=TRUE), widths = c(3,1), heights=c(1, 3,2), respect=FALSE)

    #prep for title bar
    page_title<-paste(sample_sets, ": SNPs Called by Lane", sep="")
    drop<-read.jpeg("adprdrop.jpg")

    #plot title bar
    par(mar=c(0,0,0,0))
    plot(drop)
    text(100, 40, page_title, family="serif", adj=c(0,0), cex=3, col=gray(.25))

    #prep for snp plot
    #position of the first lane of each flowcell
    ticks<-match(unique(flowcell), flowcell)
    ys<-rep(c(snp_range[1], snp_range[2]*1.04, snp_range[2]*1.04, snp_range[1], snp_range[1]), ceiling(length(ticks)/2))
    #x coordinates of one rectangle per odd-numbered flowcell (five vertices each);
    #indices past the last flowcell yield NA and are dropped below
    shader<-ticks[c(rep(c(1,1,2,2,1), ceiling(length(ticks)/2))+sort(rep(seq(0, length(ticks),by=2), 5)))]-0.5
    if((length(ticks)%%2 > 0)){
        #odd number of flowcells: the last rectangle ends after the final lane
        shader[(length(shader)-2):(length(shader)-1)]<-length(flowcell)+0.5
    }
    shader<-na.omit(shader)
    cols<-rep("blue", length(snps))
    cols[which(snps %in% outliers)]<-"red"

    #plot snp plot (empty frame first, so the shading sits underneath the points)
    par(ylog=TRUE, mar=c(10, 6, 4, 0))
    plot(lane_pos, snps, xlab="",
        ylab="SNPs Called",
        ylim = snp_range,
        xaxt="n",
        pch=NA)
    title(main="SNPs Called in Each Lane sorted by Flowcell", line=3, cex=1.5)
    axis(side=3, at=seq_along(flowcell), labels=lane, cex.axis=0.5, padj=1,tick=FALSE)
    axis(side=1, at=c(ticks), labels=unique(flowcell), tick=FALSE, las=2)
    mtext("Lane",side=3, cex=.75, line=1.5)
    mtext("Flowcell",side=1, cex=1.25, line=8)
    polygon(shader, ys, border="black", lty=0, col="gray")
    points(lane_pos, snps, col=cols, pch=19)
    if(length(outliers)>0){
        legend("topright", legend=c("Normal SNP Call Counts", "Outlier SNP Call Counts"), pch=19, col=c("Blue", "red"), bg="White")
    }

    #plot boxplot
    par(ylog=TRUE, mar=c(10, 0, 4, 2))
    boxplot(snps, main="SNPs Called in Lane", ylab="", yaxt="n", ylim = snp_range, ylog=TRUE)
    if(length(outliers)==0){
        mtext("No outliers", side=1, line=4)
    }else{
        mtext(paste("Outlier SNP call counts in ", length(outliers), "lanes"), side=1, line=4)
    }

    #Plot variant summary below
    textplot("Variant Summary will go here", valign="top", family="sans")

}
|
||||
|
||||
#Draws the "Ti/Tv ratio by sample" page: title banner, a plot of novel and known Ti/Tv per
#sample (samples sorted by decreasing novel Ti/Tv) with low novel values shaded and marked,
#and an explanatory text panel.
#
#Arguments:
#  metricsbysamp - per-sample variant metrics; columns read: row, TiTvRatio, novelty_name, filter_name
#
#Also relies on a variable `sample_sets` (used in the page title) being visible from the calling
#environment, on textplot() and read.jpeg() (the file loads gplots and ReadImages), and on
#"adprdrop.jpg" in the working directory.
titvsamp<-function(metricsbysamp){

    #read the columns from the argument itself (no attach(), no reliance on a global `titv`)
    ratio<-metricsbysamp$TiTvRatio
    novelty<-metricsbysamp$novelty_name
    filt<-metricsbysamp$filter_name
    samples<-unique(metricsbysamp$row)
    sample_pos<-seq_along(samples)

    #define layout
    layout(matrix(c(1,2,3), ncol=1, nrow=3, byrow=TRUE), heights=c(1, 3,2), respect=FALSE)

    #prep for title bar
    page_title<-paste(sample_sets, ": Ti/Tv Ratio by Sample", sep="")
    drop<-read.jpeg("adprdrop.jpg")

    #plot title bar
    par(mar=c(0,0,0,0))
    plot(drop)
    text(100, 40, page_title, family="serif", adj=c(0,0), cex=3, col=gray(.25))

    #prep for titv graph
    novel<-ratio[which(novelty=="novel" & filt=="called")]
    known<-ratio[which(novelty=="known" & filt=="called")]
    novel_sorted<-sort(novel, decreasing=TRUE)
    novel_order<-order(novel, decreasing=TRUE)
    #cutoff: upper whisker (5th boxplot statistic) of the Ti/Tv ratios of filtered calls
    cutoff<-boxplot.stats(ratio[which(filt=="filtered")])$stats[5]
    #positions (in sorted order) of the novel values below the cutoff
    low<-which(novel_sorted<cutoff)
    #left edges of those positions, used for the shading
    shade<-low-.5

    #plot titv graph
    par(mar=c(9, 5, 4, 2))
    plot(sample_pos, novel_sorted,
        xaxt="n",
        main="Ti/Tv for Novel and Known SNP calls",
        ylab="Ti/Tv",
        xlab="",
        col="red",
        cex.main=2,
        cex.lab=1.25,
        cex.axis=1,
        pch=1)
    if(length(shade)>0){
        #shade the low region over the full plotted y extent (par("usr")[3:4]);
        #skipped when nothing is low, where min()/max() of an empty vector would be infinite
        yext<-par("usr")[3:4]
        polygon(c(min(shade), min(shade), max(shade)+5, max(shade)+5, min(shade)), c(yext[1], yext[2], yext[2], yext[1], yext[1]), col="gray", lty=0)
    }
    #known values follow the same sample order as the sorted novel values
    points(sample_pos, known[novel_order], pch=1, col="blue")
    axis(side=1, at=sample_pos, labels=samples[novel_order], tick=FALSE, hadj=1, las=2, cex=1.25)
    mtext("Samples Sorted by Novel Ti/Tv", side=1, cex=1., line = 6)
    abline(a=mean(ratio[which(novelty=="all" & filt=="called")]),b=0)
    if(length(shade)<1){
        legend("topright", legend=c("Known Variants", "Novel Variants", "Mean Ti/Tv for all variants"), col=c("blue", "red", "black"), pch=c(1,1,NA), lty=c(0, 0, 1), xjust=0.5, cex=1.25, adj=c(-20, 0))
    }else{
        #re-draw the low novel values on top of the shading, indexed by their whole-number positions
        points(low, novel_sorted[low], pch=4, col="red")
        legend("top", legend=c("Known Variants", "Novel Variants (normal values)", "Novel Variants (low values)","Mean Ti/Tv for all called variants"), col=c("blue", "red", "red", "black"), pch=c(1,1,4,NA), lty=c(0, 0, 0, 1), xjust=0.5, bty="n", cex=1.25, adj=c(0, 0))
    }

    #Plot variant summary below
    par(mar=c(2, 2, 2, 2))
    textplot("Lower TiTv indicates potentially higher false positive rates.\nTi/Tv ratios within the 95% confidence interval of the distribution of Ti/Tv ratios for filtered calls are indicated by gray shading.\nSomething Else will go here too", valign="top", family="sans")

}
|
||||
|
||||
#functionalclasses<-function(countfunctclasses){}
|
||||
|
||||
#Draws the "error rate per cycle" page: title banner, one line per column of `erpc` on a
#log y axis with unusual lanes highlighted, and a placeholder text panel.
#
#Arguments:
#  erpc - table with one row per cycle/read position and one column per lane
#         (presumably error rates between 0 and 1 - TODO confirm)
#
#Also relies on a variable `sample_sets` (used in the page title) being visible from the calling
#environment, on textplot() and read.jpeg() (the file loads gplots and ReadImages), and on
#"adprdrop.jpg" in the working directory.
errorratepercycle<-function(erpc){

    #define layout
    layout(matrix(c(1,2,3), ncol=1, nrow=3, byrow=TRUE), heights=c(1, 3,2), respect=FALSE)

    #prep for title bar
    page_title<-paste(sample_sets, ": Error Rate Per Cycle", sep="")
    drop<-read.jpeg("adprdrop.jpg")

    #plot title bar
    par(mar=c(0,0,0,0))
    plot(drop)
    text(100, 40, page_title, family="serif", adj=c(0,0), cex=3, col=gray(.25))

    #prep for erprp graph
    #lanes whose value in the last row exceeds 0.3
    crazies<-which(erpc[nrow(erpc),]>0.3) #this can be changed to any kind of filter for particular lanes
    #muted colours for ordinary lanes, fully saturated colours and thicker lines for the unusual ones
    colors<-rainbow(ncol(erpc), s=0.5, v=0.5)
    colors[crazies]<-rainbow(length(crazies))
    line_widths<-rep(1, ncol(erpc))
    line_widths[crazies]<-2

    #plot erprp graph
    par(mar=c(6, 6, 3, 2))
    matplot(erpc,
        type="l",
        lty="solid",
        col=colors,
        lwd=line_widths,
        main="Error Rate per Read Position",
        ylab="Error Rate",
        xlab="Cycle/Read Position",
        log="y",
        cex.main=2,
        cex.lab=1.5,
        cex.axis=1.25)
    if(length(crazies)>0){
        legend("topleft", title="Unusual Lanes", legend=colnames(erpc)[crazies], lty="solid", lwd=2, col=colors[crazies], xjust=0.5)
    }else{
        mtext("No unusual lanes.", 1, line=6, cex=1.25)
    }

    #Plot variant summary below
    textplot("Something related will go here", valign="top", family="sans")

}
|
||||
|
||||
#Draws two pages about depth of coverage per target:
#  1. median (and first/third quartile) across the "mean" columns for every target, with
#     targets sorted by decreasing median;
#  2. boxplots of the "mean" columns for the targets whose median is below 10.
#
#Arguments:
#  DOC - table with one row per target (row names are used as target labels); the columns
#        whose names contain "mean" are used (presumably one per sample - TODO confirm)
#
#Also relies on a variable `sample_sets` (used in the page titles) being visible from the calling
#environment, on read.jpeg() (the file loads ReadImages), and on "adprdrop.jpg" in the
#working directory.
depth_target<-function(DOC){

    #define layout
    layout(matrix(c(1,2), ncol=1, nrow=2, byrow=TRUE), heights=c(1, 5), respect=FALSE)

    #prep for title bar
    page_title<-paste(sample_sets, ": Depth of Coverage By Target", sep="")
    drop<-read.jpeg("adprdrop.jpg")

    #plot title bar
    par(mar=c(0,0,0,0))
    plot(drop)
    text(100, 40, page_title, family="serif", adj=c(0,0), cex=1.75, col=gray(.25))

    #per-target summaries across the "mean" columns
    mean_cols<-grep("mean", colnames(DOC))
    medianofmeans<-apply(DOC[,mean_cols], 1, median)
    q3s<-apply(DOC[,mean_cols], 1, quantile, probs=3/4)
    q1s<-apply(DOC[,mean_cols], 1, quantile, probs=1/4)
    #x positions follow the actual number of targets instead of a fixed count
    target_pos<-seq_along(medianofmeans)
    by_median<-order(medianofmeans, decreasing=TRUE)

    par(ylog=FALSE, mar=c(5, 5, 4, 2))
    plot(target_pos, medianofmeans[by_median], type="l",log="y",ylab="Coverage", xlab="",xaxt="n", main="Coverage Across All Targets", lwd=2, cex.main=2.5, cex.lab=1.5, cex.axis=1.25)
    mtext("Targets sorted by median avereage coverage across sample", side=1, line=1, cex=1.5)
    abline(h=10, lty="dashed", lwd=3)
    lines(target_pos, q3s[by_median], col="dark blue")
    lines(target_pos, q1s[by_median], col="dark blue")
    #NOTE(review): c(0, 20) is passed as the legend position with no y argument; intended placement unclear - confirm
    legend(c(0, 20), legend="10x coverage", box.lty=0, lwd=3, lty="dashed")
    legend("bottomleft", legend=c("Median average target coverage across all samples", "First and third quartiles of average target across all sample"), box.lty=0, lwd=c(1,2), col=c("black", "dark blue"), lty="solid")

    #second page
    #define layout
    layout(matrix(c(1,2), ncol=1, nrow=2, byrow=TRUE), heights=c(1,5), respect=FALSE)

    #prep for title bar
    page_title<-paste(sample_sets, ": Depth of Coverage For Poorly Covered Targets", sep="")
    drop<-read.jpeg("adprdrop.jpg")

    #plot title bar
    par(mar=c(0,0,0,0))
    plot(drop)
    text(100, 40, page_title, family="serif", adj=c(0,0), cex=1.25, col=gray(.25))

    #targets whose median is below 10, ordered by decreasing median
    low<-which(medianofmeans<10)
    low_order<-order(medianofmeans[low], decreasing=TRUE)
    #shift by 0.1 so zero values can be drawn on the log axis
    yuck<-DOC[low,mean_cols]+0.1
    par(mar=c(17, 4, 4, 2))
    boxplot(t(yuck[low_order,]),log="y", yaxt="n", xaxt="n", cex.lab=1.15, cex.axis=1.05, ylab="Average coverage accross all samples", main="Targets with low coverage accross samples")

    #relabel the y axis to undo the 0.1 shift (the first tick is labelled 0)
    axis(2, at=axTicks(2)+c(0, rep(0.1, length(axTicks(2))-1)), labels=c(0.0, axTicks(2)[2:length(axTicks(2))]), cex.axis=0.75)
    mtext("Target", side=1, line=15, cex=1.5)
    #labels use the same (decreasing) order as the boxes
    axis(1, at=seq_along(low), labels=rownames(DOC)[low][low_order], las=2, cex.axis=1.15)
}
|
||||
|
||||
#Draws the "mean depth of coverage per base by sample" page: title banner and one point per
#sample, sorted by depth, with values of 250 or more capped at 250 and drawn as red stars.
#
#Arguments:
#  DOC2 - table with one row per sample (row names are used as labels); column 2 holds the depth
#         NOTE(review): one row with depth >= 250 is always left out (the "-1" terms below);
#         presumably a summary/total row - confirm
#
#Also relies on a variable `sample_sets` (used in the page title) being visible from the calling
#environment, on read.jpeg() (the file loads ReadImages), and on "adprdrop.jpg" in the
#working directory.
depth_sample<-function(DOC2){

    #define layout
    layout(matrix(c(1,2), ncol=1, nrow=2, byrow=TRUE), heights=c(1,5), respect=FALSE)

    #prep for title bar
    page_title<-paste(sample_sets, ": Mean Depth of Coverage per Base by Sample", sep="")
    drop<-read.jpeg("adprdrop.jpg")

    #plot title bar
    par(mar=c(0,0,0,0))
    plot(drop)
    text(100, 40, page_title, family="serif", adj=c(0,0), cex=1.25, col=gray(.25))

    #prep for bysample
    below<-which(DOC2[,2]<250)
    capped<-which(DOC2[,2]>=250)
    #sorted depths below 250, followed by 250 for all but one of the capped rows
    means<-c(sort(DOC2[below,2]), rep(250, (length(capped)-1)))
    types<-rep(20, length(means))
    cols<-rep("black", length(means))
    types[which(means==250)]<-8
    cols[which(means==250)]<-"red"

    #plot doc by sample
    par(mar=c(10, 4, 4, 2))
    plot(means, ylim=c(0, 250), xaxt="n", col=cols, pch=types, xlab="", ylab="Depth of Coverage")
    #labels: samples below 250 in order of depth, then the capped samples in table order
    axis(1, at=c(1:(nrow(DOC2)-1)), labels=c(rownames(DOC2[below,])[order(DOC2[below,2])], rownames(DOC2[capped,])[order(capped)][1:(length(capped)-1)]), las=2)
    mtext("Samples", side=1, line=7, cex=1.25)

}
|
||||
|
||||
|
||||
|
||||
#Reads the input tables for one sample set.
#
#Arguments:
#  setname - base name of the input files; the part before the first "." is taken as the
#            project name
#
#Files read (relative to the working directory):
#  <projectname>_lanes.txt                       tab-delimited per-lane metrics
#  <projectname>_samps.txt                       tab-delimited per-sample metrics
#  <setname>.eval.SimpleMetricsBySample.csv      per-sample Ti/Tv metrics (first line skipped)
#
#Returns a named list with elements `lanes`, `samps` and `titv`. The lane table's columns are
#renamed to the fixed set of names used by the page functions, so the lanes file must have
#exactly that many columns.
datapuller<-function(setname){
    #library(yaml)

    #fixed=TRUE splits on a literal "."; [[1]][1] extracts the first piece as a plain string
    projectname<-strsplit(setname, ".", fixed=TRUE)[[1]][1]

    lanes<-read.delim(paste(projectname, "_lanes.txt", sep=""), header=TRUE)
    samps<-read.delim(paste(projectname, "_samps.txt", sep=""), header=TRUE)
    #doct<-read.delim(paste(setname, "depth.sample_interval_summary", sep=""), header=TRUE, row.names=1)
    #docs<-read.delim(paste(setname, ".depth.sample_summary", sep=""), header=TRUE, row.names=1)
    #eval<-read.csv(paste(setname, "eval.CountFunctionalClasses", sep=""), skip=1)
    titv<-read.csv(paste(setname, ".eval.SimpleMetricsBySample.csv", sep=""), skip=1)
    #erprp<-read.delim(paste(setname, ".erprp", sep=""))

    colnames(lanes)<-c('Initiative','Project','GSSR.ID','External.ID','WR.ID','Flowcell','Lane','Lane.Type','Library','AL_TOTAL_READS','AL_PF_READS','AL_PCT_PF_READS','AL_PF_NOISE_READS','AL_PF_READS_ALIGNED','AL_PCT_PF_READS_ALIGNED','AL_PF_HQ_ALIGNED_READS','AL_PF_HQ_ALIGNED_BASES','AL_PF_HQ_ALIGNED_Q20_BASES','AL_PF_HQ_MEDIAN_MISMATCHES','AL_MEAN_READ_LENGTH','AL_READS_ALIGNED_IN_PAIRS','AL_PCT_READS_ALIGNED_IN_PAIRS','AL_BAD_CYCLES','AL_PCT_STRAND_BALANCE','DUP_UNPAIRED_READS_EXAMINED','DUP_READ_PAIRS_EXAMINED','DUP_UNMAPPED_READS','DUP_UNPAIRED_READ_DUPLICATES','DUP_READ_PAIR_DUPLICATES','DUP_PERCENT_DUPLICATION','DUP_ESTIMATED_LIBRARY_SIZE','HS_BAIT_SET','HS_GENOME_SIZE','HS_LIBRARY_SIZE','HS_BAIT_TERRITORY','HS_TARGET_TERRITORY','HS_BAIT_DESIGN_EFFICIENCY','HS_TOTAL_READS','HS_PF_READS','HS_PF_UNIQUE_READS','HS_PCT_PF_READS','HS_PCT_PF_UQ_READS','HS_PCT_PF_UQ_READS_ALIGNED','HS_PF_UQ_READS_ALIGNED','HS_PF_UQ_BASES_ALIGNED','HS_ON_BAIT_BASES','HS_NEAR_BAIT_BASES','HS_OFF_BAIT_BASES','HS_ON_TARGET_BASES','HS_PCT_SELECTED_BASES','HS_PCT_OFF_BAIT','HS_ON_BAIT_VS_SELECTED','HS_MEAN_BAIT_COVERAGE','HS_MEAN_TARGET_COVERAGE','HS_FOLD_ENRICHMENT','HS_ZERO_CVG_TARGETS_PCT','HS_FOLD_80_BASE_PENALTY','HS_PCT_TARGET_BASES_2X','HS_PCT_TARGET_BASES_10X','HS_PCT_TARGET_BASES_20X','HS_PCT_TARGET_BASES_30X','HS_PENALTY_10X','HS_PENALTY_20X','HS_PENALTY_30X','SNP_TOTAL_SNPS','SNP_PCT_DBSNP','SNP_NUM_IN_DBSNP','Lane.IC.Matches','Lane.IC.PCT.Mean.RD1.Err.Rate','Lane.IC.PCT.Mean.RD2.Err.Rate','FP_PANEL_NAME','FP_PANEL_SNPS','FP_CONFIDENT_CALLS','FP_CONFIDENT_MATCHING_SNPS','FP_CONFIDENT_CALLED_PCT','FP_CONFIDENT_MATCHING_SNPS_PCT','LPCNCRD_REFERENCE','LPCNCRD_NON_REFERENCE','LPCNCRD_PCT_CONCORDANCE')

    #only the tables that are actually read are returned, each under its own name
    #(the depth, functional-class and error-rate inputs are commented out above)
    files<-list(lanes=lanes, samps=samps, titv=titv)

    return(files)
}
|
||||
|
||||
|
||||
#Builds the ADPR pdf for one sample set.
#
#Arguments:
#  basename - base name of the input files (passed to datapuller) and of the output
#             "<basename>.pdf"; also used as the sample-set label in page titles
#  desc1    - passed to tearsheet() as its Protocol argument
#  desc2    - passed to tearsheet() as its Sequencer argument
#
#Side effects: writes "<basename>.pdf" in the working directory. The tables and the sample-set
#label are temporarily attached to the search path while the pages are drawn.
runner<-function(basename, desc1, desc2){
    tables<-datapuller(basename)

    #some page functions look up `sample_sets` (and tables) as free variables, so expose them on
    #the search path for the duration of the run; on.exit() removes them even if a page fails
    attach(c(tables, list(sample_sets=basename)), name="ADPR_tables")
    on.exit(detach("ADPR_tables", character.only=TRUE), add=TRUE)

    pdf(paste(basename, ".pdf", sep=""), width=22, height=15,pointsize=24)
    #always close the device so a failed page does not leave the pdf open
    on.exit(dev.off(), add=TRUE)

    tearsheet(tables$lanes, tables$samps, tables$titv, desc1, desc2)
    fingerprints(tables$lanes, basename)
    snps_called(tables$lanes, basename)
    titvsamp(tables$titv)
    #functionalclasses(eval)
    #errorratepercycle(erprp)
    #depth_target(doct)
    #depth_sample(docs)

}
|
||||
|
||||
#when run as a script, the trailing command-line arguments are passed to runner() in order:
#basename, desc1, desc2
if(length(commandArgs(TRUE))>0){
    #spread the arguments over runner's parameters instead of passing the whole vector as basename
    do.call(runner, as.list(commandArgs(TRUE)))
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -1,325 +0,0 @@
|
|||
#Before executing this file, save squid files as csv, then as tab-delimited files with only the column values as the header, and change the format of all cells to numbers. Assign the paths to these files to "samples" and "lanes" respectively.
|
||||
#set up database stuff for firehose and picard interface
|
||||
#set up so runnable by firehose
|
||||
|
||||
stuffmaker<-function(args){
|
||||
|
||||
lanes<-args[1]
|
||||
samples<-args[2]
|
||||
sample_sets<-args[3]
|
||||
eval<-args[4]
|
||||
titveval<-args[5]
|
||||
DOCi<-args[6]
|
||||
DOCs<-args[7]
|
||||
|
||||
library(gplots)
|
||||
|
||||
pdf(file=paste(sample_sets, ".pdf", sep=""), width=22, height=15, pagecentre=TRUE, pointsize=24)
|
||||
|
||||
|
||||
if(is.na(sample_sets)){
|
||||
print("Please specify sample set for file naming and press enter.")
|
||||
scan("stdin", what="character",n=1)->sample_sets
|
||||
print("Thanks!")
|
||||
}
|
||||
|
||||
if(is.na(lanes) == FALSE && is.na(samples)==FALSE){
|
||||
#this makes a table & graphs using Picard data
|
||||
|
||||
if(typeof(lanes)=="character"){
|
||||
read.delim(file=lanes, header= TRUE)->bylane;
|
||||
colnames(bylane)<-c('Initiative','Project','GSSR.ID','External.ID','WR.ID','Flowcell','Lane','Lane.Type','Library','AL_TOTAL_READS','AL_PF_READS','AL_PCT_PF_READS','AL_PF_NOISE_READS','AL_PF_READS_ALIGNED','AL_PCT_PF_READS_ALIGNED','AL_PF_HQ_ALIGNED_READS','AL_PF_HQ_ALIGNED_BASES','AL_PF_HQ_ALIGNED_Q20_BASES','AL_PF_HQ_MEDIAN_MISMATCHES','AL_MEAN_READ_LENGTH','AL_READS_ALIGNED_IN_PAIRS','AL_PCT_READS_ALIGNED_IN_PAIRS','AL_BAD_CYCLES','AL_PCT_STRAND_BALANCE','DUP_UNPAIRED_READS_EXAMINED','DUP_READ_PAIRS_EXAMINED','DUP_UNMAPPED_READS','DUP_UNPAIRED_READ_DUPLICATES','DUP_READ_PAIR_DUPLICATES','DUP_PERCENT_DUPLICATION','DUP_ESTIMATED_LIBRARY_SIZE','HS_BAIT_SET','HS_GENOME_SIZE','HS_LIBRARY_SIZE','HS_BAIT_TERRITORY','HS_TARGET_TERRITORY','HS_BAIT_DESIGN_EFFICIENCY','HS_TOTAL_READS','HS_PF_READS','HS_PF_UNIQUE_READS','HS_PCT_PF_READS','HS_PCT_PF_UQ_READS','HS_PCT_PF_UQ_READS_ALIGNED','HS_PF_UQ_READS_ALIGNED','HS_PF_UQ_BASES_ALIGNED','HS_ON_BAIT_BASES','HS_NEAR_BAIT_BASES','HS_OFF_BAIT_BASES','HS_ON_TARGET_BASES','HS_PCT_SELECTED_BASES','HS_PCT_OFF_BAIT','HS_ON_BAIT_VS_SELECTED','HS_MEAN_BAIT_COVERAGE','HS_MEAN_TARGET_COVERAGE','HS_FOLD_ENRICHMENT','HS_ZERO_CVG_TARGETS_PCT','HS_FOLD_80_BASE_PENALTY','HS_PCT_TARGET_BASES_2X','HS_PCT_TARGET_BASES_10X','HS_PCT_TARGET_BASES_20X','HS_PCT_TARGET_BASES_30X','HS_PENALTY_10X','HS_PENALTY_20X','HS_PENALTY_30X','SNP_TOTAL_SNPS','SNP_PCT_DBSNP','SNP_NUM_IN_DBSNP','Lane.IC.Matches','Lane.IC.PCT.Mean.RD1.Err.Rate','Lane.IC.PCT.Mean.RD2.Err.Rate','FP_PANEL_NAME','FP_PANEL_SNPS','FP_CONFIDENT_CALLS','FP_CONFIDENT_MATCHING_SNPS','FP_CONFIDENT_CALLED_PCT','FP_CONFIDENT_MATCHING_SNPS_PCT','LPCNCRD_REFERENCE','LPCNCRD_NON_REFERENCE','LPCNCRD_PCT_CONCORDANCE')
|
||||
}else{
|
||||
lanes->bylane
|
||||
colnames(bylane)<-c('Initiative','Project','GSSR.ID','External.ID','WR.ID','Flowcell','Lane','Lane.Type','Library','AL_TOTAL_READS','AL_PF_READS','AL_PCT_PF_READS','AL_PF_NOISE_READS','AL_PF_READS_ALIGNED','AL_PCT_PF_READS_ALIGNED','AL_PF_HQ_ALIGNED_READS','AL_PF_HQ_ALIGNED_BASES','AL_PF_HQ_ALIGNED_Q20_BASES','AL_PF_HQ_MEDIAN_MISMATCHES','AL_MEAN_READ_LENGTH','AL_READS_ALIGNED_IN_PAIRS','AL_PCT_READS_ALIGNED_IN_PAIRS','AL_BAD_CYCLES','AL_PCT_STRAND_BALANCE','DUP_UNPAIRED_READS_EXAMINED','DUP_READ_PAIRS_EXAMINED','DUP_UNMAPPED_READS','DUP_UNPAIRED_READ_DUPLICATES','DUP_READ_PAIR_DUPLICATES','DUP_PERCENT_DUPLICATION','DUP_ESTIMATED_LIBRARY_SIZE','HS_BAIT_SET','HS_GENOME_SIZE','HS_LIBRARY_SIZE','HS_BAIT_TERRITORY','HS_TARGET_TERRITORY','HS_BAIT_DESIGN_EFFICIENCY','HS_TOTAL_READS','HS_PF_READS','HS_PF_UNIQUE_READS','HS_PCT_PF_READS','HS_PCT_PF_UQ_READS','HS_PCT_PF_UQ_READS_ALIGNED','HS_PF_UQ_READS_ALIGNED','HS_PF_UQ_BASES_ALIGNED','HS_ON_BAIT_BASES','HS_NEAR_BAIT_BASES','HS_OFF_BAIT_BASES','HS_ON_TARGET_BASES','HS_PCT_SELECTED_BASES','HS_PCT_OFF_BAIT','HS_ON_BAIT_VS_SELECTED','HS_MEAN_BAIT_COVERAGE','HS_MEAN_TARGET_COVERAGE','HS_FOLD_ENRICHMENT','HS_ZERO_CVG_TARGETS_PCT','HS_FOLD_80_BASE_PENALTY','HS_PCT_TARGET_BASES_2X','HS_PCT_TARGET_BASES_10X','HS_PCT_TARGET_BASES_20X','HS_PCT_TARGET_BASES_30X','HS_PENALTY_10X','HS_PENALTY_20X','HS_PENALTY_30X','SNP_TOTAL_SNPS','SNP_PCT_DBSNP','SNP_NUM_IN_DBSNP','Lane.IC.Matches','Lane.IC.PCT.Mean.RD1.Err.Rate','Lane.IC.PCT.Mean.RD2.Err.Rate','FP_PANEL_NAME','FP_PANEL_SNPS','FP_CONFIDENT_CALLS','FP_CONFIDENT_MATCHING_SNPS','FP_CONFIDENT_CALLED_PCT','FP_CONFIDENT_MATCHING_SNPS_PCT','LPCNCRD_REFERENCE','LPCNCRD_NON_REFERENCE','LPCNCRD_PCT_CONCORDANCE')
|
||||
}
|
||||
if(typeof(samples)=="character"){
|
||||
read.delim(file=samples, header= TRUE)->bysample;
|
||||
}else{
|
||||
samples->bysample
|
||||
}
|
||||
|
||||
#Calc by lane metrics
|
||||
sdlane<-rep("NA", 6)
|
||||
meanlane<-sdlane
|
||||
attach(bylane);
|
||||
|
||||
callable.target<-HS_TARGET_TERRITORY[1];
|
||||
singlelanes<-length(which(Lane.Type=="Single"));
|
||||
pairedlanes<-length(which(Lane.Type=="Paired"));
|
||||
meanlane[1]<-round(mean(AL_TOTAL_READS, na.rm=TRUE)/10^6, 2);
|
||||
sdlane[1]<-round(sd(AL_TOTAL_READS, na.rm=TRUE)/10^6, 2);
|
||||
meanlane[2]<-round(mean(HS_ON_TARGET_BASES, na.rm=TRUE)/10^6, 2);
|
||||
sdlane[2]<-round(sd(HS_ON_TARGET_BASES, na.rm=TRUE)/10^6, 2);
|
||||
meanlane[3]<-round(mean(HS_MEAN_TARGET_COVERAGE, na.rm=TRUE));
|
||||
sdlane[3]<-round(sd(HS_MEAN_TARGET_COVERAGE, na.rm=TRUE));
|
||||
meanlane[4]<-round(mean(HS_PCT_TARGET_BASES_10X, na.rm=TRUE));
|
||||
meanlane[5]<-round(mean(HS_PCT_TARGET_BASES_20X, na.rm=TRUE));
|
||||
meanlane[6]<-round(mean(HS_PCT_TARGET_BASES_30X, na.rm=TRUE));
|
||||
sdlane[4]<-round(sd(HS_PCT_TARGET_BASES_10X, na.rm=TRUE));
|
||||
sdlane[5]<-round(sd(HS_PCT_TARGET_BASES_20X, na.rm=TRUE));
|
||||
sdlane[6]<-round(sd(HS_PCT_TARGET_BASES_30X, na.rm=TRUE))
|
||||
|
||||
names<-paste(Flowcell, "-", Lane, sep="")
|
||||
|
||||
#makes a plot of the number of SNPS called per lane
|
||||
|
||||
|
||||
ticks<-c(match(unique(Flowcell), sort(Flowcell)) )
|
||||
ys=rep(c(min(SNP_TOTAL_SNPS, na.rm=TRUE)*0.96, max(SNP_TOTAL_SNPS, na.rm=TRUE)*1.04, max(SNP_TOTAL_SNPS, na.rm=TRUE)*1.04, min(SNP_TOTAL_SNPS, na.rm=TRUE)*0.96, min(SNP_TOTAL_SNPS, na.rm=TRUE)*0.96), ceiling(length(ticks)/2))
|
||||
|
||||
defaults<-par(no.readonly = TRUE)
|
||||
|
||||
layout(matrix(c(1,1 , 2), 1, 3, byrow=FALSE), respect=TRUE)
|
||||
par(mar=c(10, 6, 4, 8))
|
||||
plot(1:length(SNP_TOTAL_SNPS), SNP_TOTAL_SNPS[order(Flowcell)],xlab="", ylab="SNPs Called in Lane", ylim = c(min(SNP_TOTAL_SNPS, na.rm=TRUE), max(SNP_TOTAL_SNPS, na.rm=TRUE)), xaxt="n", pch=NA)
|
||||
title(main=paste(sample_sets, ": SNPs Called in Each Lane sorted by Flowcell", sep=""), line=3, cex=1.25)
|
||||
axis(side=3, at=c(1:length(Flowcell)), labels=Lane[order(Flowcell)], cex.axis=0.5, padj=1,tick=FALSE)
|
||||
axis(side=1, at=c(ticks), labels=sort(unique(Flowcell)), tick=FALSE, las=2)
|
||||
mtext("Lane",side=3, cex=.75, line=1.5)
|
||||
mtext("Flowcell",cex=.75,side=1, line=8)
|
||||
|
||||
shader<-ticks[c(rep(c(1,1,2,2,1), ceiling(length(ticks)/2))+sort(rep(seq(0, length(ticks),by=2), 5)))]-0.5
|
||||
if((length(ticks)%%2 > 0)){
|
||||
shader[(length(shader)-2):(length(shader)-1)]<-length(Flowcell)+0.5
|
||||
}
|
||||
shader<-na.omit(shader)
|
||||
polygon(shader, ys, border="black", lty=0, col="gray")
|
||||
cols<-rep("blue", length(SNP_TOTAL_SNPS))
|
||||
cols[which(SNP_TOTAL_SNPS %in% boxplot.stats(SNP_TOTAL_SNPS)$out)]<-"red"
|
||||
points(1:length(SNP_TOTAL_SNPS), SNP_TOTAL_SNPS, col=cols, pch=19)
|
||||
if(length(boxplot.stats(SNP_TOTAL_SNPS)$out)>0){
|
||||
legend("topright", legend=c("Normal SNP Call Counts", "Outlier SNP Call Counts"), pch=19, col=c("Blue", "red"), bg="White")
|
||||
}
|
||||
|
||||
boxplot(SNP_TOTAL_SNPS, main="SNPs Called in Lane", ylab="SNPs Called" )
|
||||
|
||||
|
||||
if(length(boxplot.stats(SNP_TOTAL_SNPS)$out)==0){
|
||||
mtext("No outliers", side=1, line=4)
|
||||
}else{
|
||||
mtext(paste("Outlier SNP call counts in ", length(boxplot.stats(SNP_TOTAL_SNPS)$out), "lanes"), side=1, line=4)
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#makes a plot of fingerprint calls and labels them good or bad
|
||||
par(defaults)
|
||||
|
||||
|
||||
badsnps<-union(which(FP_CONFIDENT_MATCHING_SNPS<15), which(FP_CONFIDENT_MATCHING_SNPS<15))
|
||||
|
||||
colors<-c(rep("Blue", length(FP_CONFIDENT_CALLS)))
|
||||
colors[badsnps]<-"Red"
|
||||
ticks<-c(match(unique(Flowcell), Flowcell) )
|
||||
ys=rep(c(0, 24*1.04, 24*1.04, 0, 0), ceiling(length(ticks)/2))
|
||||
#pdf(file=paste(sample_sets, "_Fingerprints.pdf", sep=""), width=.2*length(FP_CONFIDENT_CALLS), height=.1*length(FP_CONFIDENT_CALLS))
|
||||
par(mar=c(10, 6, 8, 3))
|
||||
plot(1:length(FP_CONFIDENT_MATCHING_SNPS), FP_CONFIDENT_MATCHING_SNPS, pch=NA, ylim=c(0,24), ylab="Fingerprint calls", xlab="", xaxt="n", col=colors, main="Fingerprint Calling and Matching Sorted by lane")
|
||||
axis(side=1, at=(ticks+1), labels=unique(Flowcell), tick=FALSE, hadj=1, las=2)
|
||||
shader<-ticks[c(rep(c(1,1,2,2,1), ceiling(length(ticks)/2))+sort(rep(seq(0, length(ticks),by=2), 5)))]-0.5
|
||||
shader<-na.omit(shader)
|
||||
if((length(ticks)%%2 > 0)){
|
||||
shader[(length(shader)-2):(length(shader)-1)]<-length(Flowcell)+0.5
|
||||
}
|
||||
|
||||
polygon(shader, ys, border="black", lty=0, col="gray")
|
||||
points(1:length(FP_CONFIDENT_MATCHING_SNPS), FP_CONFIDENT_MATCHING_SNPS, pch=4, col=colors)
|
||||
|
||||
points(1:length(FP_CONFIDENT_MATCHING_SNPS), FP_CONFIDENT_CALLS, pch=3, col=colors)
|
||||
|
||||
|
||||
|
||||
|
||||
if(length(badsnps)>0){
|
||||
legend("bottomright", legend=c("Confident calls at fingerprint sites by lane", "Confident matching calls at fingerprint sites by lane", "Confident calls in bad lanes", "Confident matching calls in bad lanes", "All Confident calls match fingerprint sites"), pch=c(4,3,4,3,8), col=c("Blue", "Blue", "Red", "Red", "Black" ), bg="White")
|
||||
mtext("Some problematic fingerprint sites", side=3)
|
||||
}else{
|
||||
legend("bottomright", legend=c("Confident calls at fingerprint sites by lane", "Confident matching calls at fingerprint sites by lane", "All Confident calls match fingerprint sites"), pch=c(4, 3, 8), col=c("Blue", "Blue", "Black"), bg="White")
|
||||
}
|
||||
|
||||
|
||||
detach(bylane)
|
||||
|
||||
|
||||
}else{
|
||||
print("Lane and Sample metrics file paths not provided")
|
||||
}
|
||||
meansamp<-rep("NA", 6)
|
||||
sdsamp<-meansamp
|
||||
|
||||
#Calc by sample metrics
|
||||
attach(bysample);
|
||||
mean.lanes.samp<-signif(mean(X..Lanes.included.in.aggregation, na.rm = TRUE));
|
||||
sd.lanes.samp<-signif(sd(X..Lanes.included.in.aggregation, na.rm=TRUE));
|
||||
mean.mrl.samp<-signif(mean(Mean.Read.Length, na.rm=TRUE));
|
||||
sd.mrl.samp<-signif(sd(Mean.Read.Length, na.rm=TRUE));
|
||||
meansamp[1]<-round(mean(Total.Reads, na.rm=TRUE)/10^6, 2);
|
||||
sdsamp[1]<-round(sd(Total.Reads, na.rm=TRUE)/10^6, 2);
|
||||
meansamp[2]<-round(mean(On.Target.Bases..HS., na.rm=TRUE)/10^6, 2);
|
||||
sdsamp[2]<-round(sd(On.Target.Bases..HS., na.rm=TRUE)/10^6, 2);
|
||||
meansamp[3]<-round(mean(Mean.Target.Coverage..HS., na.rm=TRUE));
|
||||
sdsamp[3]<-round(sd(Mean.Target.Coverage..HS., na.rm=TRUE));
|
||||
meansamp[4]<-round(mean(PCT.Target.Bases.10x..HS., na.rm=TRUE));
|
||||
meansamp[5]<-round(mean(PCT.Target.Bases.20x..HS., na.rm=TRUE));
|
||||
meansamp[6]<-round(mean(PCT.Target.Bases.30x..HS., na.rm=TRUE));
|
||||
sdsamp[4]<-round(sd(PCT.Target.Bases.10x..HS., na.rm=TRUE));
|
||||
sdsamp[5]<-round(sd(PCT.Target.Bases.20x..HS., na.rm=TRUE));
|
||||
sdsamp[6]<-round(sd(PCT.Target.Bases.30x..HS., na.rm=TRUE));
|
||||
|
||||
detach(bysample);
|
||||
|
||||
#print all of this stuff out in R.
|
||||
summary<-c(paste(callable.target, "bases"), paste(mean.lanes.samp, "+/-", sd.lanes.samp), paste(singlelanes, "single lanes,", pairedlanes, "paired lanes"), paste(mean.mrl.samp, "+/-", sd.mrl.samp))
|
||||
|
||||
samps<-paste(meansamp, c("M", "M", "x", "%", "%", "%"), " +/- ", sdsamp, c("M", "M", "x", "%", "%", "%"), sep="")
|
||||
|
||||
lanes<-paste(meanlane, c("M", "M", "x", "%", "%", "%"), " +/- ", sdlane, c("M", "M", "x", "%", "%", "%"), sep="")
|
||||
|
||||
layout(matrix(c(1,2), ncol=1), heights=c(2,3))
|
||||
|
||||
table1<-cbind(summary)
|
||||
rownames(table1)<-c("Callable Target", "Used Lanes per Sample", "Parities", "Read Length")
|
||||
textplot(table1, col.rownames="blue", show.colnames=FALSE, cex=1.75)
|
||||
title(main="Sequencing Summary", family="serif", cex.main=2)
|
||||
table2<-cbind(lanes, samps)
|
||||
colnames(table2)<-c("per lane", "per sample")
|
||||
rownames(table2)<-c("Reads", "Used bases", "Average target coverage", "% loci covered to 10x", "% loci covered to 20x","% loci covered to 10x")
|
||||
textplot(table2, rmar=1, col.rownames="blue", cex=1.25)
|
||||
title(main="Bases Summary", family="serif", cex.main=1.75)
|
||||
|
||||
|
||||
par(defaults)
|
||||
|
||||
#Makes Error Rate percycle graph
|
||||
if(is.na(eval)==FALSE){
|
||||
if(typeof(eval)=="character"){
|
||||
read.delim(eval, header=TRUE)[2:ncol(read.delim(eval, header=TRUE))]->errpercycle
|
||||
}else{
|
||||
eval->errpercycle
|
||||
}
|
||||
|
||||
|
||||
#pdf(paste(sample_sets, "_errorrate_per_cycle.pdf", sep=""), width=6, height=5)
|
||||
|
||||
crazies<-which(errpercycle[75,]>0.3) #this can be changed to any kind of filter for particular lanes
|
||||
|
||||
colors<-rainbow(ncol(errpercycle), s=0.5, v=0.5)
|
||||
colors[crazies]<-rainbow(length(crazies))
|
||||
weights<-rep(1, ncol(errpercycle))
|
||||
weights[crazies]<-2
|
||||
|
||||
matplot(errpercycle, type="l", lty="solid", col=colors, lwd=weights, main="Error Rate per Cycle", ylab="Error Rate", xlab="Cycle", ylim=c(0, 0.7))
|
||||
|
||||
if(length(crazies)>0){
|
||||
legend("topleft", title="Unusual Lanes", legend=colnames(errpercycle)[crazies], lty="solid", lwd=2, col=colors[crazies], xjust=0.5)
|
||||
}else{
|
||||
legend("topleft", legend="No unusual lanes.", bty="n")
|
||||
}
|
||||
|
||||
|
||||
|
||||
}else{
|
||||
print("Error Rate Per Cycle file paths not provided")
|
||||
}
|
||||
|
||||
#Makes TI/TV known v novel graph
|
||||
if(is.na(titveval)==FALSE){
|
||||
##TODO: need ot make sure this is nice and prettified.
|
||||
titv<-read.csv(file=titveval, skip=1)
|
||||
attach(titv)
|
||||
|
||||
#pdf(file=paste(sample_sets, "_TI-TV.pdf", sep=""), width=0.2*length(unique(sample)), height=0.175*length(unique(sample)))
|
||||
par(mar=c(11, 4, 4, 2))
|
||||
plot(seq(1:length(unique(sample))), Ti.Tv[which(novelty_name=="novel" & filter_name=="called")], xaxt="n", ylim=c(1, 4), main="Ti/Tv for Novel and Known SNP calls", ylab="Ti/Tv", xlab="", col="red", pch=1)
|
||||
|
||||
points(seq(1:length(unique(sample))), Ti.Tv[which(novelty_name=="known" & filter_name=="called")], pch=1, col="blue")
|
||||
|
||||
axis(side=1, at=(1:length(unique(sample))), labels=unique(sample), tick=FALSE, hadj=1, las=2)
|
||||
|
||||
abline(a=mean(Ti.Tv[which(novelty_name=="all" & filter_name=="called")]),b=0)
|
||||
|
||||
legend("bottomright", legend=c("Known Variants", "Novel Variants", "Mean Ti/Tv for all variants"), col=c("blue", "red", "black"), pch=c(1,1,NA_integer_), lty=c(0, 0, 1), xjust=0.5)
|
||||
mtext(line=9,"Lower Ti/Tv ratios indicate potentially increased false positive SNP rates.", side=1)
|
||||
|
||||
|
||||
}else{
|
||||
print("TiTV filepath not provided")
|
||||
}
|
||||
|
||||
#Make DOC graph
|
||||
if(is.na(DOCi)==FALSE){
|
||||
#pdf(paste(sample_set, "_DOCi.pdf", sep=""), width=6, height=5)
|
||||
if(typeof(DOCi)=="character"){
|
||||
as.data.frame(read.delim(DOCi))->DOC
|
||||
}else{
|
||||
DOCi->DOCdata
|
||||
}
|
||||
|
||||
colnames(DOC)->cols
|
||||
apply(DOC[,grep("mean", cols)], 1, median)->medianofmeans
|
||||
apply(DOC[,grep("mean", cols)], 1, quantile, probs=3/4)->q3s
|
||||
apply(DOC[,grep("mean", cols)], 1, quantile, probs=1/4)->q1s
|
||||
|
||||
par(ylog=FALSE, mar=c(5, 4, 4, 2))
|
||||
plot(c(1:3122),sort(medianofmeans, decreasing=TRUE), type="l", lwd="1",log="y",ylab="Coverage", xlab="Targets sorted by median average coverage across sample",xaxt="n", main="Coverage Across All Targets")
|
||||
|
||||
abline(h=10, lty="dotted")
|
||||
|
||||
lines(c(1:3122),q3s[order(medianofmeans, decreasing=TRUE)])
|
||||
|
||||
lines(c(1:3122),q1s[order(medianofmeans, decreasing=TRUE)])
|
||||
|
||||
legend("bottomleft", "10x coverage", box.lty=0, lty="dotted")
|
||||
|
||||
|
||||
|
||||
#pdf(paste(sample_set, "_DOCiy.pdf", sep=""), width=6, height=5)
|
||||
yuck<-DOC[which(medianofmeans<10),grep("mean", cols)]
|
||||
yuck<-yuck+0.1
|
||||
par(mar=c(16, 4, 4, 2))
|
||||
boxplot(t(yuck[order(medianofmeans[which(medianofmeans<10)], decreasing=TRUE),]),log="y", yaxt="n", xaxt="n", ylab="Average coverage accross all samples", main="Targets with low coverage accross samples")
|
||||
|
||||
axis(2, at=axTicks(2)+c(0, rep(0.1, length(axTicks(2))-1)), labels=c(0.0, axTicks(2)[2:length(axTicks(2))]), cex.axis=0.75)
|
||||
mtext("Target", side=1, line=14)
|
||||
axis(1, at=c(1:length(which(medianofmeans<10))), labels=DOC[which(medianofmeans<10),1][order(medianofmeans[which(medianofmeans<10)])], las=2, cex.axis=0.75)
|
||||
|
||||
|
||||
|
||||
|
||||
}else{
|
||||
print("Depth of Coverage--intervals filepath not provided")
|
||||
}
|
||||
|
||||
if(is.na(DOCs)==FALSE){
|
||||
#pdf(paste(sample_set, "_DOCs.pdf", sep=""), width=6, height=5)
|
||||
if(typeof(DOCs)=="character"){
|
||||
as.data.frame(read.delim(DOCs))->DOC2
|
||||
}else{
|
||||
DOCs->DOCdata
|
||||
}
|
||||
par(mar=c(10, 4, 4, 2))
|
||||
boxplot(t(DOC2[,2:ncol(DOC2)]+0.1), log="y", main="Depth of Coverage by Sample", xaxt="n", yaxt="n", ylab="Coverage")
|
||||
|
||||
axis(1, at=c(1:nrow(DOC2)), labels=DOC2[,1], las=2)
|
||||
|
||||
axis(2, at=axTicks(2)+c(0, rep(0.1, length(axTicks(2))-1)), labels=floor(c(0.0, axTicks(2)[2:length(axTicks(2))])))
|
||||
|
||||
labels=floor(c(0.0, axTicks(2)[2:length(axTicks(2))]))
|
||||
|
||||
mtext("Samples", side=1, line=9)
|
||||
|
||||
|
||||
|
||||
}else{
|
||||
print("Depth of Coverage--samples filepath not provided")
|
||||
}
|
||||
|
||||
dev.off()
|
||||
|
||||
}
|
||||
# Command-line entry point: when the script is started with trailing
# arguments, hand them to stuffmaker() as one character vector.
cli_args <- commandArgs(trailingOnly = TRUE)
if (length(cli_args) > 0) {
    stuffmaker(cli_args)
}
|
||||
|
|
@ -1,366 +0,0 @@
|
|||
##put titles/rownames left
|
||||
##make titles blue
|
||||
##decrease margins below titles
|
||||
## put row names in black
|
||||
##put background rows in.
|
||||
##change layouts so that it looks better
|
||||
##get sample numbers in correctly
|
||||
|
||||
.libPaths('/humgen/gsa-firehose2/pipeline/repositories/StingProduction/R/')
|
||||
|
||||
suppressMessages(library(gplots));
|
||||
suppressMessages(library(ReadImages));
|
||||
|
||||
suppressMessages(library(gsalib));
|
||||
suppressMessages(library(ROracle));
|
||||
|
||||
# Command-line options for this script, parsed by gsalib's gsa.getargs().
# Every option defaults to NA:
#   yaml     - pipeline YAML file (grepped below for SQUID project ids)
#   bamlist  - file listing the BAM files to inspect
#   evalroot - VariantEval report read by tearsheet()
#   tearout  - output path of the tearsheet PDF
#   plotout  - output path of the PDF written by plots()
# The separator after the tearout entry had been commented out ("#,"), which
# made the list() call unparseable; it is restored here.
cmdargs = gsa.getargs(
    list(
        yaml = list(value=NA, doc="pipeline YAML file"),
        bamlist = list(value=NA, doc="list of BAM files"),
        evalroot = list(value=NA, doc="VariantEval file"),
        tearout = list(value=NA, doc="Output path for tearsheet PDF"),
        plotout = list(value=NA, doc="Output path for PDF")
    ),
    doc="Creates a tearsheet"
);
|
||||
|
||||
|
||||
# Read the BAM paths (whitespace-separated tokens) and the SQUID project ids.
# The ids are whatever `grep "C..." -o` extracts from the YAML lines that
# mention SQUID.
bamlist = scan(cmdargs$bamlist, "character");
squids <- system(paste("grep SQUID ", cmdargs$yaml, ' |grep "C..." -o', sep=""), intern=TRUE)

# Platform-unit (PU) ids taken from the @RG header lines of every BAM, split
# by how many dot-separated fields they have:
#   three fields -> indexed
#   two fields   -> nonindexed
indexed = c();
nonindexed = c();

for (bam in bamlist) {
    bamheader = system(paste("samtools view -H", bam), intern=TRUE);

    if (length(bamheader) > 0) {
        rgs = bamheader[grep("^@RG", bamheader)];

        for (rg in rgs) {
            # Isolate the PU tag of this read group and strip its prefix.
            id = grep("PU:", unlist(strsplit(rg, "\t")), value=TRUE);
            id = sub("PU:", "", id);
            # Drop the six characters that follow "XX" in the id.
            id = gsub("XX......", "XX", id)

            fields = length(unlist(strsplit(id, "\\.")))
            if (fields == 3) {
                indexed <- c(indexed, id)
            } else if (fields == 2) {
                nonindexed <- c(nonindexed, id)
            } else {
                # paste0() rather than `+`: R has no string addition, so the
                # original expression raised an error instead of warning.
                print(paste0(id, " is a strange PU and will result in odd searches"))
            }
        }
    } else {
        print(sprintf("Could not load '%s'\n", bam));
    }
}
|
||||
|
||||
# Download two whole tables from the SEQPROD reporting database:
#   d  <- ILLUMINA_PICARD_METRICS
#   d2 <- ILLUMINA_SAMPLE_STATUS_AGG
# Both are filtered down to the current project further below.
drv <- dbDriver("Oracle")
con <- dbConnect(drv, "REPORTING/REPORTING@ora01:1521/SEQPROD")

# n = -1 asks fetch() for every pending row of the result set.
rs <- dbSendQuery(con, statement = "SELECT * FROM ILLUMINA_PICARD_METRICS")
d <- fetch(rs, n = -1)
dbHasCompleted(rs)
dbClearResult(rs)

rs2 <- dbSendQuery(con, statement = "SELECT * FROM ILLUMINA_SAMPLE_STATUS_AGG")
d2 <- fetch(rs2, n = -1)
dbHasCompleted(rs2)
dbClearResult(rs2)

oraCloseDriver(drv)
|
||||
|
||||
# Build "flowcell.lane" and "flowcell.lane.barcode" keys for every metrics
# row so they can be matched against the PU ids collected from the BAMs.
squid_fclanes <- sprintf("%s.%s", d$Flowcell, d$Lane)
squid_fclanes_indexed <- sprintf("%s.%s.%s", d$Flowcell, d$Lane, d$Barcode)

# Rows matched through a two-field id come first, followed by rows matched
# through a three-field id.
matches_nonindexed <- squid_fclanes %in% nonindexed
matches_indexed <- squid_fclanes_indexed %in% indexed
dproj <- rbind(d[matches_nonindexed, ], d[matches_indexed, ])

# Keep only rows whose project is one of the SQUID ids found in the YAML.
dproj <- dproj[dproj$Project %in% squids, ]

# Sample-level rows: same projects, restricted to the samples behind the
# selected lanes.
in_project <- d2$Project %in% dproj$Project
in_samples <- d2$Sample %in% dproj$`External ID`
d2proj <- d2[in_project & in_samples, ]
|
||||
|
||||
|
||||
|
||||
tearsheet<-function(){
    # Renders the project tearsheet PDF to cmdargs$tearout.
    #
    # Takes no arguments. Reads the script-level objects cmdargs, bamlist,
    # dproj and d2proj. The page holds a title bar, four text tables
    # (project, bases, sequencing and variant summaries) and one plot of
    # variant counts by allele count.
    #
    # Returns the value of the final dev.off() call, as before.

    pdf(file= cmdargs$tearout, width=22, height=17, pagecentre=TRUE, pointsize=24)
    # If anything below fails, still close the device opened above so that
    # the output file is released. After a normal run the device is already
    # closed and this handler does nothing.
    tearsheet_device <- dev.cur()
    on.exit(if (tearsheet_device %in% dev.list()) dev.off(tearsheet_device), add=TRUE)

    # --- local formatting helpers -------------------------------------------
    # Scientific-notation rendering used for read and base counts.
    sci <- function(v) format(v, 8, 3, 1, scientific=TRUE)
    # "mean +/- sd" in scientific notation, NAs ignored.
    mean_sd_sci <- function(x) {
        sprintf("%s +/- %s\n", sci(mean(x, na.rm=TRUE)), sci(sd(x, na.rm=TRUE)))
    }
    # "mean +/- sd" pushed through a two-slot sprintf template, NAs dropped.
    mean_sd_fmt <- function(x, template) {
        x <- na.omit(x)
        sprintf(template, mean(x), sd(x))
    }
    coverage_fmt <- "%0.2fx +/- %0.2fx\n"
    percent_fmt <- "%0.2f%% +/- %0.2f%%\n"

    # --- layout ---------------------------------------------------------------
    # 15 x 6 grid of panel numbers; layout() fills panels in numeric order:
    # 1 = title bar, 2..5 = the four tables, 6 = allele-count plot.
    # Panel 7 is not drawn into.
    postable<-matrix(c(1, 1, 1, 1, 1, 1, rep(c(2, 2, 2, 4, 4, 4), 5), rep(c(3, 3, 3, 4, 4, 4), 3), rep(c(3,3,3,5,5,5), 5), 6,6,6,7,7,7), nrow=15, ncol=6, byrow=TRUE)
    layout(postable, heights=c(1, rep(.18, 13), 2), respect=FALSE)

    # --- title bar --------------------------------------------------------------
    # The backdrop image ships inside the gsalib package. system.file() takes
    # a path relative to the package root, so the previous home-directory
    # path resolved to "" and read.jpeg() failed.
    drop<-read.jpeg(system.file("data", "tearsheetdrop.jpg", package="gsalib"))

    par(mar=c(0,0,0,0))
    plot(drop)
    # NOTE(review): the title text is still the placeholder "testing".
    text(155, 50, "testing", family="serif", adj=c(0,0), cex=3, col=gray(.25))

    # --- project summary --------------------------------------------------------
    projects = paste(unique(dproj$"Project"), collapse=", ");
    used_samples = length(bamlist);
    unused_samples = 0;
    sequencing_protocol = "Hybrid selection"; #can this be extracted?

    # Bait sets ordered from most to least frequent among the lanes.
    bait_counts = table(dproj$"Bait Set")
    bait_design = paste(names(bait_counts)[order(bait_counts, decreasing=TRUE)], collapse=", ");
    # Too long for the table: keep only the most frequent bait set ...
    if(nchar(bait_design)>50){
        bait_design<-strsplit(bait_design, ", ")[[1]][1]
    }
    # ... and if that is still too long, cut it at ".Homo" (regex: any
    # character followed by "Homo").
    if(nchar(bait_design)>50){
        bait_design<-strsplit(bait_design, ".Homo")[[1]][1]
    }

    callable_target = paste(na.omit(unique(dproj$"Target Territory")), collapse=", ");

    table1<-rbind(paste(used_samples," used samples/", unused_samples + used_samples," total samples", sep=""), sequencing_protocol, bait_design, callable_target)
    rownames(table1)<-c("Samples","Sequencing Protocol", "Bait Design","Callable Target")
    par(mar=c(0,0,1,0))
    textplot(table1, col.rownames="darkblue", show.colnames=FALSE, cex=1.25, valign="top")
    title(main=sprintf("Project Summary (%s)\n", projects), family="sans", cex.main=1.25, line=-1)

    # --- bases summary ----------------------------------------------------------
    # One column per aggregation level; rows are labelled below.
    lanes<-c(
        mean_sd_sci(dproj$"PF Reads (HS)"),
        mean_sd_sci(dproj$"PF HQ Aligned Q20 Bases"),
        mean_sd_fmt(dproj$"Mean Target Coverage", coverage_fmt),
        mean_sd_fmt(dproj$"Target Bases 10x %", percent_fmt),
        mean_sd_fmt(dproj$"Target Bases 20x %", percent_fmt),
        mean_sd_fmt(dproj$"Target Bases 30x %", percent_fmt)
    )
    samps<-c(
        mean_sd_sci(d2proj$"PF Reads"),
        mean_sd_sci(d2proj$"PF HQ Aligned Q20 Bases"),
        mean_sd_fmt(d2proj$"Mean Target Coverage", coverage_fmt),
        mean_sd_fmt(d2proj$"Target Bases 10x %", percent_fmt),
        mean_sd_fmt(d2proj$"Target Bases 20x %", percent_fmt),
        mean_sd_fmt(d2proj$"Target Bases 30x %", percent_fmt)
    )

    table2<-cbind(lanes, samps)
    colnames(table2)<-c("Per lane", "Per sample")
    rownames(table2)<-c("Reads", "Used bases", "Average target coverage", "% loci covered to 10x", "% loci covered to 20x","% loci covered to 30x")
    par(mar=c(0,0,1,0))
    textplot(table2, rmar=1, col.rownames="dark blue", cex=1.25, valign="top")
    title(main="Bases Summary", family="sans", cex.main=1.25, line=0)

    # --- sequencing summary -----------------------------------------------------
    # The instrument is guessed from the flowcell name.
    instrument <- c();
    if(length(grep("AAXX", dproj$Flowcell))>0){
        instrument <- c(instrument, "Illumina GA2")
    }
    if(length(grep("ABXX", dproj$Flowcell))>0){
        instrument <- c(instrument, "Illumina HiSeq")
    }
    # Always collapse to exactly one string. When no flowcell matched, the
    # old zero-length value made rbind() drop the row, and the seven row
    # names below no longer fitted the table.
    instrument <- paste(instrument, collapse=" and ")

    used_lanes = nrow(dproj);
    unused_lanes_by_sequencing = 0; #can we get this?
    unused_lanes_by_analysis = 0;

    lanes_per_sample = table(dproj$"External ID")
    lanes_per_sample_mean = mean(lanes_per_sample, na.rm=TRUE);
    lanes_per_sample_sd = sd(lanes_per_sample, na.rm=TRUE);
    lanes_per_sample_median = median(lanes_per_sample);

    lanes_paired = nrow(subset(dproj, dproj$"Lane Type" == "Paired"));
    lanes_widowed = nrow(subset(dproj, dproj$"Lane Type" == "Widowed"));
    lanes_single = nrow(subset(dproj, dproj$"Lane Type" == "Single"));

    # NAs are ignored, consistent with every other summary on this page.
    read_length_mean = mean(dproj$"Mean Read Length (P)", na.rm=TRUE);
    read_length_sd = sd(dproj$"Mean Read Length (P)", na.rm=TRUE);
    read_length_median = median(dproj$"Mean Read Length (P)", na.rm=TRUE);

    # Earliest and latest run dates. A month-name -> number substitution
    # (JAN -> 01 ... DEC -> 12) used to run before the sort and is disabled.
    # NOTE(review): confirm "Run Date" arrives as %d-%m-%Y; values that do
    # not parse become NA and are left in their original order.
    date = dproj$"Run Date";
    date = date[order(as.Date(date, format="%d-%m-%Y"))];
    start_date = date[1];
    end_date = date[length(date)];

    table3<-rbind(instrument, used_lanes, sprintf("%s rejected by sequencing, %s by analysis\n", unused_lanes_by_sequencing, unused_lanes_by_analysis), sprintf("%0.1f +/- %0.1f lanes (median=%0.1f)\n", lanes_per_sample_mean, lanes_per_sample_sd, lanes_per_sample_median), sprintf("%s paired, %s widowed, %s single\n", lanes_paired, lanes_widowed, lanes_single), sprintf("%0.1f +/- %0.1f bases (median=%0.1f)\n", read_length_mean, read_length_sd, read_length_median), sprintf("\tSequencing dates: %s to %s\n", start_date, end_date))
    rownames(table3)<-c("Sequencer", "Used lanes", "Unused lanes","Used lanes/sample", "Lane parities", "Read lengths", "Sequencing dates")
    par(mar=c(0,0,1,0))
    textplot(table3, rmar=1, col.rownames="dark blue", show.colnames=FALSE, cex=1.25, valign="top")
    title(main="Sequencing Summary", family="sans", cex.main=1.25, line=0)

    # --- variant summary --------------------------------------------------------
    # Named `report` rather than `eval` so base::eval is not shadowed.
    report = gsa.read.gatkreport(cmdargs$evalroot)

    eval.counts = report$CountVariants
    eval.counts.all = subset(eval.counts, Novelty == "all")$nVariantLoci;
    eval.counts.known = subset(eval.counts, Novelty == "known")$nVariantLoci;
    eval.counts.novel = subset(eval.counts, Novelty == "novel")$nVariantLoci;

    eval.titv = report$TiTvVariantEvaluator
    eval.titv.all = subset(eval.titv, Novelty == "all")$tiTvRatio;
    eval.titv.known = subset(eval.titv, Novelty == "known")$tiTvRatio;
    eval.titv.novel = subset(eval.titv, Novelty == "novel")$tiTvRatio;

    # Filled column by column: counts, observed Ti/Tv, expected Ti/Tv range.
    # NOTE(review): assumes each subset above yields exactly one value.
    table4 = matrix(c(eval.counts.all, eval.counts.known, eval.counts.novel, eval.titv.all, eval.titv.known, eval.titv.novel, "3.0 - 3.2", "3.2 - 3.4", "2.7 - 3.0"), nrow=3);
    rownames(table4) = c("All", "Known", "Novel");
    colnames(table4) = c("Found", "Ti/Tv ratio", "Expected Ti/Tv ratio");

    par(mar=c(0,0,0,0))
    textplot(table4, rmar=1, col.rownames="dark blue", cex=1.25, valign="top")
    title(main="Variant Summary", family="sans", cex.main=1.25, line=-2)

    # --- variants by allele count -----------------------------------------------
    # The per-sample variant-count and synonymous/non-synonymous plots were
    # already disabled; they read <evalroot>.SimpleMetricsBySample.csv and
    # <evalroot>.Functional_Class_Counts_by_Sample.csv.
    eval.ac = report$SimpleMetricsByAC.metrics
    eval.ac.all = subset(eval.ac, Novelty == "all");
    eval.ac.known = subset(eval.ac, Novelty == "known");
    eval.ac.novel = subset(eval.ac, Novelty == "novel");

    par(mar=c(5, 4, 4, 2) + 0.1)
    plot(eval.ac.all$AC, eval.ac.all$n, col="black", type="l", lwd=2, cex=1.1, cex.lab=1.1, cex.axis=1.1, xlab="Allele count", ylab="Number of variants", main="Variants by Allele Count", log="xy", bty="n");
    points(eval.ac.known$AC, eval.ac.known$n, col="blue", type="l", lwd=2);
    points(eval.ac.novel$AC, eval.ac.novel$n, col="red", type="l", lwd=2);
    legend("topright", c("All", "Known", "Novel"), col=c("black", "blue", "red"), lwd=2);

    dev.off()
}
|
||||
|
||||
# Write the tearsheet PDF (to cmdargs$tearout) as soon as the script runs.
tearsheet()
|
||||
|
||||
# Plots
|
||||
plots<-function(){
    # Writes the stand-alone "variants by allele count" plot to
    # cmdargs$plotout.
    #
    # Takes no arguments; reads the script-level cmdargs. Returns the value
    # of the final dev.off() call.
    #
    # The per-sample variant-count and synonymous/non-synonymous plots were
    # already disabled; they read <evalroot>.SimpleMetricsBySample.csv and
    # <evalroot>.Functional_Class_Counts_by_Sample.csv.

    # Load the VariantEval report here. The report read inside tearsheet()
    # is local to that function, so `eval` in this scope was base::eval and
    # `eval$...` could not work.
    report = gsa.read.gatkreport(cmdargs$evalroot)

    # Allele-count metrics split by novelty. The previous code subset an
    # object named eval.ac.called that is never defined.
    eval.ac = report$SimpleMetricsByAC.metrics
    eval.ac.all = subset(eval.ac, Novelty == "all");
    eval.ac.known = subset(eval.ac, Novelty == "known");
    eval.ac.novel = subset(eval.ac, Novelty == "novel");

    pdf(file= cmdargs$plotout, width=22, height=17, pagecentre=TRUE, pointsize=24)
    # Close the device even when plotting fails. After a normal run it is
    # already closed and this handler does nothing.
    plots_device <- dev.cur()
    on.exit(if (plots_device %in% dev.list()) dev.off(plots_device), add=TRUE)

    # Log-log line plot: all variants in black, known in blue, novel in red.
    plot(eval.ac.all$AC, eval.ac.all$n, col="black", type="l", lwd=2, cex=1.3, cex.lab=1.3, cex.axis=1.3, xlab="Allele count", ylab="Number of variants", main="", log="xy", bty="n");
    points(eval.ac.known$AC, eval.ac.known$n, col="blue", type="l", lwd=2);
    points(eval.ac.novel$AC, eval.ac.novel$n, col="red", type="l", lwd=2);
    legend("topright", c("All", "Known", "Novel"), col=c("black", "blue", "red"), lwd=2);

    dev.off();
}
|
||||
|
|
@ -1,266 +0,0 @@
|
|||
#New tearsheet generator
|
||||
.libPaths('/humgen/gsa-pipeline/.repository/R/')
|
||||
|
||||
suppressMessages(library(gplots));
|
||||
suppressMessages(library(ReadImages));
|
||||
suppressMessages(library(gsalib));
|
||||
|
||||
tearsheet<-function(){
|
||||
|
||||
def.par <- par(no.readonly = TRUE)
|
||||
|
||||
#define layout
|
||||
postable<-matrix(c(1, 1, 1, 1, rep(c(2, 2, 4, 4), 5), rep(c(3, 3, 4, 4), 3), rep(c(3,3,5,5), 5), 6,7,8,9), nrow=15, ncol=4, byrow=TRUE)
|
||||
layout(postable, heights=c(1, rep(.18, 13), 2), respect=FALSE)
|
||||
|
||||
#prep for title bar
|
||||
drop<-read.jpeg(system.file("data", "tearsheetdrop.jpg", package="gsalib"))
|
||||
|
||||
|
||||
#plot title bar
|
||||
par(mar=c(0,0,0,0))
|
||||
plot(drop)
|
||||
text(155, 50, cmdargs$title, family="serif", adj=c(0,0), cex=3, col=gray(.25))
|
||||
print("Title created...")
|
||||
|
||||
|
||||
# Project summary
|
||||
projects = paste(squids, collapse=", ");
|
||||
|
||||
used_samples = nrow(settable);
|
||||
|
||||
unused_samples = 0;
|
||||
|
||||
sequencing_protocol = samp$Initiative[1]
|
||||
|
||||
bait_design = samp$"Bait Set"[1]
|
||||
|
||||
callable_target = samp$"Target Territory"[1]
|
||||
|
||||
table1<-rbind(paste(used_samples," used samples/", unused_samples + used_samples," total samples", sep=""), sequencing_protocol, bait_design, callable_target)
|
||||
rownames(table1)<-c("Samples","Sequencing Initiative", "Bait Design","Callable Target")
|
||||
par(mar=c(0,0,1,0))
|
||||
textplot(table1, col.rownames="darkblue", show.colnames=FALSE, cex=1.25, valign="top")
|
||||
title(main=sprintf("Project Summary (%s)\n", projects), family="sans", cex.main=1.25, line=-1)
|
||||
print("Project summary created...")
|
||||
# Bases summary
|
||||
|
||||
reads_per_lane_mean = format(mean(lane$"PF Reads (HS)", na.rm=TRUE), 8, 3,1, scientific=TRUE);
|
||||
reads_per_lane_sd = format(sd(lane$"PF Reads (HS)", na.rm=TRUE), 8, 3,1, scientific=TRUE);
|
||||
lanessum<-sprintf("%s +/- %s\n", reads_per_lane_mean, reads_per_lane_sd)
|
||||
|
||||
used_bases_per_lane_mean = format(mean(lane$"PF HQ Aligned Q20 Bases", na.rm=TRUE),8, 3,1, scientific=TRUE);
|
||||
used_bases_per_lane_sd = format(sd(lane$"PF HQ Aligned Q20 Bases", na.rm=TRUE), 8, 3,1, scientific=TRUE);
|
||||
lanessum<-c(lanessum, sprintf("%s +/- %s\n", used_bases_per_lane_mean, used_bases_per_lane_sd));
|
||||
|
||||
target_coverage_mean = mean(na.omit(lane$"Mean Target Coverage"));
|
||||
target_coverage_sd = sd(na.omit(lane$"Mean Target Coverage"));
|
||||
lanessum<-c(lanessum, sprintf("%0.2fx +/- %0.2fx\n", target_coverage_mean, target_coverage_sd));
|
||||
|
||||
pct_loci_gt_10x_mean = mean(na.omit(lane$"Target Bases 10x %"));
|
||||
pct_loci_gt_10x_sd = sd(na.omit(lane$"Target Bases 10x %"));
|
||||
lanessum<-c(lanessum, sprintf("%0.2f%% +/- %0.2f%%\n", pct_loci_gt_10x_mean, pct_loci_gt_10x_sd));
|
||||
|
||||
pct_loci_gt_20x_mean = mean(na.omit(lane$"Target Bases 20x %"));
|
||||
pct_loci_gt_20x_sd = sd(na.omit(lane$"Target Bases 20x %"));
|
||||
lanessum<-c(lanessum,sprintf("%0.2f%% +/- %0.2f%%\n", pct_loci_gt_20x_mean, pct_loci_gt_20x_sd));
|
||||
|
||||
pct_loci_gt_30x_mean = mean(na.omit(lane$"Target Bases 30x %"));
|
||||
pct_loci_gt_30x_sd = sd(na.omit(lane$"Target Bases 30x %"));
|
||||
lanessum<-c(lanessum,sprintf("%0.2f%% +/- %0.2f%%\n", pct_loci_gt_30x_mean, pct_loci_gt_30x_sd));
|
||||
|
||||
|
||||
reads_per_sample_mean = format(mean(samp$"PF Reads", na.rm=TRUE), 8, 3,1, scientific=TRUE);
|
||||
reads_per_sample_sd = format(sd(samp$"PF Reads",na.rm=TRUE), 8, 3,1, scientific=TRUE);
|
||||
sampssum<-sprintf("%s +/- %s\n", reads_per_sample_mean, reads_per_sample_sd);
|
||||
|
||||
used_bases_per_sample_mean = format(mean(samp$"PF HQ Aligned Q20 Bases", na.rm=TRUE),8, 3,1, scientific=TRUE);
|
||||
used_bases_per_sample_sd = format(sd(samp$"PF HQ Aligned Q20 Bases", na.rm=TRUE), 8, 3,1, scientific=TRUE);
|
||||
sampssum<-c(sampssum, sprintf("%s +/- %s\n", used_bases_per_sample_mean, used_bases_per_sample_sd));
|
||||
|
||||
target_coverage_mean = mean(na.omit(samp$"Mean Target Coverage"));
|
||||
target_coverage_sd = sd(na.omit(samp$"Mean Target Coverage"));
|
||||
sampssum<-c(sampssum, sprintf("%0.2fx +/- %0.2fx\n", target_coverage_mean, target_coverage_sd));
|
||||
|
||||
pct_loci_gt_10x_mean = mean(na.omit(samp$"Target Bases 10x %"));
|
||||
pct_loci_gt_10x_sd = sd(na.omit(samp$"Target Bases 10x %"));
|
||||
sampssum<-c(sampssum, sprintf("%0.2f%% +/- %0.2f%%\n", pct_loci_gt_10x_mean, pct_loci_gt_10x_sd));
|
||||
|
||||
pct_loci_gt_20x_mean = mean(na.omit(samp$"Target Bases 20x %"));
|
||||
pct_loci_gt_20x_sd = sd(na.omit(samp$"Target Bases 20x %"));
|
||||
sampssum<-c(sampssum, sprintf("%0.2f%% +/- %0.2f%%\n", pct_loci_gt_20x_mean, pct_loci_gt_20x_sd));
|
||||
|
||||
pct_loci_gt_30x_mean = mean(na.omit(samp$"Target Bases 30x %"));
|
||||
pct_loci_gt_30x_sd = sd(na.omit(samp$"Target Bases 30x %"));
|
||||
sampssum<-c(sampssum, sprintf("%0.2f%% +/- %0.2f%%\n", pct_loci_gt_30x_mean, pct_loci_gt_30x_sd));
|
||||
|
||||
table2<-cbind(lanessum, sampssum)
|
||||
used_lanes = length(unique(paste(lane$Flowcell, lane$Lane)));
|
||||
if(nrow(lane)>used_lanes){
|
||||
colnames(table2)<-c("Per barcoded readgroup", "Per sample")
|
||||
}
|
||||
else{
|
||||
colnames(table2)<-c("Per lane", "Per sample")
|
||||
}
|
||||
rownames(table2)<-c("Reads", "Used bases", "Average target coverage", "% loci covered to 10x", "% loci covered to 20x","% loci covered to 30x")
|
||||
par(mar=c(0,0,1,0))
|
||||
textplot(table2, rmar=1, col.rownames="dark blue", cex=1.25, valign="top")
|
||||
title(main="Bases Summary", family="sans", cex.main=1.25, line=0)
|
||||
|
||||
print("Bases summary created...")
|
||||
|
||||
# Sequencing summary
|
||||
|
||||
instrument <- c();
|
||||
if(length(grep("AAXX", lane$Flowcell))>0){
|
||||
instrument <- c(instrument, "Illumina GA2")
|
||||
}
|
||||
if(length(grep("ABXX", lane$Flowcell))>0){
|
||||
instrument <- c(instrument, "Illumina HiSeq")
|
||||
}
|
||||
|
||||
if(length(instrument)>1){
|
||||
instrument<-paste(instrument[1], instrument[2], sep=" and ")
|
||||
}
|
||||
|
||||
used_lanes = length(unique(paste(lane$Flowcell, lane$Lane)));
|
||||
unused_lanes_by_sequencing = 0; #can we get this?
|
||||
unused_lanes_by_analysis = 0;
|
||||
|
||||
lanes_per_sample_mean = mean(table(lane$"External ID"), na.rm=TRUE);
|
||||
lanes_per_sample_sd = sd(table(lane$"External ID"), na.rm=TRUE);
|
||||
lanes_per_sample_median = median(table(lane$"External ID"));
|
||||
lanes_paired = length(unique(paste(subset(lane, lane$"Lane Type" == "Paired")$Flowcell, subset(lane, lane$"Lane Type" == "Paired")$Lane)));
|
||||
lanes_widowed = length(unique(paste(subset(lane, lane$"Lane Type" == "Widowed")$Flowcell, subset(lane, lane$"Lane Type" == "Widowed")$Lane)));
|
||||
lanes_single = length(unique(paste(subset(lane, lane$"Lane Type" == "Single")$Flowcell, subset(lane, lane$"Lane Type" == "Single")$Lane)));
|
||||
|
||||
read_length_mean = mean(lane$"Mean Read Length (P)");
|
||||
read_length_sd = sd(lane$"Mean Read Length (P)");
|
||||
read_length_median = median(lane$"Mean Read Length (P)");
|
||||
|
||||
|
||||
date = sort(as.Date(lane$"Run Date", format="%d-%b-%y"));
|
||||
|
||||
start_date = format(date[1], "%B %d, %Y");
|
||||
end_date = format(date[length(date)], "%B %d, %Y");
|
||||
|
||||
if(nrow(lane)>used_lanes){
|
||||
used_lanes<-paste(used_lanes, " (multiplexed; ", nrow(lane), " total barcoded readgroups)", sep="")
|
||||
}
|
||||
table3<-rbind(paste(instrument), used_lanes, sprintf("%s rejected by sequencing, %s by analysis\n", unused_lanes_by_sequencing, unused_lanes_by_analysis), sprintf("%0.1f +/- %0.1f lanes (median=%0.1f)\n", lanes_per_sample_mean, lanes_per_sample_sd, lanes_per_sample_median), sprintf("%s paired, %s widowed, %s single\n", lanes_paired, lanes_widowed, lanes_single), sprintf("%0.1f +/- %0.1f bases (median=%0.1f)\n", read_length_mean, read_length_sd, read_length_median), sprintf("\tSequencing dates: %s to %s\n", start_date, end_date))
|
||||
|
||||
rownames(table3)<-c("Sequencer", "Used lanes", "Unused lanes","Used lanes/sample", "Lane parities", "Read lengths", "Sequencing dates")
|
||||
par(mar=c(0,0,1,0))
|
||||
textplot(table3, rmar=1, col.rownames="dark blue", show.colnames=FALSE, cex=1.25, valign="top")
|
||||
title(main="Sequencing Summary", family="sans", cex.main=1.25, line=0)
|
||||
|
||||
print("Sequencing summary created...")
|
||||
|
||||
# Variant summary
|
||||
|
||||
eval.counts = basiceval$CountVariants
|
||||
if("FunctionalClass" %in% colnames(eval.counts)){
|
||||
eval.counts= subset(eval.counts, FunctionalClass == "all")
|
||||
}
|
||||
if("Sample" %in% colnames(eval.counts)){
|
||||
eval.counts= subset(eval.counts, Sample == "all")
|
||||
}
|
||||
if("Filter" %in% colnames(eval.counts)){
|
||||
eval.counts= subset(eval.counts, Filter == "called")
|
||||
}
|
||||
eval.counts.all = subset(eval.counts, Novelty == "all")$nVariantLoci;
|
||||
eval.counts.known = subset(eval.counts,Novelty == "known")$nVariantLoci;
|
||||
eval.counts.novel = subset(eval.counts, Novelty == "novel")$nVariantLoci;
|
||||
|
||||
eval.titv = basiceval$TiTvVariantEvaluator
|
||||
if("FunctionalClass" %in% colnames(eval.titv)){
|
||||
eval.titv= subset(eval.titv, FunctionalClass == "all")
|
||||
}
|
||||
if("Sample" %in% colnames(eval.titv)){
|
||||
eval.titv= subset(eval.titv, Sample == "all")
|
||||
}
|
||||
if("Filter" %in% colnames(eval.titv)){
|
||||
eval.titv= subset(eval.titv, Filter == "called")
|
||||
}
|
||||
eval.titv.all = subset(eval.titv, Novelty == "all")$tiTvRatio;
|
||||
eval.titv.known = subset(eval.titv, Novelty == "known")$tiTvRatio;
|
||||
eval.titv.novel = subset(eval.titv, Novelty == "novel")$tiTvRatio;
|
||||
|
||||
table4 = matrix(c(eval.counts.all, eval.counts.known, eval.counts.novel, eval.titv.all, eval.titv.known, eval.titv.novel, "3.0 - 3.2", "3.2 - 3.4", "2.7 - 3.0"), nrow=3);
|
||||
|
||||
rownames(table4) = c("All", "Known", "Novel");
|
||||
colnames(table4) = c("Found", "Ti/Tv ratio", "Expected Ti/Tv ratio");
|
||||
|
||||
print("Variant summary created...")
|
||||
|
||||
par(mar=c(0,0,0,0))
|
||||
textplot(table4, rmar=1, col.rownames="dark blue", cex=1.25, valign="top")
|
||||
title(main="Variant Summary", family="sans", cex.main=1.25, line=-2)
|
||||
|
||||
eval.bysample = SAeval$CountVariants
|
||||
eval.bysample.all = subset(eval.bysample, Novelty == "all" & Sample != "all");
|
||||
eval.bysample.known = subset(eval.bysample, Novelty == "known"& Sample != "all");
|
||||
eval.bysample.novel = subset(eval.bysample, Novelty == "novel"& Sample != "all");
|
||||
|
||||
eval.bysampleTITV = SAeval$TiTvVariantEvaluator
|
||||
eval.bysampleTITV.all = subset(eval.bysampleTITV, Novelty == "all" & Sample != "all");
|
||||
eval.bysampleTITV.known = subset(eval.bysampleTITV, Novelty == "known"& Sample != "all");
|
||||
eval.bysampleTITV.novel = subset(eval.bysampleTITV, Novelty == "novel"& Sample != "all");
|
||||
|
||||
|
||||
eval.ac = basiceval$SimpleMetricsByAC.metrics
|
||||
# Guard on eval.ac's own columns: the subset below is applied to eval.ac, not eval.titv.
if("FunctionalClass" %in% colnames(eval.ac)){
|
||||
eval.ac= subset(eval.ac, FunctionalClass == "all")
|
||||
}
|
||||
# Guard on eval.ac's own columns: the subset below is applied to eval.ac, not eval.titv.
if("Sample" %in% colnames(eval.ac)){
|
||||
eval.ac= subset(eval.ac, Sample == "all")
|
||||
}
|
||||
# Guard on eval.ac's own columns: the subset below is applied to eval.ac, not eval.titv.
if("Filter" %in% colnames(eval.ac)){
|
||||
eval.ac= subset(eval.ac, Filter == "called")
|
||||
}
|
||||
|
||||
eval.ac.all = subset(eval.ac, Novelty == "all");
|
||||
eval.ac.known = subset(eval.ac, Novelty == "known");
|
||||
eval.ac.novel = subset(eval.ac, Novelty == "novel");
|
||||
|
||||
eval.func = FCeval$CountVariants
|
||||
|
||||
par(mar=c(5, 5, 4, 2) + 0.1)
|
||||
|
||||
|
||||
# Per-sample Ti/Tv distribution for all/known/novel calls. `main` is given once only
# (the call previously also passed main="", duplicating the named argument).
boxplot(eval.bysampleTITV.all$tiTvRatio, eval.bysampleTITV.known$tiTvRatio, eval.bysampleTITV.novel$tiTvRatio, main="Ti/Tv by Sample", col=c("dark gray", "blue", "red"), names=c("All", "Known", "Novel"), ylab="Ti/Tv per sample", cex=1.3, cex.lab=1.3, cex.axis=1.3);
|
||||
|
||||
par(mar=c(7, 5, 4, 2) + 0.1)
|
||||
ind = order(eval.bysample.all$nVariantLoci);
|
||||
plot(eval.bysample.all$nVariantLoci[ind], xlab="",pch=16, col="black", xaxt="n", cex=1.1, cex.lab=1.1, cex.axis=1.1, main="Variants per Sample", ylab="Number of variants\n(axis in log space)", bty="n", log="y",ylim=c(1, max(eval.bysample.all$nVariantLoci)));
|
||||
points(eval.bysample.known$nVariantLoci[ind], pch=16, col="blue", cex=1.3);
|
||||
points(eval.bysample.novel$nVariantLoci[ind], pch=16,col="red", cex=1.3);
|
||||
# Legend is positioned by keyword; the former numeric y coordinate and the stray
# empty positional argument are dropped and the labels are passed by name.
legend("bottomleft", legend=c("All", "Known", "Novel"), col=c("black", "blue", "red"), pt.cex=1.3, pch=16);
|
||||
if(nrow(samp)<25){
|
||||
axis(1, at=c(1:length(eval.bysample.all$Sample[ind])), lab=eval.bysample.all$Sample[ind], cex=.7, las=2 )
|
||||
}else{
|
||||
axis(1, at=c(1:nrow(samp)), lab=rep("", nrow(samp)), cex=0.1, las=2, lwd.ticks=0)
|
||||
title(xlab="Sample\n(too many individuals to label)")
|
||||
}
|
||||
|
||||
par(mar=c(6, 5, 4, 2) + 0.1)
|
||||
plot(sort(eval.ac.all$AC), eval.ac.all$n[order(eval.ac.all$AC)], ylim=c(1, max(eval.ac$n)), col="black", type="l", lwd=2, cex=1.1, cex.lab=1.1, cex.axis=1.1, xlab="Allele count\n(axis in log space)", ylab="Number of variants\n(axis in log space)", main="Variants by Allele Count", log="xy", bty="n");
|
||||
points(sort(eval.ac.known$AC), eval.ac.known$n[order(eval.ac.known$AC)], col="blue", type="l", lwd=2);
|
||||
points(sort(eval.ac.novel$AC), eval.ac.novel$n[order(eval.ac.novel$AC)], col="red", type="l", lwd=2);
|
||||
if(nrow(samp)<25){
|
||||
legend("bottomleft", c("All", "Known", "Novel"), col=c("black", "blue", "red"), lwd=2);
|
||||
}else{
|
||||
legend("topright", c("All", "Known", "Novel"), col=c("black", "blue", "red"), lwd=2);
|
||||
}
|
||||
par(mar=c(5, 5, 4, 2) + 0.1)
|
||||
|
||||
barplot(eval.func$nVariantLoci[4:nrow(eval.func)], col=c("dark gray", "blue", "red"), space=c(.2,0,0), log="y", main="Variants by Functional Class", xlab="Functional Class", ylab="Number of variants\n(axis in log space)")
|
||||
axis(1, at=c(1.5,5,8.5), lab=c("Missense", "Nonsense", "Silent"), cex=.5, tick=FALSE)
|
||||
legend("top", c("All", "Known", "Novel"), fill=c("dark gray", "blue", "red"), cex=.7);
|
||||
|
||||
print("Graphs created...")
|
||||
|
||||
print("All done!")
|
||||
par(def.par)#- reset to default
|
||||
}
|
||||
|
||||
|
|
@ -1,41 +0,0 @@
|
|||
source("/humgen/gsa-pipeline/.repository/R/DataProcessingReport/qcplots.r")
|
||||
suppressMessages(library(gplots));
|
||||
def.par <- par(no.readonly = TRUE)
|
||||
|
||||
|
||||
cmdargs = gsa.getargs(
|
||||
list(
|
||||
tsv = list(value=NA, doc="pipeline tsv file"),
|
||||
evalroot = list(value=NA, doc="VariantEval file base (everything before the .eval)"),
|
||||
reportout = list(value=NA, doc="Output path for report PDF")#,
|
||||
),
|
||||
doc="Creates a variant report"
|
||||
);
|
||||
|
||||
read.delim(cmdargs$tsv, header=FALSE)->settable
|
||||
|
||||
squids<-unique(settable[,1])
|
||||
|
||||
|
||||
gsa.read.gatkreport(paste(cmdargs$evalroot, ".eval", sep=""))->basiceval
|
||||
gsa.read.gatkreport(paste(cmdargs$evalroot, ".extraSA.eval", sep=""))->SAeval
|
||||
print("Evals read")
|
||||
|
||||
pdf(file= cmdargs$reportout, width=22, height=17, pagecentre=TRUE, pointsize=24)
|
||||
print("PDF created...")
|
||||
|
||||
|
||||
path="."
|
||||
weirdos<-which(SAeval$TiTvVariantEvaluator$Sample %in% SAeval$TiTvVariantEvaluator$Sample[which(SAeval$TiTvVariantEvaluator$tiTvRatio <2)])
|
||||
|
||||
novelAC(SAeval)
|
||||
knownAC(SAeval)
|
||||
AllAC(SAeval)
|
||||
layout(matrix(c(6,1, 2,3, 4, 5), nrow=6), heights=c(1, 1, 1, 1, 1,1))
|
||||
textplot("Sample Novel TiTv ranges should be above 2, as they are in previous datasets. \nSamples with lower TiTv data are flagged in subsequent plots with hot pink labels, and listed below:")
|
||||
textplot(paste(unique(SAeval$TiTvVariantEvaluator$Sample[weirdos]), collapse=", "), halign="left")
|
||||
textplot("Problem Samples frequently have unusually high or low numbers of variants.")
|
||||
textplot("Samples with unusually high numbers of novel variants may be from different populations, and, as such, should have higher heterozygosity. \nIf this is not the case, there may be problems with the samples.")
|
||||
textplot("Unusually high numbers of variants with low allele counts may indicate variants generated from problematic samples.")
|
||||
textplot("Notes for interpreting QC data:")
|
||||
dev.off()
|
||||
|
|
@ -1,177 +0,0 @@
|
|||
#preqc.r
|
||||
library(gplots)
|
||||
.libPaths('/humgen/gsa-pipeline/.repository/R/')
|
||||
library(gsalib)
|
||||
|
||||
cmdargs = gsa.getargs(
|
||||
list(
|
||||
tsv = list(value=NA, doc="pipeline tsv file"),
|
||||
qcout=list(value=NA, doc="path to output root")
|
||||
),
|
||||
doc="Creates a tearsheet"
|
||||
);
|
||||
|
||||
read.delim(cmdargs$tsv, header=FALSE)->settable
|
||||
|
||||
squids<-unique(settable[,1])
|
||||
print(paste(nrow(settable), "samples in tsv"))
|
||||
lane<-data.frame()
|
||||
samp<-data.frame()
|
||||
for(squid in squids){
|
||||
gsa.read.squidmetrics(squid, TRUE)->lanemetrics
|
||||
print(paste("Got lane metrics for", squid))
|
||||
addlanes<-lanemetrics[which(lanemetrics$"External ID" %in% settable[,2]),]
|
||||
gsa.read.squidmetrics(squid, FALSE)->samplemetrics
|
||||
print(paste("Got sample metrics for", squid))
|
||||
addsamps<-samplemetrics[which(samplemetrics$Sample %in% settable[,2]),]
|
||||
lane<-rbind(lane, addlanes)
|
||||
samp<-rbind(samp, addsamps)
|
||||
}
|
||||
|
||||
print(paste(nrow(samp), "samples in samp"))
|
||||
print(paste(length(unique(lane$"External ID")), "samples in lane"))
|
||||
|
||||
print(paste(setdiff(settable[,2], samp$Sample), "do not overlap between samp and tsv"))
|
||||
print(paste(setdiff(settable[,2], lane$"External ID"), "do not overlap between lane and tsv"))
|
||||
print(paste(setdiff(samp$Sample, lane$"External ID"), "do not overlap between lane and samp"))
|
||||
|
||||
missingSamp<-setdiff(settable[,2], samp$Sample)
|
||||
missingLane<-setdiff(settable[,2], lane$"External ID")
|
||||
|
||||
drv = dbDriver("Oracle");
|
||||
con = dbConnect(drv, "REPORTING/REPORTING@ora01:1521/SEQPROD");
|
||||
|
||||
rs = dbSendQuery(con, statement = paste("SELECT * FROM ILLUMINA_PICARD_METRICS"));
|
||||
d = fetch(rs, n=-1);
|
||||
dbHasCompleted(rs);
|
||||
dbClearResult(rs);
|
||||
|
||||
rs2 = dbSendQuery(con, statement = paste("SELECT * FROM ILLUMINA_SAMPLE_STATUS_AGG"));
|
||||
d2 = fetch(rs2, n=-1);
|
||||
dbHasCompleted(rs2);
|
||||
dbClearResult(rs2);
|
||||
|
||||
# Release the connection opened by dbConnect() before unloading the driver.
dbDisconnect(con);
oraCloseDriver(drv);
|
||||
|
||||
compsamp=d2[which(d2$"Bait Set" %in% samp$"Bait Set"),]
|
||||
complane=d[which(d$"Bait Set" %in% lane$"Bait Set"),]
|
||||
|
||||
|
||||
|
||||
pdf(paste(cmdargs$qcout, "pdf", sep="."), width=11, height=8.5)
|
||||
|
||||
plot(samp$"Target Bases 20x %", main="Coverage to 20x", ylab="% Targets Covered to 20x", xlab="Sample", ylim=c(0,100))
|
||||
abline(h=80, lty=2)
|
||||
legend("bottomright", lty=2, legend="80% coverage to 20x")
|
||||
lowcoverage<-samp$Sample[which(samp$"Target Bases 20x %"<80)]
|
||||
if(length(lowcoverage)>0){
|
||||
text(which(samp$"Target Bases 20x %"<80),samp$"Target Bases 20x %"[which(samp$"Target Bases 20x %"<80)], labels=samp$Sample[which(samp$"Target Bases 20x %"<80)], pos=2, srt=270, cex=.6, col="hotpink")
|
||||
}
|
||||
|
||||
plot(samp$"Zero Coverage Targets %", main="Zero Coverage", ylab="% Targets with zero coverage", log="y", xlab="Sample", ylim=c(0.01,100))
|
||||
abline(h=3, lty=2)
|
||||
legend("bottomright", lty=2, legend="3% Targets Zero Coverage")
|
||||
# Add samples above the 3% zero-coverage threshold; use the full column name
# ("Zero Coverage Targets %") that the plot and labels around this line use.
lowcoverage<-c(lowcoverage,samp$Sample[which(samp$"Zero Coverage Targets %">3)])
|
||||
if(length(which(samp$"Zero Coverage Targets %">3))>0){
|
||||
text(which(samp$"Zero Coverage Targets %">3), samp$"Zero Coverage Targets %"[which(samp$"Zero Coverage Targets %">3)], labels=samp$Sample[which(samp$"Zero Coverage Targets %">3)], pos=2, srt=270, cex=.6, col="hotpink")
|
||||
}
|
||||
|
||||
print("Coverage stats done")
|
||||
nofp<-lane$"External ID"[which(is.na(lane$"FP LOD"))]
|
||||
|
||||
if(length(which(is.na(lane$"FP LOD")))< nrow(lane)){
|
||||
|
||||
plot(lane$"FP Confident Calls"~as.factor(lane$"External ID"), xlab="sample", ylab="Multiplex level # FP calls", main="Fingerprint Calls/Sample Instance", xaxt="n")
|
||||
medians<-tapply(lane$"FP Confident Calls",lane$"External ID", median, na.rm=TRUE)
|
||||
points(as.factor(dimnames(medians)[[1]]),medians,col="red", lwd=2)
|
||||
legend("topleft", legend="Median across sample instances", pch=1, lwd=2, col="red", lty=0)
|
||||
poorFPcov<-dimnames(medians)[[1]][which(medians<5 )]
|
||||
if(length(poorFPcov)>0){
|
||||
text(which(medians<5), medians[which(medians<5)],poorFPcov, pos=2, srt=270, cex=.6, col="hotpink")
|
||||
}
|
||||
|
||||
print("1 fp plot")
|
||||
plot(100*(lane$"FP Confident Matching SNPs"/lane$"FP Confident Calls")~as.factor(lane$"External ID"), xlab="sample", ylab="Multiplex level % matching FP calls", main="% Confident calls matching for samples with low confident calls", xaxt="n", ylim=c(0,110))
|
||||
|
||||
print("2 fp plot")
|
||||
|
||||
plot(lane$"FP LOD"~as.factor(lane$"External ID"), xlab="sample", ylab="Sample Fingerprint LOD", main="Fingerprint Pass:Samples", xaxt="n")
|
||||
offsamps<-lane$"External ID"[which(lane$"FP LOD"<(-3))]
|
||||
lowfpLOD<-lane$"External ID"[which(lane$"FP LOD"<6)]
|
||||
|
||||
if(length(lowfpLOD)>0){
|
||||
# Label rows with FP LOD below 6. Position, value and label all use the same row
# index so they stay aligned, and the column is read as "FP LOD" (the name used elsewhere).
# NOTE(review): x is a row index while the plot is grouped by External ID -- confirm label placement.
text(which(lane$"FP LOD"<6), lane$"FP LOD"[which(lane$"FP LOD"<6)], labels=lowfpLOD, pos=2, srt=270, cex=.6, col="hotpink")
|
||||
}
|
||||
print("3 fp plot")
|
||||
|
||||
if(length(lowfpLOD)>0){
|
||||
plot((lane$"FP Confident Calls"-lane$"FP Confident Matching SNPs")~as.factor(lane$"External ID"), main="Calls vs Matching Calls for Samples failing FP QC", ylab="# Mismatches", xlab="")
|
||||
}
|
||||
if(length(lowfpLOD)>0){
|
||||
# Read the "FP LOD" column (the name used elsewhere); "FP_LOD" does not match it.
# NOTE(review): y is the LOD value while the preceding plot shows mismatch counts -- confirm intended y.
text(which(lane$"FP LOD"<6), lane$"FP LOD"[which(lane$"FP LOD"<6)], labels=lowfpLOD, pos=2, srt=270, cex=.6, col="RED")
|
||||
}
|
||||
|
||||
|
||||
}else{
|
||||
offsamps<-"NO FPDATA"
|
||||
lowfpLOD<-"NO FP DATA"
|
||||
poorFPcov<-"NO FP DATA"
|
||||
}
|
||||
print("FP stats done")
|
||||
|
||||
boxplot(samp$"Total SNPs", compsamp$"Total SNPs", names=c("Current Set", "All Sets"), ylab="Total SNPs per sample", main="Total SNPs")
|
||||
standardQuants<-boxplot.stats(compsamp$"Total SNPs")$stats
|
||||
offSNPs<-samp$Sample[which(samp$"Total SNPs" <standardQuants[1])]
|
||||
offSNPs<-c(offSNPs, samp$Sample[which(samp$"Total SNPs" >standardQuants[5])])
|
||||
if(length(offSNPs >0)){
|
||||
# Take labels from the same row selection as the y values; offSNPs itself is ordered
# low-outliers-then-high-outliers, which need not match row order.
text(1, samp$"Total SNPs"[which(samp$Sample %in% offSNPs)], labels=samp$Sample[which(samp$Sample %in% offSNPs)], pos=2, col="hot pink")
|
||||
}
|
||||
print("SNP stats done")
|
||||
|
||||
boxplot(samp$"dbSNP %", compsamp$"dbSNP %", names=c("Current Set", "All Sets"), ylab="% SNPs in dbSNP per sample", main="dbSNP Percentage")
|
||||
standardQuants<-boxplot.stats(compsamp$"dbSNP %")$stats
|
||||
offdbSNP<-samp$Sample[which(samp$"dbSNP %" <standardQuants[1])]
|
||||
offdbSNP<-c(offdbSNP, samp$Sample[which(samp$"dbSNP %" >standardQuants[5])])
|
||||
if(length(offdbSNP >0)){
|
||||
# Take labels from the same row selection as the y values; offdbSNP itself is ordered
# low-outliers-then-high-outliers, which need not match row order.
text(1, samp$"dbSNP %"[which(samp$Sample %in% offdbSNP)], labels=samp$Sample[which(samp$Sample %in% offdbSNP)], pos=2, col="hot pink")
|
||||
}
|
||||
print("DBSNP stats done")
|
||||
|
||||
sampDuplication<-sub(pattern="Catch-.*: ", "",samp$"Library Duplication %")
|
||||
sampDuplication<-as.numeric(sub("%", "", sampDuplication))
|
||||
compsampDuplication<-sub(pattern="Catch-.*: ", "",compsamp$"Library Duplication %")
|
||||
compsampDuplication<-as.numeric(sub("%", "", compsampDuplication))
|
||||
|
||||
boxplot(sampDuplication, compsampDuplication, names=c("Current Set", "All Sets"), ylab="% Duplication", main="Library Duplication")
|
||||
standardQuants<-boxplot.stats(compsampDuplication)$stats
|
||||
offDup<-samp$Sample[which(sampDuplication <standardQuants[1])]
|
||||
offDup<-c(offDup, samp$Sample[which(sampDuplication >standardQuants[5])])
|
||||
if(length(offDup >0)){
|
||||
# Take labels from the same row selection as the y values; offDup itself is ordered
# low-outliers-then-high-outliers, which need not match row order.
text(1, sampDuplication[which(samp$Sample %in% offDup)], labels=samp$Sample[which(samp$Sample %in% offDup)], pos=2, col="hot pink")
|
||||
}
|
||||
print("Duplication stats done")
|
||||
|
||||
allproblemsamples<-unique(c(lowcoverage, poorFPcov, offsamps, lowfpLOD, offSNPs, offdbSNP, offDup, missingLane, missingSamp))
|
||||
problemMat<-matrix(c(rep("PASS", length(allproblemsamples)*9)), nrow=length(allproblemsamples))
|
||||
rownames(problemMat)<-allproblemsamples
|
||||
colnames(problemMat)<-c("low coverage", "low fp cov", "Identity Fail", "low FP LOD", "weird SNP count", "weird dbSNP %", "Duplicated", "Missing lane data", "missing agg data")
|
||||
problemMat[which(rownames(problemMat) %in% lowcoverage),1]<-"FAIL"
|
||||
problemMat[which(rownames(problemMat) %in% poorFPcov),2]<-"FAIL"
|
||||
# offsamps (FP LOD below -3) belongs in column 3, "Identity Fail"; column 2 is "low fp cov".
problemMat[which(rownames(problemMat) %in% offsamps),3]<-"FAIL"
|
||||
problemMat[which(rownames(problemMat) %in% lowfpLOD),4]<-"FAIL"
|
||||
problemMat[which(rownames(problemMat) %in% offSNPs),5]<-"FAIL"
|
||||
problemMat[which(rownames(problemMat) %in% offdbSNP),6]<-"FAIL"
|
||||
problemMat[which(rownames(problemMat) %in% offDup),7]<-"FAIL"
|
||||
problemMat[which(rownames(problemMat) %in% missingLane),8]<-"FAIL"
|
||||
problemMat[which(rownames(problemMat) %in% missingSamp),9]<-"FAIL"
|
||||
|
||||
textplot(problemMat, cex=.5)
|
||||
|
||||
write.table(problemMat, file=paste(cmdargs$qcout,"qc.table",sep="."), quote=FALSE, sep="\t")
|
||||
print("no fp")
|
||||
print(unique(nofp))
|
||||
|
||||
|
||||
|
||||
dev.off()
|
||||
print("All stats done")
|
||||
|
|
@ -1,181 +0,0 @@
|
|||
.libPaths('/humgen/gsa-firehose2/pipeline/repositories/StingProduction/R/')
|
||||
# Prepend the local checkout to the current library paths instead of replacing them,
# so a path configured earlier in this script is kept.
.libPaths(c('~/Documents/Sting/R/', .libPaths()))
|
||||
|
||||
library(gsalib)
|
||||
def.par <- par(no.readonly = TRUE)
|
||||
|
||||
titvplot<-function(current){
|
||||
par(mfcol=c(1,2))
|
||||
titvs<-c()
|
||||
status<-c()
|
||||
for(i in c(1:12)){
|
||||
# Load the i-th saved exome set from under `path`; the separator matches the
# "%s/exome.%i" form used by the other plotting functions in this file.
load(sprintf("%s/exome.%i", path, i));
|
||||
info<-subset(data$TiTvVariantEvaluator, Sample!="all")
|
||||
titvs<-c(titvs, info$tiTvRatio)
|
||||
status<-c(status, info$Novelty)
|
||||
print(length(titvs))
|
||||
print(length(status))
|
||||
}
|
||||
print(length(unique(current$TiTvVariantEvaluator$Sample))-1)
|
||||
|
||||
length(unique(current$TiTvVariantEvaluator$Sample))-1+length(titvs[which(status=="novel")])->nvalues
|
||||
print(length(titvs[which(status=="novel")]))
|
||||
print(nvalues)
|
||||
plot(current$TiTvVariantEvaluator$tiTvRatio[which(current$TiTvVariantEvaluator$Sample!="all" & current$TiTvVariantEvaluator$Novelty=="novel")], xlim=c(0,nvalues), ylim=c(0,4), col="red", main="Current samples compared to previous samples from 12 sets", ylab="Per sample Ti/Tv", xlab="sample")
|
||||
points(current$TiTvVariantEvaluator$tiTvRatio[which(current$TiTvVariantEvaluator$Sample!="all" & current$TiTvVariantEvaluator$Novelty=="known")], col="blue")
|
||||
points(current$TiTvVariantEvaluator$tiTvRatio[which(current$TiTvVariantEvaluator$Sample!="all" & current$TiTvVariantEvaluator$Novelty=="all")], col="black")
|
||||
points(c(length(unique(current$TiTvVariantEvaluator$Sample)):nvalues), titvs[which(status=="novel")], pch=16, col="red")
|
||||
points(c(length(unique(current$TiTvVariantEvaluator$Sample)):nvalues), titvs[which(status=="known")], pch=16, col="blue")
|
||||
points(c(length(unique(current$TiTvVariantEvaluator$Sample)):nvalues), titvs[which(status=="all")], pch=16, col="black")
|
||||
|
||||
legend("bottomleft", col=c("red", "blue", "black"), pch=c(1,1,1,16,16, 16),legend=c("novel variants:current set", "known variants:current set", "all varaints:current set", "novel variants:previous sets", "known variants:previous sets", "all variants: previous sets"))
|
||||
weirdos<-which(current$TiTvVariantEvaluator$Sample %in% current$TiTvVariantEvaluator$Sample[which(current$TiTvVariantEvaluator$tiTvRatio <2.0)])
|
||||
if(length(weirdos)>0){
|
||||
text(weirdos[c(1:(length(weirdos)/3))],current$TiTvVariantEvaluator$tiTvRatio[weirdos], labels=current$TiTvVariantEvaluator$Sample[weirdos], pos=4, cex=.7, col="hot pink")
|
||||
}
|
||||
|
||||
boxplot(current$TiTvVariantEvaluator$tiTvRatio[which(current$TiTvVariantEvaluator$Sample!="all" & current$TiTvVariantEvaluator$Novelty=="novel")],titvs[which(status=="novel")], current$TiTvVariantEvaluator$tiTvRatio[which(current$TiTvVariantEvaluator$Sample!="all" & current$TiTvVariantEvaluator$Novelty=="known")],titvs[which(status=="known")], current$TiTvVariantEvaluator$tiTvRatio[which(current$TiTvVariantEvaluator$Sample!="all" & current$TiTvVariantEvaluator$Novelty=="all")], titvs[which(status=="all")], col=rep(c("red", "blue", "black"), each=2), main="Current v. Previous per sample Ti/TV", xlab="Sample Sets",ylab="Ti/Tv per sample", xaxt="n" )
|
||||
axis(side=1, at=c(1:6)-.2, labels=rep(c("current", "previous"), 3), cex.axis=.7)
|
||||
legend("bottomleft",legend=c("novel", "known", "all"), fill=c("red", "blue", "black"))
|
||||
if(length(weirdos)>0){
|
||||
text(rep(c(5,3,1), each=(length(weirdos)/3)),current$TiTvVariantEvaluator$tiTvRatio[weirdos], labels=current$TiTvVariantEvaluator$Sample[weirdos], pos=4, cex=.7, col="hot pink")
|
||||
}
|
||||
par(def.par)#- reset to default
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
variantplots<-function(current){
|
||||
par(mfcol=c(1,2))
|
||||
|
||||
variants<-c()
|
||||
status<-c()
|
||||
for(i in c(1:12)){
|
||||
load(sprintf("%s/exome.%i", path, i));
|
||||
info<-subset(data$CountVariants, Sample!="all")
|
||||
variants<-c(variants, info$nSNPs)
|
||||
status<-c(status, info$Novelty)
|
||||
}
|
||||
|
||||
length(unique(current$CountVariants$Sample))-1+length(variants[which(status=="novel")])->nvalues
|
||||
plot(current$CountVariants$nSNPs[which(current$CountVariants$Sample!="all" & current$CountVariants$Novelty=="novel")], xlim=c(0,nvalues), ylim=c(1,25000), log="y", col="red", main="Current samples compared to previous samples from 12 sets", ylab="Per sample #SNPs", xlab="sample")
|
||||
points(current$CountVariants$nSNPs[which(current$CountVariants$Sample!="all" & current$CountVariants$Novelty=="known")], col="blue")
|
||||
points(current$CountVariants$nSNPs[which(current$CountVariants$Sample!="all" & current$CountVariants$Novelty=="all")], col="black")
|
||||
points(c(length(unique(current$CountVariants$Sample)):nvalues), variants[which(status=="novel")], pch=16, col="red")
|
||||
points(c(length(unique(current$CountVariants$Sample)):nvalues), variants[which(status=="known")], pch=16, col="blue")
|
||||
points(c(length(unique(current$CountVariants$Sample)):nvalues), variants[which(status=="all")], pch=16, col="black")
|
||||
|
||||
legend("bottomleft", col=c("red", "blue", "black"), pch=c(1,1,1,16,16, 16),legend=c("novel variants:current set", "known variants:current set", "all varaints:current set", "novel variants:previous sets", "known variants:previous sets", "all variants: previous sets"))
|
||||
|
||||
weirdos<-which(current$CountVariants$Sample %in% current$TiTvVariantEvaluator$Sample[which(current$TiTvVariantEvaluator$tiTvRatio <2.0)])
|
||||
if(length(weirdos)>0){
|
||||
|
||||
text(weirdos[c(1:(length(weirdos)/3))],current$CountVariants$nSNPs[weirdos], labels=current$CountVariants$Sample[weirdos], pos=4, cex=.7, col="hot pink")
|
||||
}
|
||||
|
||||
boxplot(current$CountVariants$nSNPs[which(current$CountVariants$Sample!="all" & current$CountVariants$Novelty=="novel")],variants[which(status=="novel")], current$CountVariants$nSNPs[which(current$CountVariants$Sample!="all" & current$CountVariants$Novelty=="known")],variants[which(status=="known")], current$CountVariants$nSNPs[which(current$CountVariants$Sample!="all" & current$CountVariants$Novelty=="all")], variants[which(status=="all")], col=rep(c("red", "blue", "black"), each=2), main="Current v. Previous per sample #SNPs", xlab="Sample Sets",ylab="SNPs per sample", xaxt="n", ylim=c(10,25000), log="y")
|
||||
axis(side=1, at=c(1:6)-.2, labels=rep(c("current", "previous"), 3), cex.axis=.7)
|
||||
if(length(weirdos)>0){
|
||||
|
||||
text(rep(c(5,3,1), each=(length(weirdos)/3)),current$CountVariants$nSNPs[weirdos], labels=current$CountVariants$Sample[weirdos], pos=4, cex=.7, col="hot pink")
|
||||
}
|
||||
legend("topleft",legend=c("novel", "known", "all"), fill=c("red", "blue", "black"))
|
||||
par(def.par)#- reset to default
|
||||
|
||||
}
|
||||
|
||||
# Draws two side-by-side panels comparing per-sample heterozygosity of the
# current call set with the twelve stored sets read from <path>/exome.1..12.
# Left panel: scatter of current (open symbols) and previous (filled symbols)
# values; right panel: paired box plots per novelty class.
# Relies on the globals `path` and `def.par`; each load() must provide `data`.
heteroplots <- function(current) {
  par(mfcol = c(1, 2))

  # Accumulate heterozygosity and novelty class over the stored sets.
  hets <- c()
  status <- c()
  for (i in c(1:12)) {
    load(sprintf("%s/exome.%i", path, i))
    info <- subset(data$CountVariants, Sample != "all")
    hets <- c(hets, info$heterozygosity)
    status <- c(status, info$Novelty)
  }

  counts <- current$CountVariants
  classes <- c("novel", "known", "all")
  shades <- c("red", "blue", "black")

  # Per-sample values of the current set / of the stored sets for one class.
  currentHets <- function(novelty) {
    counts$heterozygosity[which(counts$Sample != "all" & counts$Novelty == novelty)]
  }
  previousHets <- function(novelty) {
    hets[which(status == novelty)]
  }

  nSampleNames <- length(unique(counts$Sample))
  nvalues <- nSampleNames - 1 + length(previousHets("novel"))

  plot(currentHets("novel"), xlim = c(0, nvalues), ylim = c(-0.0005, 0.0005), col = "red",
       main = "Current samples compared to previous samples from 12 sets",
       ylab = "Per sample heterozygosity", xlab = "sample")
  points(currentHets("known"), col = "blue")
  points(currentHets("all"), col = "black")

  # Previous sets are drawn to the right of the current samples.
  previousX <- c(nSampleNames:nvalues)
  for (k in 1:3) {
    points(previousX, previousHets(classes[k]), pch = 16, col = shades[k])
  }

  legend("bottomleft", col = c("red", "blue", "black"), pch = c(1, 1, 1, 16, 16, 16),
         legend = c("novel variants:current set", "known variants:current set", "all varaints:current set", "novel variants:previous sets", "known variants:previous sets", "all variants: previous sets"))

  # Rows of the current table whose sample has a Ti/Tv ratio below 2.0.
  lowTiTvSamples <- current$TiTvVariantEvaluator$Sample[which(current$TiTvVariantEvaluator$tiTvRatio < 2.0)]
  weirdos <- which(counts$Sample %in% lowTiTvSamples)
  flagged <- length(weirdos) > 0

  if (flagged) {
    text(weirdos[c(1:(length(weirdos) / 3))], counts$heterozygosity[weirdos],
         labels = counts$Sample[weirdos], pos = 4, cex = .7, col = "hot pink")
  }

  boxplot(currentHets("novel"), previousHets("novel"),
          currentHets("known"), previousHets("known"),
          currentHets("all"), previousHets("all"),
          col = rep(c("red", "blue", "black"), each = 2),
          main = "Current v. Previous per sample #Heterozygousity",
          xlab = "Sample Sets", ylab = "Heterozygousity per sample", xaxt = "n")
  axis(side = 1, at = c(1:6) - .2, labels = rep(c("current", "previous"), 3), cex.axis = .7)

  if (flagged) {
    text(rep(c(5, 3, 1), each = (length(weirdos) / 3)), counts$heterozygosity[weirdos],
         labels = counts$Sample[weirdos], pos = 4, cex = .7, col = "hot pink")
  }

  legend("topleft", legend = c("novel", "known", "all"), fill = c("red", "blue", "black"))

  # Restore the graphics parameters saved by the caller in `def.par`.
  par(def.par)
}
|
||||
|
||||
# Plots the number of novel variants against allele count (log-log) for the
# current set as a thick dark-red line, then overlays the twelve stored sets
# (<path>/exome.1..12, each providing `data`) as thin red lines.
novelAC <- function(current) {
  # Allele counts in ascending order plus the variant tallies in that order.
  novelCurve <- function(metrics) {
    rows <- which(metrics$Novelty == "novel")
    list(ac = sort(metrics$AC[rows]),
         n = metrics$n[rows][order(metrics$AC[rows])])
  }

  now <- novelCurve(current$SimpleMetricsByAC.metrics)
  plot(now$ac, now$n, type = "l", log = "xy", lwd = 4, col = "dark red",
       main = "Novel AC", ylab = "# variants (log scale)", xlab = "AC (log scale)")

  for (i in c(1:12)) {
    load(sprintf("%s/exome.%i", path, i))
    past <- novelCurve(data$SimpleMetricsByAC.metrics)
    lines(past$ac, past$n, col = "red")
  }

  legend("topright", legend = c("current", "previous"), lwd = c(4, 1), col = c("dark red", "red"))
}
|
||||
|
||||
# Plots the number of known variants against allele count (log-log) for the
# current set as a thick dark-blue line, then overlays the twelve stored sets
# (<path>/exome.1..12, each providing `data`) as thin light-blue lines.
knownAC <- function(current) {
  # Allele counts in ascending order plus the variant tallies in that order.
  knownCurve <- function(metrics) {
    rows <- which(metrics$Novelty == "known")
    list(ac = sort(metrics$AC[rows]),
         n = metrics$n[rows][order(metrics$AC[rows])])
  }

  now <- knownCurve(current$SimpleMetricsByAC.metrics)
  plot(now$ac, now$n, type = "l", log = "xy", lwd = 4, col = "dark blue",
       main = "Known AC", ylab = "# variants (log scale)", xlab = "AC (log scale)")

  for (i in c(1:12)) {
    load(sprintf("%s/exome.%i", path, i))
    past <- knownCurve(data$SimpleMetricsByAC.metrics)
    lines(past$ac, past$n, col = "light blue")
  }

  legend("topright", legend = c("current", "previous"), lwd = c(4, 1), col = c("dark blue", "light blue"))
}
|
||||
|
||||
# Plots the number of variants of every novelty class ("all") against allele
# count (log-log) for the current set as a thick black line, then overlays the
# twelve stored sets (<path>/exome.1..12, each providing `data`) in dark grey.
AllAC <- function(current) {
  # Allele counts in ascending order plus the variant tallies in that order.
  allCurve <- function(metrics) {
    rows <- which(metrics$Novelty == "all")
    list(ac = sort(metrics$AC[rows]),
         n = metrics$n[rows][order(metrics$AC[rows])])
  }

  now <- allCurve(current$SimpleMetricsByAC.metrics)
  plot(now$ac, now$n, type = "l", log = "xy", lwd = 4, col = "Black",
       main = "All AC", ylab = "# variants (log scale)", xlab = "AC (log scale)")

  for (i in c(1:12)) {
    load(sprintf("%s/exome.%i", path, i))
    past <- allCurve(data$SimpleMetricsByAC.metrics)
    lines(past$ac, past$n, col = "dark grey")
  }

  legend("topright", legend = c("current", "previous"), lwd = c(4, 1), col = c("black", "dark grey"))
}
|
||||
|
||||
|
|
@ -1,34 +0,0 @@
|
|||
source("/humgen/gsa-pipeline/.repository/R/DataProcessingReport/Tearsheet.R")
|
||||
# Command-line driver: reads the pipeline tsv, collects lane- and sample-level
# squid metrics for the listed samples, reads three VariantEval reports and
# renders the tearsheet into a PDF. The global names assigned here are kept
# because tearsheet() is invoked without arguments.
cmdargs <- gsa.getargs(
    list(
        title    = list(value = NA, doc = "Title for the tearsheet"),
        tsv      = list(value = NA, doc = "pipeline tsv file"),
        evalroot = list(value = NA, doc = "VariantEval file base (everything before the .eval)"),
        tearout  = list(value = NA, doc = "Output path for tearsheet PDF")
    ),
    doc = "Creates a tearsheet"
)

settable <- read.delim(cmdargs$tsv, header = FALSE)

# Column 1 of the tsv holds the squid ids, column 2 the sample ids.
squids <- unique(settable[, 1])

lane <- data.frame()
samp <- data.frame()
for (squid in squids) {
    lanemetrics <- gsa.read.squidmetrics(squid, TRUE)
    addlanes <- lanemetrics[which(lanemetrics$"External ID" %in% settable[, 2]), ]
    samplemetrics <- gsa.read.squidmetrics(squid, FALSE)
    addsamps <- samplemetrics[which(samplemetrics$"Sample" %in% settable[, 2]), ]
    lane <- rbind(lane, addlanes)
    samp <- rbind(samp, addsamps)
}
print("Picard Data Obtained...")

basiceval <- gsa.read.gatkreport(paste(cmdargs$evalroot, ".eval", sep = ""))
FCeval <- gsa.read.gatkreport(paste(cmdargs$evalroot, ".extraFC.eval", sep = ""))
SAeval <- gsa.read.gatkreport(paste(cmdargs$evalroot, ".extraSA.eval", sep = ""))
print("Evals read")

pdf(file = cmdargs$tearout, width = 22, height = 17, pagecentre = TRUE, pointsize = 24)
print("PDF created...")
tearsheet()
dev.off()
|
||||
|
|
@ -1,194 +0,0 @@
|
|||
require("plotrix")
|
||||
# Script arguments: [1] tab-separated run-report table, [2] output PDF,
# [3] optional report name.
args <- commandArgs(TRUE)

onCMDLine <- !is.na(args[1])
name <- if (!is.na(args[3])) args[3] else ""

# Only read into d when a file was given; otherwise d is assumed to be
# loaded already (e.g. in an interactive session).
if (onCMDLine) {
  print(paste("Reading data from", args[1]))
  d <- read.table(args[1], header = T, sep = "\t")
  #d$start.time = as.Date(d$start.time)
  d$end.time <- as.Date(d$end.time)
}

# The unknown records are from the Broad
d$domain.name[d$domain.name == "unknown"] <- "broadinstitute.org"
|
||||
|
||||
# Placeholder page for an empty report section: logs the missing section to
# the console and emits a blank plot page titled accordingly.
noRecords <- function(name) {
  consoleNote <- paste("No records", name)
  print(consoleNote)

  frame()
  pageTitle <- paste("No records of", name)
  title(pageTitle, cex = 2)
}
|
||||
|
||||
# Horizontal bar chart (log-scaled counts) of how often each distinct value
# occurs, sorted by frequency. Falls back to noRecords() for empty input.
#   values     - vector to tabulate
#   name       - plot title
#   moreMargin - extra lines added to the left margin for long labels
#   ...        - forwarded to barplot()
reportCountingPlot <- function(values, name, moreMargin = 0, ...) {
  if (length(values) == 0) {
    return(noRecords(name))
  }

  savedMargins <- par("mar")
  par(las = 2)                               # labels perpendicular to the axis
  par(mar = c(5, 8 + moreMargin, 4, 2))      # widen the y-axis margin

  counts <- table(factor(values))
  barplot(sort(counts), horiz = TRUE, cex.names = 0.5, main = name, xlab = "Counts", log = "x", ...)

  par(mar = savedMargins)
  par(las = 1)
}
|
||||
|
||||
# Stacked horizontal bar chart of `values` counted within each level of
# `conditions`; bars (one per condition) are ordered by their total count.
# Falls back to noRecords() when there is nothing to tabulate or when the
# reordered table collapses to a plain vector.
#   values     - vector whose levels become the stacked segments / legend
#   conditions - vector whose levels become the bars
#   name       - plot title
#   moreMargin - extra lines added to the left margin for long labels
#   ...        - forwarded to barplot()
reportConditionalCountingPlot <- function(values, conditions, name, moreMargin = 0, ...) {
  if (length(values) == 0) {
    return(noRecords(name))
  }

  counts <- table(values, conditions)
  counts <- counts[, order(colSums(counts))]
  if (is.null(dim(counts))) {
    # Subsetting dropped the table to a vector (single row or single column).
    return(noRecords(name))
  }

  par(las = 2)                               # labels perpendicular to the axis
  savedMargins <- par("mar")
  par(mar = c(5, 8 + moreMargin, 4, 2))      # widen the y-axis margin

  # barplot() colours the stacked segments, i.e. the ROWS of the matrix, and
  # the legend lists the row names, so the palette must have one colour per
  # row. Sizing it by the number of columns recycled or truncated colours.
  segmentColors <- rainbow(nrow(counts))
  barplot(counts, legend.text = T, horiz = TRUE, cex.names = 0.5, main = name,
          xlab = "Counts", col = segmentColors, cex = 0.5, ...)

  par(mar = savedMargins)
  par(las = 1)
}
|
||||
|
||||
|
||||
# Histogram with 20 requested breaks; skipped silently when every value is NA.
#   values - numeric vector
#   name   - plot title
#   ...    - forwarded to hist()
reportHist <- function(values, name, ...) {
  nothingToPlot <- all(is.na(values))
  if (nothingToPlot) {
    return(invisible(NULL))
  }
  hist(values, 20, main = name, xlab = "", col = "cornflowerblue", ...)
}
|
||||
|
||||
# Cross-tabulates x by y and converts each column to proportions.
# With reqRowNonZero, rows are kept only when their entry in the last column
# of the proportion table is positive.
# NOTE(review): the filter indexes column `ncols` of addmargins(), which is the
# last data column, not the appended Sum column - confirm that is intended.
myTable <- function(x, y, reqRowNonZero = F) {
  proportions <- prop.table(table(x, y), 2)

  if (!reqRowNonZero) {
    return(proportions)
  }

  lastColumn <- dim(proportions)[2]
  dataRows <- 1:dim(proportions)[1]
  keep <- addmargins(proportions)[dataRows, lastColumn] > 0
  return(proportions[keep, ])
}
|
||||
|
||||
# todo -- must be robust to smaller sizes
|
||||
|
||||
# Draws one line per table row across the table's columns (labelled "Date" on
# the x axis) with a legend of row names. Does nothing when the input has no
# dimensions (e.g. a table that collapsed to a vector).
#   table - two-dimensional table / matrix
#   name  - plot title
#   ...   - forwarded to plot()
plotTable <- function(table, name, ...) {
  nSeries <- dim(table)[1]
  nPoints <- dim(table)[2]
  if (is.null(nSeries)) {
    return(invisible(NULL))
  }

  palette <- rainbow(nSeries)
  yRange <- c(min(apply(table, 2, min)), max(apply(table, 2, max)))

  # Empty canvas first; the series are added one by one below.
  plot(as.numeric(apply(table, 2, sum)), ylim = yRange, type = "n", main = name,
       ylab = "Frequency", xlab = "Date", xaxt = "n", ...)
  axis(1, 1:nPoints, labels = colnames(table))

  for (row in 1:nSeries) {
    points(table[row, ], type = "b", col = palette[row])
  }
  legend("topright", row.names(table), fill = palette, cex = 0.5)
  #return(table)
}
|
||||
|
||||
# 5 minutes, expressed in seconds => bad failure
RUNNING_GATK_RUNTIME <- 5 * 60

# When run from the command line, the second argument is the output PDF.
if (onCMDLine) {
  pdf(args[2])
}
|
||||
|
||||
# Classifies every run record as successful or as one of three failure kinds,
# based on the exception.msg and is.user.exception columns of d.
# Returns a character vector with one label per record.
successfulRuns <- function(d) {
  failed <- d$exception.msg != "NA"
  userFlag <- d$is.user.exception

  outcome <- rep("Successful", length(d$exception.msg))
  outcome[failed & userFlag == "true"] <- "Failed with UserException"
  outcome[failed & userFlag == "false"] <- "Failed with StingException"
  # Records without a usable user-exception flag.
  outcome[failed & (userFlag == "NA" | is.na(userFlag))] <- "Failed with StingException before UserException code"
  outcome
}
|
||||
|
||||
# Starts a new report section: resets the plot margins to c(5, 4, 4, 2) and
# emits a blank page carrying the section title.
addSection <- function(name) {
  # The margin must be passed as a named argument; par("mar", c(...)) only
  # queries "mar" and warns about the unnamed numeric vector without setting it.
  par(mar = c(5, 4, 4, 2))
  frame()
  title(name, cex = 2)
}
|
||||
|
||||
# Re-subsets the chosen columns of d with drop=TRUE (which discards unused
# factor levels) and returns the updated data frame.
#   d       - data frame
#   columns - columns to process (defaults to all)
#   ...     - forwarded to "[" for every column
dropit <- function (d, columns = names(d), ...)
{
  chosen <- d[columns]
  d[columns] <- lapply(chosen, "[", drop = TRUE, ...)
  return(d)
}
|
||||
|
||||
# Emits the full sequence of report pages for the run records in d.
#   d             - run-record data frame (walker.name, user.name, svn.version,
#                   end.time, exception.*, memory and run-time columns)
#   header        - label prefixed to every plot title
#   includeByWeek - also draw the per-week breakdowns
# Uses the global `name` for the title page. Pages are emitted in the order of
# the statements below.
generateOneReport <- function(d, header, includeByWeek = T) {
  # Prefixes a plot title with the section header.
  sectionTitle <- function(s) {
    return(paste("Section:", header, "\n", s))
  }

  # Records carrying an exception, split by the user-exception flag.
  excepted <- dropit(subset(d, exception.msg != "NA"))
  UserExceptions <- dropit(subset(excepted, is.user.exception == "true"))
  StingExceptions <- dropit(subset(excepted, is.user.exception == "false" | is.user.exception == "NA" | is.na(is.user.exception)))

  addSection(paste("GATK run report", name, "for", Sys.Date(), "\nwith", dim(d)[1], "run repository records"))

  reportCountingPlot(d$walker.name, sectionTitle("Walker invocations"))
  reportConditionalCountingPlot(d$user.name, d$walker.name, sectionTitle("Walker invocations by user"))
  reportCountingPlot(d$svn.version, sectionTitle("SVN version"))
  reportConditionalCountingPlot(d$svn.version, d$user.name, sectionTitle("SVN by user"))

  # cuts by time
  invocationLabel <- rep("GATK Invocations", length(d$end.time))
  if (includeByWeek) {
    byWeek <- cut(d$end.time, "weeks")
    plotTable(table(invocationLabel, byWeek), sectionTitle("GATK Invocations by week"))
    plotTable(myTable(successfulRuns(d), byWeek), sectionTitle("Successful and failing GATK invocations per week"))
    plotTable(myTable(d$svn.version, byWeek), sectionTitle("SVN version by week"))
  }
  plotTable(table(invocationLabel, d$end.time), sectionTitle("GATK Invocations by day"))
  plotTable(myTable(d$svn.version, d$end.time), sectionTitle("SVN version by day"))

  #
  # Exception handling
  #
  # One sub-report per exception class, drawn in exceptionColor.
  addExceptionSection <- function(subd, subname, exceptionColor) {
    addSection(paste(subname))
    reportCountingPlot(subd$walker.name, sectionTitle(paste("Walkers with", subname)), col = exceptionColor)
    reportCountingPlot(subd$exception.at, sectionTitle(paste(subname, "locations")), 12, col = exceptionColor)
    #reportCountingPlot(subd$exception.msg, head(paste(subname, "messages")), 12, col=exceptionColor)
    reportConditionalCountingPlot(subd$user.name, subd$exception.at, sectionTitle(paste("Walker invocations by user for", subname)), 12)

    if (includeByWeek && length(subd$end.time) > 0) {
      plotTable(myTable(subd$walker.name, cut(subd$end.time, "weeks"), reqRowNonZero = T), sectionTitle(paste("Walkers with", subname, "by week")), col = exceptionColor)
    }
  }

  addExceptionSection(excepted, "Exceptions", "grey")
  reportCountingPlot(excepted$user.name, sectionTitle("Usernames generating exceptions"), col = "grey")

  addExceptionSection(StingExceptions, "StingExceptions", "red")
  addExceptionSection(UserExceptions, "UserExceptions", "blue")

  bytesPerGb <- 1024^3
  reportHist(d$total.memory / bytesPerGb, sectionTitle("Used memory"))
  reportHist(d$max.memory / bytesPerGb, sectionTitle("Max memory"))

  secondsPerMinute <- 60
  reportHist(log10(d$run.time / secondsPerMinute), sectionTitle("Run time (log10[min])"))

  reportCountingPlot(d$user.name, sectionTitle("user"))
  reportCountingPlot(d$domain.name, sectionTitle("Domain name"))
  #reportCountingPlot(d$host.name, head("host"))

  reportCountingPlot(d$java, sectionTitle("Java version"))
  #reportCountingPlot(d$machine, head("Machine"))
  #reportCountingPlot(d$working.directory, head("Working directory"))
}
|
||||
|
||||
# Main entry: build the overall report and close the PDF when one was opened.
RUNME = T
if ( RUNME ) {
  # Start date of the most recent week present in the data. Negative indexing
  # with [-1] removes the FIRST level rather than selecting the last one, so
  # the last level is picked explicitly here.
  weekLevels = levels(cut(d$end.time, "weeks"))
  lastWeek = weekLevels[length(weekLevels)]
  generateOneReport(d, "Overall")
  #generateOneReport(subset(d, end.time >= lastWeek), "Just last week to date", includeByWeek = F)
}

if ( onCMDLine ) dev.off()
|
||||
|
||||
|
||||
|
||||
|
|
@ -1,144 +0,0 @@
|
|||
# The single script argument is the common prefix of the coverage output files.
args <- commandArgs(TRUE)
docBase <- args[1]

## APPEND THE SUFFIXES ##

locusStats  <- paste0(docBase, ".sample_locus_statistics")
targetStats <- paste0(docBase, ".sample_interval_statistics")
sampleSum   <- paste0(docBase, ".sample_summary_statistics")
sampleStats <- paste0(docBase, ".sample_statistics")
targetSum   <- paste0(docBase, ".sample_interval_summary")

## DEFINE THE PLOTTING FUNCTIONS ##
|
||||
|
||||
## DEFINE THE PLOTTING FUNCTIONS ##
|
||||
|
||||
# Writes "Depth_Histogram_All_Samples.pdf": one line per row of X plotted
# across its columns, with the column names as slanted x-axis labels and the
# row names in the legend.
#   X - table with one row per sample and one column per depth bin
PlotDepths <- function(X) {
  pdf("Depth_Histogram_All_Samples.pdf")
  Y <- as.matrix(X)
  # rainbow() no longer accepts a `gamma` argument in current R, so it is omitted.
  colors <- rainbow(nrow(Y))
  plot(Y[1,],col=colors[1],type="b",xlab="",xaxt="n",ylab="Number of Loci")
  axis(1,labels=FALSE)
  labels <- colnames(X)
  text(1:ncol(Y),par("usr")[3]-(100/6000)*par("usr")[4],srt=45,adj=1,labels=labels,xpd=TRUE,cex=0.7)
  # Remaining rows; seq_len()[-1] is empty for a single-row table, whereas
  # 2:nrow(Y) would count down (2, 1) and index a row that does not exist.
  for ( jj in seq_len(nrow(Y))[-1] ) {
    points(Y[jj,],col=colors[jj],type="b")
  }
  ymax = par("usr")[4]
  xmax = par("usr")[2]
  legend(y=0.95*ymax,x=0.8*xmax,col=colors,rownames(X),lty=c(1),cex=0.5)
  dev.off()
}
|
||||
|
||||
# Writes "Per_Sample_Coverage_Quantiles.pdf". Each row of X is normalised by
# the total of the first row and turned into a right-to-left cumulative sum
# (mass in the current and all following columns). For every column, the 0.75
# quantile across rows is plotted in blue and the 1.0 quantile (maximum) in red.
#   X - table with one row per sample; column names of the form "<prefix>_<label>"
PlotLocusQuantiles <- function(X) {
  pdf("Per_Sample_Coverage_Quantiles.pdf")
  Y <- as.matrix(X)
  Y <- Y/sum(Y[1,])
  Z <- matrix(nrow=nrow(Y),ncol=ncol(Y))
  for ( ii in 1:nrow(Y) ) {
    for ( jj in 1:ncol(Y) ) {
      # see how much density is in the remaining columns
      Z[ii,jj] = sum(Y[ii,jj:ncol(Y)])
    }
  }

  columns <- seq_len(ncol(Z))
  quan75  <- vapply(columns, function(cc) quantile(Z[,cc], 0.75, names=FALSE), numeric(1))
  quan100 <- vapply(columns, function(cc) quantile(Z[,cc], 1, names=FALSE), numeric(1))

  plot(quan75,xlab="",xaxt="n",ylab="Proportion of loci with >X coverage",type="b",col="blue",yaxp=c(0,1,10))
  axis(1,labels=FALSE)

  # Tick labels: the part of each column name after the first underscore.
  labels <- vapply(strsplit(as.character(colnames(X)), split="_"), function(pieces) pieces[2], character(1))
  text(1:length(labels),par("usr")[3]-0.025,srt=90,adj=1,labels=labels,xpd=TRUE,cex=(0.8/32)*length(labels),lheight=(0.8/32)*length(labels))
  points(quan100,type="b",col="red")
  # Legend colours follow the plotted series: the 0.75-quantile line is blue
  # and the 1.0-quantile line is red (the original legend had them swapped).
  legend(x=floor(0.6*length(labels)),y=1,c("75% of samples","100% of samples"),col=c("blue","red"),lty=c(1,1))
  dev.off()
}
|
||||
|
||||
# Writes "Per_Sample_Median_Histogram.pdf": histogram of column 5 of X over
# all rows except the last one, with floor(nrow(X)/2) requested breaks.
HistogramMedians <- function(X) {
  pdf("Per_Sample_Median_Histogram.pdf")
  sampleRows <- seq_len(nrow(X) - 1)      # every row but the last
  medianValues <- as.numeric(as.matrix(unlist(X[sampleRows, 5])))
  nBreaks <- floor(nrow(X) / 2)
  hist(medianValues, nBreaks, xlab = "Median Coverage", ylab = "Number of Samples",
       main = "Median coverage acrosss samples", col = "grey")
  dev.off()
}
|
||||
|
||||
# Writes "Locus_Coverage_HeatMap.pdf": heat map of X as a matrix, with rows and
# columns kept in their given order (no dendrogram reordering).
HeatmapLocusTable <- function(X) {
  pdf("Locus_Coverage_HeatMap.pdf")
  heatmap(as.matrix(X), Rowv = NA, Colv = NA)
  dev.off()
}
|
||||
|
||||
# Writes "Per_Sample_Mean_Quantile_Coverage.pdf": per-sample mean (column 3),
# median (column 5) and the two quartile columns (4 and 6) of X, for every
# row except the last one. Column 1 supplies the x-axis labels.
PlotMeanMedianQuartiles <- function(X) {
  pdf("Per_Sample_Mean_Quantile_Coverage.pdf")
  # Rows to plot. `1:nrow(X)-1` parses as (1:nrow(X))-1, i.e. 0..n-1; as a row
  # index the 0 is dropped, but as x positions it shifted every label one slot
  # to the left, so an explicit 1..n-1 sequence is used throughout.
  sampleRows <- seq_len(nrow(X) - 1)
  column <- function(j) as.numeric(as.matrix(unlist(X[sampleRows, j])))

  # rainbow() no longer accepts `gamma` in current R (the old value 1 was the default).
  colors <- rainbow(4,start=0.6,end=0.9)
  means = column(3)
  medians = column(5)
  thirdQ = column(4)
  firstQ = column(6)
  plot(means,xlab="",ylab="Depth of Coverage",xaxt="n",col=colors[1],pch=3,type="b",ylim=c(0,max(thirdQ)))
  points(firstQ,col=colors[2],pch=2,type="b")
  points(medians,col=colors[3],pch=1,type="b")
  points(thirdQ,col=colors[4],pch=2,type="b")
  axis(1,labels=FALSE)
  labels <- X[sampleRows,1]
  text(sampleRows,par("usr")[3]-(50/2500)*par("usr")[4],srt=90,adj=1,labels=labels,xpd=TRUE,cex=0.5)
  text(5*nrow(X)/8,par("usr")[3]-(350/2500)*par("usr")[4],adj=1,labels="SAMPLE_ID",xpd=TRUE)
  legend(x=nrow(X)/10,y=par("usr")[4]-(200/2500)*par("usr")[4],c("Mean","25% Quantile","Median","75% Quantile"),col=colors,lty=c(1),cex=0.8,pch=c(3,2,1,2))
  dev.off()
}
|
||||
|
||||
# Writes "Per_Sample_Mean_Median_Only.pdf": per-sample mean (column 3) and
# median (column 5) of X for every row except the last one. Column 1 supplies
# the x-axis labels.
PlotOnlyMeanMedian <- function(X) {
  pdf("Per_Sample_Mean_Median_Only.pdf")
  # Explicit 1..n-1 sequence: `1:nrow(X)-1` is 0..n-1, which as x positions
  # placed every label one slot to the left of its point.
  sampleRows <- seq_len(nrow(X) - 1)
  column <- function(j) as.numeric(as.matrix(unlist(X[sampleRows, j])))

  # rainbow() no longer accepts `gamma` in current R (the old value 1 was the default).
  colors <- rainbow(2,start=0.6,end=0.9)
  means = column(3)
  medians = column(5)
  plot(means,xlab="",ylab="Depth of Coverage",xaxt="n",col=colors[1],pch=3,type="b",ylim=c(0,max(c(max(means),max(medians)))))
  points(medians,col=colors[2],pch=1,type="b")
  axis(1,labels=FALSE)
  labels <- X[sampleRows,1]
  text(sampleRows,par("usr")[3]-(50/2500)*par("usr")[4],srt=90,adj=1,labels=labels,xpd=TRUE,cex=0.5)
  text(5*nrow(X)/8,par("usr")[3]-(350/2500)*par("usr")[4],adj=1,labels="SAMPLE_ID",xpd=TRUE)
  # Legend symbols match the plotted ones: pch 3 for the mean, pch 1 for the median.
  legend(x=nrow(X)/10,y=par("usr")[4]-(200/2500)*par("usr")[4],c("Mean","Median"),col=colors,lty=c(1),cex=0.8,pch=c(3,1))
  dev.off()
}
|
||||
|
||||
# Writes "Per_Sample_Quartiles_Only.pdf": the two per-sample quartile columns
# of X (4 and 6) for every row except the last one. Column 1 supplies the
# x-axis labels.
PlotOnlyQuartiles <- function(X) {
  pdf("Per_Sample_Quartiles_Only.pdf")
  # Explicit 1..n-1 sequence: `1:nrow(X)-1` is 0..n-1, which as x positions
  # placed every label one slot to the left of its point.
  sampleRows <- seq_len(nrow(X) - 1)
  column <- function(j) as.numeric(as.matrix(unlist(X[sampleRows, j])))

  # rainbow() no longer accepts `gamma` in current R (the old value 1 was the default).
  colors <- rainbow(2,start=0.6,end=0.9)
  thirdQ = column(4)
  firstQ = column(6)
  plot(thirdQ,xlab="",ylab="Depth of Coverage",xaxt="n",col=colors[1],pch=3,type="b",ylim=c(0,max(thirdQ)))
  points(firstQ,col=colors[2],pch=2,type="b")
  axis(1,labels=FALSE)
  labels <- X[sampleRows,1]
  text(sampleRows,par("usr")[3]-(50/2500)*par("usr")[4],srt=90,adj=1,labels=labels,xpd=TRUE,cex=0.5)
  text(5*nrow(X)/8,par("usr")[3]-(350/2500)*par("usr")[4],adj=1,labels="SAMPLE_ID",xpd=TRUE)
  legend(x=nrow(X)/10,y=par("usr")[4]-(200/2500)*par("usr")[4],c("75% Quantile","25% Quantile"),col=colors,lty=c(1),cex=0.8,pch=c(3,2))
  dev.off()
}
|
||||
|
||||
## PLOT SAMPLE STATISTICS
# Per-sample depth table (read without a header line).
TO_PLOT <- read.table(file = sampleStats)
PlotDepths(TO_PLOT)
PlotLocusQuantiles(TO_PLOT)

## PLOT SAMPLE SUMMARY
# Per-sample summary table (first line is the header).
TO_PLOT <- read.table(file = sampleSum, header = TRUE)
PlotMeanMedianQuartiles(TO_PLOT)
PlotOnlyMeanMedian(TO_PLOT)
PlotOnlyQuartiles(TO_PLOT)
HistogramMedians(TO_PLOT)

## PLOT LOCUS STATISTICS
# Locus table (read without a header line).
TO_PLOT <- read.table(file = locusStats)
HeatmapLocusTable(TO_PLOT)
|
||||
|
|
@ -1,254 +0,0 @@
|
|||
library(ellipse);
|
||||
library(hexbin);
|
||||
|
||||
# Returns the position of the column of d named `ann` (the last one if the
# name occurs more than once), or -1 when there is no such column. Also
# returns -1 for a table without any names, where the index loop over
# 1:length(names(d)) used to fail.
getAnnIndex <- function(d, ann) {
    hits = which(names(d) == ann);
    if (length(hits) == 0) {
        return(-1);
    }

    hits[length(hits)];
}
|
||||
|
||||
# Returns the position of `ann` within the annotation names stored on the
# first cluster (c[[1]]$anns; the last position if it occurs more than once),
# or -1 when it is absent. Also returns -1 when the annotation list is empty,
# where the index loop over 1:length(...) used to fail.
getClusterAnnIndex <- function(c, ann) {
    hits = which(c[[1]]$anns == ann);
    if (length(hits) == 0) {
        return(-1);
    }

    hits[length(hits)];
}
|
||||
|
||||
# Plots the density of annotation `ann` for known (blue) and novel (red)
# variants, using 100 requested histogram breaks each, and marks the values of
# the optional suspicious loci along the x axis.
#   d.known, d.novel - variant tables containing a column named `ann`
#   d.loci           - table of suspicious loci, or NA when there are none
#   ann              - annotation (column) name
plotAnn <- function(d.known, d.novel, d.loci, ann) {
    index = getAnnIndex(d.known, ann);

    # `!is.na(d.loci)` on a data frame yields a whole logical matrix, which is
    # not a valid `if` condition; test for a non-empty table instead.
    hasLoci = is.data.frame(d.loci) && nrow(d.loci) > 0;

    k = hist(d.known[,index], breaks=100, plot=FALSE);
    n = hist(d.novel[,index], breaks=100, plot=FALSE);

    # The y range covers both curves so the novel density is not clipped.
    plot(k$mids, k$density, type="b", col="blue", ylim=c(0, max(k$density, n$density)), lwd=2, xlab=ann, ylab="Density", bty="n");
    points(n$mids, n$density, type="b", col="red", lwd=2);

    if (hasLoci) {
        legend("topright", c("Known", "Novel", "Suspicious loci"), col=c("blue", "red", "yellow3"), pch=c(21, 21, 18));
        # One marker per suspicious locus at density 0.
        points(d.loci[, index], rep(0, nrow(d.loci)), col="yellow3", pch=18, cex=2.0);
    } else {
        legend("topright", c("Known", "Novel"), col=c("blue", "red"), pch=21);
    }
}
|
||||
|
||||
# Parses a cluster file made of comma-separated records:
#   lines containing "ANNOTATION": field 2 = name, 3 = offset, 4 = multiplier
#   lines containing "CLUSTER":    field 2 = mixture weight, then one mean per
#                                  annotation seen so far, then the covariance
#                                  entries
# Returns one list per CLUSTER line with the elements anns, conversions,
# mixtureWeight, means and cov (cov is kept as a character matrix).
read.clusters <- function(filename) {
    con = file(filename, "r", blocking = FALSE)
    lines = readLines(con)
    close(con);

    anns = c();
    conversions = c();
    clusters = c();

    for (line in lines) {
        fields = unlist(strsplit(line, ","));

        if (length(grep("ANNOTATION", line)) > 0) {
            anns = c(anns, fields[2]);
            conversions[[length(anns)]] = list(ann = fields[2], offset = as.numeric(fields[3]), multiplier = as.numeric(fields[4]));
        } else if (length(grep("CLUSTER", line)) > 0) {
            nAnns = length(anns);
            firstCov = 3 + nAnns;                 # field where the covariance entries start

            clusters[[length(clusters) + 1]] = list(
                anns = anns,
                conversions = conversions,
                mixtureWeight = as.numeric(fields[2]),
                means = as.numeric(fields[3:(firstCov - 1)]),
                cov = matrix(fields[firstCov:length(fields)], nrow=nAnns, ncol=nAnns)
            );
        }
    }

    clusters;
}
|
||||
|
||||
# Computes a plotting range for vals: the lower bound is min(vals, -2) but not
# below defaultMin, the upper bound is max(vals, 2) but not above defaultMax.
# The range is printed and returned as c(lower, upper).
clusterLimits <- function( vals, defaultMin, defaultMax ) {
  lower = max(defaultMin, min(vals, -2))
  upper = min(defaultMax, max(vals, 2))
  limits = c(lower, upper)
  print(limits)
  limits
}
|
||||
|
||||
# Colour of cluster number clusterIndex within the palette that
# clusterColors() produces for nClusters clusters.
getClusterColor <- function(clusterIndex, nClusters) {
  palette = clusterColors(nClusters)
  palette[clusterIndex]
}
|
||||
|
||||
# Palette with one rainbow colour per cluster.
clusterColors <- function(nClusters) {
  palette = rainbow(n = nClusters)
  palette
}
|
||||
|
||||
|
||||
# Draws axis `num` with tick positions stepping from xmin to xmax in steps of
# (|xmin| + |xmax|) / 5, truncated to integers, and labels obtained by
# converting each position with `position * mult1 + off1` (also truncated).
# `vals` is not used.
makeAxis <- function( num, vals, off1, mult1, xmin, xmax ) {
  step = (abs(xmin) + abs(xmax)) / 5
  tickPositions = as.integer(seq(from=xmin, to=xmax, by=step))
  tickLabels = as.integer(tickPositions * mult1 + off1)

  axis(num, labels=tickLabels, at=tickPositions);
}
|
||||
|
||||
# Scatter plot of two annotations for known (blue) and novel (red) variants in
# the clusters' coordinate system ((value - offset) / multiplier), with one
# ellipse per cluster and the optional suspicious loci on top.
#   d.known, d.novel - variant tables containing columns ann1 and ann2
#   d.loci           - table of suspicious loci, or NA when there are none
#   c                - cluster list as returned by read.clusters()
#   ann1, ann2       - annotation names for the x and y axis
#   filename         - not used
#   maxVariants      - number of rows drawn at random (with replacement) from
#                      each table; -1 plots every row
plotClusters <- function(d.known, d.novel, d.loci, c, ann1, ann2, filename, maxVariants = -1) {
    # `!is.na(d.loci)` on a data frame yields a whole logical matrix, which is
    # not a valid `if` condition; test for a non-empty table instead.
    hasLoci = is.data.frame(d.loci) && nrow(d.loci) > 0;

    index1 = getAnnIndex(d.known, ann1);
    index2 = getAnnIndex(d.known, ann2);

    cindex1 = getClusterAnnIndex(c, ann1);
    cindex2 = getClusterAnnIndex(c, ann2);

    mult1 = c[[1]]$conversions[[cindex1]]$multiplier;
    off1 = c[[1]]$conversions[[cindex1]]$offset;

    mult2 = c[[1]]$conversions[[cindex2]]$multiplier;
    off2 = c[[1]]$conversions[[cindex2]]$offset;

    xvalsForLims = clusterLimits(d.known[,index1], -4, 4)
    yvalsForLims = clusterLimits(d.known[,index2], -4, 4)
    # Extra room on the right of the x range.
    xlims = c(min(xvalsForLims), 1.2*max(xvalsForLims));
    ylims = c(min(yvalsForLims), max(yvalsForLims));

    # par(mar=c(5, 6, 2, 5));
    plot(0, 0, type="n", xaxt="n", yaxt="n", xlim=xlims, ylim=ylims, xlab=ann1, ylab=ann2, bty="n");

    # Row subsets to draw: everything, or maxVariants random rows.
    mv.known = if (maxVariants == -1 | maxVariants >= nrow(d.known)) { seq(1, nrow(d.known)) } else { as.integer(runif(maxVariants, 1, nrow(d.known)+1))}
    mv.novel = if (maxVariants == -1 | maxVariants >= nrow(d.novel)) { 1:nrow(d.novel) } else { as.integer(runif(maxVariants, 1, nrow(d.novel)+1)) }

    print(dim(mv.known))
    print(maxVariants)

    points(((d.known[,index1] - off1)/mult1)[mv.known], ((d.known[,index2] - off2)/mult2)[mv.known], pch=19, cex=0.3, col="#0000FF33");
    points(((d.novel[,index1] - off1)/mult1)[mv.novel], ((d.novel[,index2] - off2)/mult2)[mv.novel], pch=19, cex=0.3, col="#FF000033");

    nClusters = length(c)
    for (clusterIndex in c(1:nClusters)) {
        mu = c(c[[clusterIndex]]$means[cindex1], c[[clusterIndex]]$means[cindex2]);
        # 2x2 covariance of the two plotted annotations; entries may be stored
        # as text, hence as.numeric().
        cov = matrix(as.numeric(
                c(
                    c[[clusterIndex]]$cov[cindex1,cindex1],
                    c[[clusterIndex]]$cov[cindex2,cindex1],
                    c[[clusterIndex]]$cov[cindex1,cindex2],
                    c[[clusterIndex]]$cov[cindex2,cindex2]
                )
            ), nrow=2, ncol=2
        );

        weight = c[[clusterIndex]]$mixtureWeight;
        color = getClusterColor(clusterIndex, nClusters);
        lineweight = ifelse(weight > 0.50, 4, 3);

        points(mu[1], mu[2], pch=21, col=color, cex=0.5);
        points(ellipse(t(cov), centre=mu), type="l", lwd=lineweight, col=color);
    }

    makeAxis(1, d.novel[,index1], off1, mult1, xvalsForLims[1], xvalsForLims[2])
    makeAxis(2, d.novel[,index2], off2, mult2, yvalsForLims[1], yvalsForLims[2])

    # add points legend on the lower left
    if (hasLoci) {
        legend("bottomleft", c("Known", "Novel", "Suspicious loci"), col=c("blue", "red", "yellow3"), pch=19);
    } else {
        legend("bottomleft", c("Known", "Novel"), col=c("blue", "red"), pch=19);
    }

    # add upper right legend with cluster id and weights
    weights = round(sapply(c, function(x) x$mixtureWeight),2)
    clusterNames = paste("C", paste(1:nClusters), sep="")
    clusterLegendNames = paste(clusterNames, weights, sep="-W=")
    legend("topright", clusterLegendNames, fill=clusterColors(nClusters))

    if (hasLoci) {
        points((d.loci[,index1] - off1)/mult1, (d.loci[,index2] - off2)/mult2, pch=19, cex=0.8, col="yellow3");
    }
}
|
||||
|
||||
# Script arguments: [1] output root, [2] cluster file, [3] variant table,
# [4] optional suspicious-loci file ("NA" for none), [5] max variants per plot,
# [6] number of table rows to read (-1 for all).
args = commandArgs(TRUE);

plotRoot = if (is.na(args[1])) "test" else args[1];

clusterFile = if (is.na(args[2])) "/Volumes/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesProcessingPaper/wgs.v8/GA2.WEx.cleaned.ug.snpfiltered.indelfiltered.optimized" else args[2];

vcfTable = if (is.na(args[3])) "/Volumes/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesProcessingPaper/wgs.v8/GA2.WEx.cleaned.ug.snpfiltered.indelfiltered.optimized.table" else args[3];

lociFile = args[4];
if (is.na(lociFile) | lociFile == "NA" ) { lociFile = NA; }

maxVariants = args[5];
if (is.na(maxVariants)) { maxVariants = 5000; }
maxVariants = as.integer(maxVariants)

greedy = args[6]
if (is.na(greedy)) { greedy = -1; }
greedy = as.integer(greedy)

# Positions of the suspicious loci, when a loci file was supplied.
l = c();
if (!is.na(lociFile)) {
    t = read.table(lociFile, header=TRUE);
    l = t$POS;
}

print("Greedy reading")
d = read.table(vcfTable, header=TRUE, nrows = greedy);
c = read.clusters(clusterFile);

# Split the variants by the DB flag / ID column.
d.known = d[which(d$DB == 1 | d$ID != "."),];
d.novel = d[which(d$DB == 0 | d$ID == "."),];
d.loci = NA;
if (length(l) > 0) {
    d.loci = d[which(d$POS %in% l),];
}

pdf(paste(plotRoot, ".clusterReport.pdf", sep=""));

# One density page per annotation, followed by one cluster page for every
# other annotation paired with it.
annotations = c[[1]]$anns
for (ann1 in annotations) {
    print(ann1)
    plotAnn(d.known, d.novel, d.loci, ann1);

    for (ann2 in annotations) {
        if (ann1 == ann2) {
            next
        }
        print(paste("-- v ", ann2))
        plotClusters(d.known, d.novel, d.loci, c, ann1, ann2, maxVariants=maxVariants);
    }
}

dev.off();
|
||||
|
|
@ -1,442 +0,0 @@
|
|||
suppressPackageStartupMessages(library(gsalib));
|
||||
suppressPackageStartupMessages(library(gplots));
|
||||
|
||||
# Collect variant counts and Ti/Tv ratios for one jexl expression.
#
# eval:            list holding the CountVariants and TiTv data frames.
# jexl_expression: value matched against each table's jexl_expression column.
#
# Only rows with evaluation_name "eval" and comparison_name "dbsnp" are considered.
# Returns (invisibly) a list with nVariantLoci and ti.tv_ratio for the called and
# filtered rows, each split into the all / known / novel novelty classes.
eval.getMetrics <- function(eval, jexl_expression) {
    # Rows of a table belonging to the eval-vs-dbsnp comparison for this expression.
    forExpression <- function(tbl) {
        tbl[which(tbl$evaluation_name == "eval" & tbl$comparison_name == "dbsnp" & tbl$jexl_expression == jexl_expression),];
    }

    # One column's value(s) for a given filter status and novelty class.
    pick <- function(tbl, filter, novelty, column) {
        tbl[which(tbl$filter_name == filter & tbl$novelty_name == novelty),][[column]];
    }

    counts = forExpression(eval$CountVariants);
    titv = forExpression(eval$TiTv);

    invisible(list(
        all = pick(counts, "called", "all", "nVariantLoci"),
        all.titv = pick(titv, "called", "all", "ti.tv_ratio"),

        known = pick(counts, "called", "known", "nVariantLoci"),
        known.titv = pick(titv, "called", "known", "ti.tv_ratio"),

        novel = pick(counts, "called", "novel", "nVariantLoci"),
        novel.titv = pick(titv, "called", "novel", "ti.tv_ratio"),

        filtered.all = pick(counts, "filtered", "all", "nVariantLoci"),
        filtered.all.titv = pick(titv, "filtered", "all", "ti.tv_ratio"),

        filtered.known = pick(counts, "filtered", "known", "nVariantLoci"),
        filtered.known.titv = pick(titv, "filtered", "known", "ti.tv_ratio"),

        filtered.novel = pick(counts, "filtered", "novel", "nVariantLoci"),
        filtered.novel.titv = pick(titv, "filtered", "novel", "ti.tv_ratio")
    ));
}
|
||||
|
||||
# Build the multi-line label drawn next to one region of the concordance Venn diagram.
#
# name:             callset (or region) name shown in the label.
# othername:        name of the other callset; only used when filtered.metrics is given.
# metrics:          list with all, known, novel counts and the matching *.titv ratios.
# filtered.metrics: same shape as metrics for the "called here, filtered there" class,
#                   or NA when the region has no such class.
# union:            list whose all.withfiltered entry is the denominator of the percentage.
#
# Returns a single character string.
.plot.callsetConcordance.getLabelText <- function(name, othername, metrics, filtered.metrics=NA, union) {
    # filtered.metrics is either the scalar NA sentinel or a metrics list. Calling is.na()
    # on a list yields one value per element, which is not a valid `if` condition
    # (an error since R 4.2), so test for the list instead.
    if (!is.list(filtered.metrics)) {
        sprintf("%s (%0.01f%% of union)\nCalled:\nAll: %d, Ti/Tv: %0.2f\nKnown: %d, Ti/Tv: %0.2f\nNovel: %d, Ti/Tv: %0.2f",
            name, 100*metrics$all/union$all.withfiltered,
            metrics$all, metrics$all.titv,
            metrics$known, metrics$known.titv,
            metrics$novel, metrics$novel.titv
        );
    } else {
        sprintf("%s (%0.01f%% of union)\nCalled in %s, filtered in %s:\nAll: %d, Ti/Tv: %0.2f\nKnown: %d, Ti/Tv: %0.2f\nNovel: %d, Ti/Tv: %0.2f\n\nCalled in %s, absent in %s:\nAll: %d, Ti/Tv: %0.2f\nKnown: %d, Ti/Tv: %0.2f\nNovel: %d, Ti/Tv: %0.2f",
            name, 100*(metrics$all + filtered.metrics$all)/union$all.withfiltered,

            name, othername,
            filtered.metrics$all, filtered.metrics$all.titv,
            filtered.metrics$known, filtered.metrics$known.titv,
            filtered.metrics$novel, filtered.metrics$novel.titv,

            name, othername,
            metrics$all, metrics$all.titv,
            metrics$known, metrics$known.titv,
            metrics$novel, metrics$novel.titv
        );
    }
}
|
||||
|
||||
# Draw the report cover page: a fixed heading followed by the title, the author
# and the current date, rendered with textplot().
plot.titlePage <- function(title, author) {
    coverText = sprintf("Automated Variant Report\n\n%s\n%s\n%s\n", title, author, Sys.Date());
    textplot(coverText);
}
|
||||
|
||||
# Build one row of the variant summary table for a jexl expression.
#
# Looks up the "called" rows of eval$CountVariants and eval$TiTv for the expression and
# returns cbind(all count, known count, known Ti/Tv, novel count, novel Ti/Tv), with the
# Ti/Tv ratios formatted to two decimals.
.plot.variantTable.getRowText <- function(eval, jexl_expression) {
    # Value of `column` in the called rows of `tbl` for this expression and novelty class.
    calledValue <- function(tbl, novelty, column) {
        tbl[which(tbl$jexl_expression == jexl_expression & tbl$filter_name == "called" & tbl$novelty_name == novelty),][[column]];
    }

    # These local names are kept as-is: cbind() derives column names from them.
    allVariants = calledValue(eval$CountVariants, "all", "nVariantLoci");
    knownVariants = calledValue(eval$CountVariants, "known", "nVariantLoci");
    novelVariants = calledValue(eval$CountVariants, "novel", "nVariantLoci");

    knownTiTv = calledValue(eval$TiTv, "known", "ti.tv_ratio");
    novelTiTv = calledValue(eval$TiTv, "novel", "ti.tv_ratio");

    cbind(allVariants, knownVariants, sprintf("%0.2f", knownTiTv), novelVariants, sprintf("%0.2f", novelTiTv));
}
|
||||
|
||||
# Render the five-row variant summary table (B-only, B called / A filtered, intersection,
# A called / B filtered, A-only) with textplot().
#
# eval:  list providing CallsetOnlyNames, CallsetFilteredNames and the tables read by
#        .plot.variantTable.getRowText.
# title: accepted but not used by this function.
plot.variantTable <- function(eval, title) {
    onlyNames = eval$CallsetOnlyNames;
    filteredNames = eval$CallsetFilteredNames;

    # jexl expressions in the order the rows appear in the table.
    rowExpressions = list(onlyNames[2], filteredNames[2], "Intersection", filteredNames[1], onlyNames[1]);
    rowTexts = lapply(rowExpressions, function(expression) .plot.variantTable.getRowText(eval, expression));

    variantsummary = as.data.frame(do.call(rbind, rowTexts));

    rownames(variantsummary) = c(
        sprintf("Called in %s, absent in %s", onlyNames[2], onlyNames[1]),
        sprintf("Called in %s, filtered in %s", onlyNames[2], onlyNames[1]),
        "Intersection",
        sprintf("Called in %s, filtered in %s", onlyNames[1], onlyNames[2]),
        sprintf("Called in %s, absent in %s", onlyNames[1], onlyNames[2])
    );
    colnames(variantsummary) = c("counts (all)", "counts (known)", "ti/tv (known)", "counts (novel)", "ti/tv (novel)");

    textplot(variantsummary);
}
|
||||
|
||||
# Draw a two-set Venn diagram of callset concordance with a text label for each callset
# and for the intersection.
#
# eval: list providing CallsetNames, CallsetOnlyNames, CallsetFilteredNames and the tables
#       read by eval.getMetrics.
# col:  colours handed to gsa.plot.venn.
plot.callsetConcordance <- function(eval, col=c("#FF6342", "#63C6DE", "#ADDE63")) {
    metricsFor <- function(expression) eval.getMetrics(eval, expression);

    intersection = metricsFor("Intersection");
    aonly = metricsFor(eval$CallsetOnlyNames[1]);
    aonly.filtered = metricsFor(eval$CallsetFilteredNames[1]);
    bonly = metricsFor(eval$CallsetOnlyNames[2]);
    bonly.filtered = metricsFor(eval$CallsetFilteredNames[2]);

    calledAnywhere = intersection$all + aonly$all + bonly$all;
    unionCounts = list(
        all = calledAnywhere,
        all.withfiltered = calledAnywhere + aonly.filtered$all + bonly.filtered$all
    );

    # Circle sizes include each callset's own calls that the other callset filtered.
    sizeA = aonly$all + intersection$all + aonly.filtered$all;
    sizeB = bonly$all + intersection$all + bonly.filtered$all;
    gsa.plot.venn(sizeA, sizeB, 0, intersection$all, 0, 0, pos=c(0.32, 0.32, 0.68, 0.70), col=col);

    labelA = .plot.callsetConcordance.getLabelText(eval$CallsetNames[1], eval$CallsetNames[2], aonly, aonly.filtered, unionCounts);
    labelBoth = .plot.callsetConcordance.getLabelText("Intersection", NA, intersection, NA, unionCounts);
    labelB = .plot.callsetConcordance.getLabelText(eval$CallsetNames[2], eval$CallsetNames[1], bonly, bonly.filtered, unionCounts);

    text(0, 0.45, cex=1.2, pos=4, labelA);
    text(0.5, 0.75, cex=1.2, adj=c(0.5, 0.33), labelBoth);
    text(1, 0.45, cex=1.2, pos=2, labelB);
}
|
||||
|
||||
# Stacked bar chart of callset concordance per allele count.
#
# eval:         list providing CallsetOnlyNames, CallsetFilteredNames and MetricsByAc.
# normalize:    when TRUE each bar is scaled to fractions of its total; otherwise raw counts.
# novelty_name: novelty class requested for the only-called and intersection categories.
# col:          five colours, one per stacked category (A-only first, B-only last).
#
# NOTE(review): the two "filtered" categories are fetched without novelty_name, so they
# use eval.getMetricsByAc's default class -- confirm that this is intended.
plot.callsetConcordanceByAC <- function(eval, normalize=TRUE, novelty_name="all", col=c("#FF6342", "#FF9675", "#5C92A4", "#88EEFF", "#55BBFF")) {
    intersection = eval.getMetricsByAc(eval, "Intersection", novelty_name);

    # A category without rows borrows the intersection's rows with its counts set to zero,
    # so that every category lines up with intersection$AC.
    orZeroCounts <- function(piece) {
        if (length(intersection$AC) > 0 && length(piece$AC) == 0) {
            piece = intersection;
            piece$n = 0;
        }
        piece;
    }

    aonly = orZeroCounts(eval.getMetricsByAc(eval, eval$CallsetOnlyNames[1], novelty_name));
    aonly.filtered = orZeroCounts(eval.getMetricsByAc(eval, eval$CallsetFilteredNames[1]));
    bonly = orZeroCounts(eval.getMetricsByAc(eval, eval$CallsetOnlyNames[2], novelty_name));
    bonly.filtered = orZeroCounts(eval.getMetricsByAc(eval, eval$CallsetFilteredNames[2]));

    title = paste("Callset concordance per allele count (", novelty_name, " variants)", sep="");
    alleleCounts = intersection$AC;
    xRange = c(1, 1.2*max(alleleCounts));

    if (normalize == TRUE) {
        norm = aonly$n + aonly.filtered$n + intersection$n + bonly$n + bonly.filtered$n;
        matnorm = rbind(aonly$n/norm, aonly.filtered$n/norm, intersection$n/norm, bonly.filtered$n/norm, bonly$n/norm);

        # The y axis is drawn by hand so its ticks stop at 1 while ylim leaves headroom above.
        fractionTicks = seq(from=0, to=1, by=0.2);
        barplot(matnorm, col=col, xlab="Allele count", ylab="", main=title, names.arg=alleleCounts, xlim=xRange, ylim=c(0, 1.3), border=NA, yaxt="n", cex=1.3, cex.axis=1.3, cex.lab=1.3);
        axis(2, at=fractionTicks, fractionTicks, cex=1.3, cex.axis=1.3);
        mtext("Fraction", side=2, at=0.5, padj=-3.0, cex=1.3);
    } else {
        mat = rbind(aonly$n, aonly.filtered$n, intersection$n, bonly.filtered$n, bonly$n);

        barplot(mat, col=col, xlab="Allele count", ylab="counts", main=title, names.arg=alleleCounts, xlim=xRange, border=NA, cex=1.3, cex.axis=1.3, cex.lab=1.3);
    }

    # Legend entries run from the last stacked category to the first, hence rev(col).
    nameA = eval$CallsetOnlyNames[1];
    nameB = eval$CallsetOnlyNames[2];
    legend(
        "topright",
        c(
            sprintf("Called in %s, absent in %s", nameB, nameA),
            sprintf("Called in %s, filtered in %s", nameB, nameA),
            "Intersection",
            sprintf("Called in %s, filtered in %s", nameA, nameB),
            sprintf("Called in %s, absent in %s", nameA, nameB)
        ),
        fill=rev(col),
        cex=1.3
    );
}
|
||||
|
||||
# Log-log plot of the number of variants per allele count for the intersection and for
# each callset, with and without the variants the other callset filtered.
#
# eval:         list providing CallsetOnlyNames, CallsetFilteredNames, MetricsByAc and
#               CountVariants (its nProcessedLoci column scales the y axis).
# novelty_name: novelty class requested for the only-called and intersection categories.
# col:          five colours; the legend uses them in reverse order.
#
# Changes from the previous version: the dashed "A, absent in B" curve now uses col[2] so
# that it matches its legend entry (it was drawn with col[1]); the legend style vectors no
# longer carry a sixth entry left over from a removed "neutral expectation" curve; an
# unused eval.getMetrics lookup was dropped.
#
# NOTE(review): the two "filtered" categories are fetched without novelty_name, so they
# use eval.getMetricsByAc's default class -- confirm that this is intended.
plot.alleleCountSpectrum <- function(eval, novelty_name="all", col=c("#FF6342", "#FF9675", "#5C92A4", "#88EEFF", "#55BBFF")) {
    intersection = eval.getMetricsByAc(eval, "Intersection", novelty_name);

    # A category without rows borrows the intersection's rows with its counts set to zero.
    orZeroCounts <- function(piece) {
        if (length(intersection$AC) > 0 && length(piece$AC) == 0) {
            piece = intersection;
            piece$n = 0;
        }
        piece;
    }

    aonly = orZeroCounts(eval.getMetricsByAc(eval, eval$CallsetOnlyNames[1], novelty_name));
    aonly.filtered = orZeroCounts(eval.getMetricsByAc(eval, eval$CallsetFilteredNames[1]));
    bonly = orZeroCounts(eval.getMetricsByAc(eval, eval$CallsetOnlyNames[2], novelty_name));
    bonly.filtered = orZeroCounts(eval.getMetricsByAc(eval, eval$CallsetFilteredNames[2]));

    title = paste("Allele count spectrum (", novelty_name, " variants)", sep="");

    loci = (unique(eval$CountVariants$nProcessedLoci))[1];
    ymax = 10*max((1/1000)*loci*(1/c(1:max(intersection$AC))));

    # Warnings are suppressed on every drawing call, as the original did; presumably the
    # log axes complain about zero values -- confirm.
    suppressWarnings(plot(0, 0, type="n", xlim=c(1, length(intersection$AC)), ylim=c(1, ymax), xlab="Allele count", ylab="Number of variants", main=title, log="xy", bty="n", cex=1.3, cex.lab=1.3, cex.axis=1.3));
    suppressWarnings(points(intersection$AC, aonly$n + aonly.filtered$n + intersection$n, type="l", lwd=2, col=col[1]));
    suppressWarnings(points(intersection$AC, aonly$n + intersection$n, type="l", lwd=2, lty=2, col=col[2]));
    suppressWarnings(points(intersection$AC, intersection$n, type="l", lwd=2, col=col[3]));
    suppressWarnings(points(intersection$AC, bonly$n + intersection$n, type="l", lwd=2, lty=2, col=col[4]));
    suppressWarnings(points(intersection$AC, bonly$n + bonly.filtered$n + intersection$n, type="l", lwd=2, col=col[5]));

    nameA = eval$CallsetOnlyNames[1];
    nameB = eval$CallsetOnlyNames[2];
    legend(
        "bottomleft",
        c(
            sprintf("Intersection + called in %s, absent or filtered in %s", nameB, nameA),
            sprintf("Intersection + called in %s, absent in %s", nameB, nameA),
            "Intersection",
            sprintf("Intersection + called in %s, absent in %s", nameA, nameB),
            sprintf("Intersection + called in %s, absent or filtered in %s", nameA, nameB)
        ),
        lwd=c(2, 2, 3, 2, 2),
        lty=c(1, 2, 1, 2, 1),
        col=rev(col),
        cex=1.3
    );
}
|
||||
|
||||
# Select the rows of eval$MetricsByAc for one jexl expression and novelty class.
#
# Keeps rows with evaluation_name "eval", comparison_name "dbsnp", filter_name "called",
# novelty_name equal to `novelty`, and jexl_expression equal to `jexl` (both compared as
# character). Rows where the combined condition is NA are dropped.
# Returns the selected rows invisibly.
eval.getMetricsByAc <- function(eval, jexl, novelty="all") {
    byAc = eval$MetricsByAc;

    keep = byAc$evaluation_name == "eval" &
           byAc$comparison_name == "dbsnp" &
           as.character(byAc$jexl_expression) == as.character(jexl) &
           byAc$filter_name == "called" &
           byAc$novelty_name == novelty;

    invisible(byAc[keep & !is.na(keep), , drop=FALSE]);
}
|
||||
|
||||
# Plot the Ti/Tv ratio per allele count for the intersection and for each callset, with
# and without the variants the other callset filtered.
#
# eval:         list providing CallsetOnlyNames, CallsetFilteredNames and MetricsByAc
#               (columns AC, n, nTi, nTv, Ti.Tv are read).
# novelty_name: novelty class requested for the only-called and intersection categories.
# col:          five colours; the legend uses them in reverse order.
#
# Changes from the previous version: the two "absent or filtered" curves now include the
# only-called counts (only + filtered + intersection), as their legend text describes.
# Previously they plotted filtered + intersection only, although the combined ratios were
# already being computed. Unused min/max/finite helper values were removed.
#
# NOTE(review): the two "filtered" categories are fetched without novelty_name, so they
# use eval.getMetricsByAc's default class -- confirm that this is intended.
plot.titvSpectrum <- function(eval, novelty_name="all", col=c("#FF6342", "#FF9675", "#5C92A4", "#88EEFF", "#55BBFF")) {
    intersection = eval.getMetricsByAc(eval, "Intersection", novelty_name);

    # A category without rows borrows the intersection's rows with all of its counts zeroed.
    orZeroCounts <- function(piece) {
        if (length(intersection$AC) > 0 && length(piece$AC) == 0) {
            piece = intersection;
            piece$n = 0;
            piece$nTi = 0;
            piece$nTv = 0;
        }
        piece;
    }

    aonly = orZeroCounts(eval.getMetricsByAc(eval, eval$CallsetOnlyNames[1], novelty_name));
    aonly.filtered = orZeroCounts(eval.getMetricsByAc(eval, eval$CallsetFilteredNames[1]));
    bonly = orZeroCounts(eval.getMetricsByAc(eval, eval$CallsetOnlyNames[2], novelty_name));
    bonly.filtered = orZeroCounts(eval.getMetricsByAc(eval, eval$CallsetFilteredNames[2]));

    title = paste("Ti/Tv spectrum (", novelty_name, " variants)", sep="");

    # Ratios are formed from summed transition and transversion counts.
    titv.aonly.withfiltered = (aonly$nTi + aonly.filtered$nTi + intersection$nTi)/(aonly$nTv + aonly.filtered$nTv + intersection$nTv);
    titv.aonly = (aonly$nTi + intersection$nTi)/(aonly$nTv + intersection$nTv);
    titv.bonly = (bonly$nTi + intersection$nTi)/(bonly$nTv + intersection$nTv);
    titv.bonly.withfiltered = (bonly$nTi + bonly.filtered$nTi + intersection$nTi)/(bonly$nTv + bonly.filtered$nTv + intersection$nTv);

    plot(0, 0, type="n", xlim=c(1, length(intersection$AC)), ylim=c(0, 4), xlab="Allele count", ylab="Transition/transversion (Ti/Tv) ratio", main=title, bty="n", cex=1.3, cex.lab=1.3, cex.axis=1.3);
    points(intersection$AC, titv.aonly.withfiltered, type="l", lwd=2, col=col[1]);
    points(intersection$AC, titv.aonly, type="l", lwd=2, lty=2, col=col[2]);
    points(intersection$AC, intersection$Ti.Tv, type="l", lwd=2, col=col[3]);
    points(intersection$AC, titv.bonly, type="l", lwd=2, lty=2, col=col[4]);
    points(intersection$AC, titv.bonly.withfiltered, type="l", lwd=2, col=col[5]);

    # Dashed horizontal guides at 2.3 and 3.3, each labelled in the right margin.
    abline(h=2.3, lty=2);
    mtext("2.3", side=4, at=2.3, cex=0.9);

    abline(h=3.3, lty=2);
    mtext("3.3", side=4, at=3.3, cex=0.9);

    nameA = eval$CallsetOnlyNames[1];
    nameB = eval$CallsetOnlyNames[2];
    legend(
        "topleft",
        c(
            sprintf("Intersection + called in %s, absent or filtered in %s", nameB, nameA),
            sprintf("Intersection + called in %s, absent in %s", nameB, nameA),
            "Intersection",
            sprintf("Intersection + called in %s, absent in %s", nameA, nameB),
            sprintf("Intersection + called in %s, absent or filtered in %s", nameA, nameB)
        ),
        lwd=c(2, 2, 3, 2, 2),
        lty=c(1, 2, 1, 2, 1),
        col=rev(col),
        cex=1.3
    );
}
|
||||
|
||||
# Plot the number of called variants per sample (all, known and novel), with samples
# ordered by decreasing "all" count.
#
# eval: list whose MetricsBySample entry is a data frame with evaluation_name,
#       comparison_name, jexl_expression, filter_name, novelty_name, sample and nVariants
#       columns. Nothing is drawn when that entry is not a data frame.
#
# Changes from the previous version: the guard no longer passes is.na(<data frame>) to
# `if` (a condition of length > 1, an error since R 4.2); known and novel counts are
# matched to the plotted sample order by sample name instead of reusing row indices taken
# from the "all" table.
plot.variantsPerSample2 <- function(eval) {
    bySample = eval$MetricsBySample;
    if (!is.data.frame(bySample)) {
        return(invisible(NULL));
    }

    # Called rows of the unstratified ("none") eval-vs-dbsnp comparison for one novelty class.
    forNovelty <- function(novelty) {
        bySample[which(bySample$evaluation_name == "eval" & bySample$comparison_name == "dbsnp" & as.character(bySample$jexl_expression) == "none" & bySample$filter_name == "called" & bySample$novelty_name == novelty),];
    }
    metrics.all = forNovelty("all");
    metrics.known = forNovelty("known");
    metrics.novel = forNovelty("novel");

    title = "Calls per sample";
    indices = order(metrics.all$nVariants, decreasing=TRUE);
    orderedSamples = (metrics.all$sample)[indices];
    positions = seq_along(orderedSamples);

    # Samples missing from the known/novel tables yield NA and are simply not drawn.
    knownCounts = (metrics.known$nVariants)[match(orderedSamples, metrics.known$sample)];
    novelCounts = (metrics.novel$nVariants)[match(orderedSamples, metrics.novel$sample)];

    plot(0, 0, type="n", xaxt="n", xlim=c(1, length(orderedSamples)), ylim=c(0, max(metrics.all$nVariants)), xlab="", ylab="Number of variants", main=title, bty="n");
    points(positions, (metrics.all$nVariants)[indices], pch=21, col="black");
    points(positions, knownCounts, pch=21, col="blue");
    points(positions, novelCounts, pch=21, col="red");

    legend("topright", c("All", "Known", "Novel"), pch=21, col=c("black", "blue", "red"));

    axis(1, at=positions, labels=orderedSamples, las=2, cex.axis=0.4);
}
|
||||
|
||||
# Plot per-sample variant counts (left axis, open circles) overlaid with per-sample Ti/Tv
# ratios (right axis, filled circles), with samples ordered by decreasing variant count.
#
# eval:         list whose SimpleMetricsBySample entry is a data frame with
#               evaluation_name, comparison_name, jexl_expression, filter_name,
#               novelty_name, row, CountVariants and TiTvRatio columns. Nothing is drawn
#               when that entry is not a data frame.
# novelty_name: novelty class to plot.
#
# Change from the previous version: the guard no longer passes is.na(<data frame>) to
# `if` (a condition of length > 1, an error since R 4.2).
plot.variantsPerSample <- function(eval, novelty_name="all") {
    bySample = eval$SimpleMetricsBySample;
    if (!is.data.frame(bySample)) {
        return(invisible(NULL));
    }

    metrics = bySample[which(bySample$evaluation_name == "eval" & bySample$comparison_name == "dbsnp" & as.character(bySample$jexl_expression) == "none" & bySample$filter_name == "called" & bySample$novelty_name == novelty_name),];

    title = paste("Calls per sample (", novelty_name, ")", sep="");
    indices = order(metrics$CountVariants, decreasing=TRUE);
    positions = c(1:length(metrics$row));

    # Margins are changed for this page and the saved parameters are restored at the end.
    par.def = par(no.readonly = TRUE);
    par(mar=c(5, 4, 4, 4));

    plot(0, 0, type="n", xaxt="n", xlim=c(1, length(metrics$row)), ylim=c(0, max(metrics$CountVariants)), xlab="", ylab="Number of variants", main=title, bty="n");
    points(positions, (metrics$CountVariants)[indices], pch=21, col="black");

    axis(1, at=positions, labels=(metrics$row)[indices], las=2, cex.axis=0.4);

    # Second plot drawn over the first, using its own y scale for the Ti/Tv ratios.
    par(new=TRUE);
    plot(0, 0, type="n", xaxt="n", yaxt="n", xlim=c(1, length(metrics$row)), ylim=c(min(metrics$TiTvRatio), 1.2*max(metrics$TiTvRatio)), xlab="", ylab="", main=title, bty="n");
    points(positions, (metrics$TiTvRatio)[indices], pch=19, col="black");

    titvaxis = c(min(metrics$TiTvRatio), max(metrics$TiTvRatio));
    axis(4, at=titvaxis, labels=titvaxis, las=2);

    par(par.def);
}
|
||||
|
||||
# Driver: read the VariantEval output named on the command line and write the report PDF.
argspec = list(
    evalRoot = list(value = NA, doc = "Path to the VariantEval R-output (omit the '.Analysis_Type.csv' part of the filename)"),
    plotOut = list(value = NA, doc = "Path to the output PDF file"),
    title = list(value = NA, doc = "The title of the report"),
    author = list(value = NA, doc = "The author of the report")
);

cmdargs = gsa.getargs(argspec, doc="Take VariantEval R-output and generate a series of plots summarizing the contents");

eval = gsa.read.eval(cmdargs$evalRoot);

pdf(cmdargs$plotOut, width=10, height=10);

plot.titlePage(cmdargs$title, cmdargs$author);

plot.variantTable(eval);

if (length(eval$CallsetNames) > 0) {
    # Venn diagram
    plot.callsetConcordance(eval);

    # Venn by AC (normalized)
    for (noveltyClass in c("all", "known", "novel")) {
        plot.callsetConcordanceByAC(eval, novelty_name=noveltyClass);
    }

    # Venn by AC (unnormalized)
    for (noveltyClass in c("all", "known", "novel")) {
        plot.callsetConcordanceByAC(eval, novelty_name=noveltyClass, normalize=FALSE);
    }

    # Allele count spectrum
    for (noveltyClass in c("all", "known", "novel")) {
        plot.alleleCountSpectrum(eval, novelty_name=noveltyClass);
    }

    # Ti/Tv spectrum
    for (noveltyClass in c("all", "known", "novel")) {
        plot.titvSpectrum(eval, novelty_name=noveltyClass);
    }

    # Per-sample
    #plot.variantsPerSample(eval);
} else {
    # Per-sample plots are currently disabled for single-callset input.
    #plot.variantsPerSample(eval, novelty_name="all");
    #plot.variantsPerSample(eval, novelty_name="known");
    #plot.variantsPerSample(eval, novelty_name="novel");
}

dev.off();
|
||||
|
|
@ -1,61 +0,0 @@
|
|||
#!/usr/bin/env Rscript
|
||||
|
||||
# Usage: <script> base_name input
# Reads a table with filter_type, known_titv and novel_titv columns and writes two PNGs:
#   <base_name>.histograms.png          known/novel Ti/Tv histograms
#   <base_name>.novel_vs_known_titv.png known and novel Ti/Tv per sample, sorted by novel Ti/Tv
args <- commandArgs(TRUE)

base_name = args[1]
input = args[2]

d <- read.table(input, header=T)

# separate the data into filtered and unfiltered
d.filtered <- d[d$filter_type=="filtered",]
d.unfiltered <- d[d$filter_type=="unfiltered",]

# Prefer the filtered rows for the per-sample plot whenever any exist.
d.display <- if (nrow(d.filtered) > 0) d.filtered else d.unfiltered

#
# Plot histograms of the known versus novel Ti/Tv
#
outfile = paste(base_name, ".histograms.png", sep="")

# One row of panels per filter type present.
nFilterTypes <- if (nrow(d.filtered) > 0) 2 else 1

bitmap(outfile, width=600, height=(300 * nFilterTypes), units="px")
par(cex=1.1, mfrow=c(1 * nFilterTypes,2))

nbreaks <- 20
color <- "grey"
xlim <- c(0,4)

hist(d.unfiltered$known_titv, nbreaks, col=color, xlim=xlim)
hist(d.unfiltered$novel_titv, nbreaks, col=color, xlim=xlim)

if (nrow(d.filtered) > 0) {
    hist(d.filtered$known_titv, nbreaks, col=color, xlim=xlim)
    hist(d.filtered$novel_titv, nbreaks, col=color, xlim=xlim)
}

dev.off()

#
# Plot samples in order of novel Ti/Tv versus known Ti/Tv
#
outfile = paste(base_name, ".novel_vs_known_titv.png", sep="")

bitmap(outfile, width=600, height=600, units="px")

d.display <- d.display[order(d.display$novel_titv),]
plot(1:length(d.display$known_titv),d.display$known_titv,type="b",col="blue",ylim=c(0,4), xlab="Sample #", ylab="Ti / Tv")
points(1:length(d.display$novel_titv),d.display$novel_titv,type="b",col="red",ylim=c(0,4))
legend("bottomright", c("known","novel"), col=c("blue","red"), pch=21)

dev.off()
|
||||
|
|
@ -1,15 +0,0 @@
|
|||
#!/bin/env Rscript
|
||||
|
||||
# Usage: <script> input_table output_pdf title
# Draws one box per operation of the walltime column on a log-scaled y axis.
args <- commandArgs(TRUE)
verbose = TRUE

outfile = args[2]
title = args[3]
d = read.table(args[1], header=TRUE)

# -----------------------------------------------------------------------------------------------
# plot timing
# -----------------------------------------------------------------------------------------------
pdf(outfile, height=5, width=8)
boxplot(d$walltime ~ d$operation,
        main=title,
        ylab = "Elapsed wall time in seconds [Log10 Scale]",
        log="y",
        cex.axis=0.75)
dev.off()
|
||||
|
|
@ -1,72 +0,0 @@
|
|||
require("plotrix")
|
||||
# Setup: in command-line mode (args: table, output pdf, info string) read the table and
# open the PDF device, then derive the variant flags and the distinct parameter values
# present in the data.
args = commandArgs(TRUE);

onCMDLine = ! is.na(args[1])

file = if ( onCMDLine ) args[1] else "sim_calls.table"
info = if ( onCMDLine ) args[3] else "interactive R"

# NOTE(review): `d` is only read in command-line mode; interactively it must already
# exist in the workspace and the default `file` is never loaded -- confirm this is intended.
if ( onCMDLine ) {
    d <- read.table(file, header=T)
    pdf(args[2])
}

# A site counts as variant when its allele count is positive.
d$sim.VAR <- d$sim.AC > 0
d$called.VAR <- d$called.AC > 0

QS = unique(d$sim.Q)
MODES = unique(d$sim.MODE)
# called.AN is halved to obtain a sample count (presumably diploid samples -- confirm).
NS = unique(d$called.AN / 2)
DEPTHS = unique(d$sim.DP)
|
||||
|
||||
# Start a new, otherwise empty page that carries only a section title.
#
# name: title text.
#
# Fix: the margins were previously requested with par("mar", c(5, 4, 4, 2)). With an
# unnamed numeric argument par() sets nothing and only warns; the parameter has to be
# passed by name for the margins to be applied.
addSection <- function(name) {
    par(mar = c(5, 4, 4, 2))
    frame()
    title(name, cex=2)
}
|
||||
|
||||
# Title page, then one result row per (Q, mode, nSamples, depth) combination with the
# rates initialised to zero; they are filled in later.
addSection(paste("Calling performance report: nSamples = ", NS, "\n info:", info))

results <- expand.grid(Q = QS, mode = MODES, nSamples = NS, depth = DEPTHS)
results[["sensitivity"]] <- 0
results[["specificity"]] <- 0
|
||||
|
||||
# Compute calling sensitivity and specificity for one (Q, mode, depth) combination.
#
# raw:   data frame with sim.Q, sim.MODE, sim.DP, called.VAR and sim.VAR columns.
# Q, mode, depth: values selecting the rows of raw to evaluate.
#
# Returns list(sensitivity, specificity, ct), where ct is the called-vs-simulated
# contingency table (including an NA level). The selection summary and the table are
# printed as a side effect.
#
# Fix: the table is now built over the fixed levels FALSE/TRUE and indexed by name.
# Previously ct[2,2] and ct[1,1] were positional, so a subset containing only variant
# (or only non-variant) sites shifted the rows/columns and gave wrong rates.
determineRates <- function(raw, Q, mode, depth) {
    sub <- subset(raw, sim.Q == Q & sim.MODE == mode & sim.DP == depth)
    print(c(Q,mode,depth, dim(sub)))

    calledVar <- factor(sub$called.VAR, levels = c(FALSE, TRUE))
    simVar <- factor(sub$sim.VAR, levels = c(FALSE, TRUE))
    ct <- table(calledVar, simVar, dnn = c("called.VAR", "sim.VAR"), useNA = "always")
    print(ct)

    # Denominators are whole simulated-truth columns, so sites whose called status is NA
    # count against the rate, as before.
    sensitivity = ct["TRUE", "TRUE"] / sum(ct[, "TRUE"])
    specificity = ct["FALSE", "FALSE"] / sum(ct[, "FALSE"])
    list(sensitivity = sensitivity, specificity = specificity, ct = ct)
}
|
||||
|
||||
# Fill in the sensitivity/specificity of every parameter combination.
for ( i in 1:nrow(results) ) {
    rates <- determineRates(d, results$Q[i], results$mode[i], results$depth[i])
    results$sensitivity[i] = rates$sensitivity
    results$specificity[i] = rates$specificity
}

# Called vs simulated allele count, one page per depth, with the identity line in red.
for ( depth in DEPTHS ) {
    atDepth <- subset(d, called.DP == depth * NS)
    boxplot(called.AC ~ sim.AC, data = atDepth, main = paste("Depth of coverage ", depth), xlab = "Simulation AC", ylab = "Called AC", outwex=0.5, col = "cornflowerblue")
    abline(a=0,b=1,col="red",lwd=3)
}
print(results)

# Per Q score: sensitivity (top panel) and specificity (bottom panel) against depth.
par(mfcol=c(2,1))
for ( qScore in QS ) {
    x <- subset(results, Q == qScore)
    print(x)
    plot(x$depth, x$sensitivity, type="b", main = paste("Q score", qScore), xlab = "Depth", ylab="Sensitivity")
    plot(x$depth, x$specificity, type="b", xlab = "Depth", ylab="Specificity")
}

# Final page: the results table drawn onto an empty plot.
par(mfcol=c(1,1))
plot(0,0, type="n", frame.plot=F, ann=F, axes=F)
summaryTable <- data.frame(Q=results$Q, mode=results$mode, depth=results$depth, sensitivity=format(results$sensitivity, digits=2), specificity = format(results$specificity, digits=2))
addtable2plot(-1, -1, summaryTable)

if ( onCMDLine ) dev.off()
|
||||
|
||||
|
|
@ -1,58 +0,0 @@
|
|||
# Per-sample QC report.
# Command-line mode (args: input TSV, output PDF) writes a PDF containing a boxplot of
# fingerprint LOD scores followed by one scatter plot per metric, highlighting the
# samples that appear in the input TSV. Without arguments a single plot is drawn from
# built-in defaults.
#
# Fix: every qplot() inside the braced block is now wrapped in print(). A ggplot object
# is only drawn when it is printed, and auto-printing does not apply to expressions
# inside { }, so previously none of the scatter plots reached the PDF.
args = commandArgs(TRUE)
onCMDLine = ! is.na(args[1])

# The reference table is the same in both modes.
reference_dataset = '/Users/mhanna/metrics.perSample.formatted.table'
if ( onCMDLine ) {
    inputTSV = args[1]
    outputPDF = args[2]
} else {
    inputTSV = 'GoT2D_exomes_batch_005.tsv'
    outputPDF = 'T2D.pdf'
}

require('ggplot2')

data <- read.table(inputTSV,header=T)

complete <- read.table(reference_dataset,header=T)
novel <- subset(complete,exon_intervals == "whole_exome_agilent_1.1_refseq_plus_3_boosters"&Novelty=="novel"&FunctionalClass=="all")
# TRUE for reference samples that also appear in the input TSV; used as the colour aesthetic.
selected_samples <- novel$Sample %in% data$sample
novel_with_highlights <- cbind(novel,selected_samples)

if(onCMDLine) {
    # NOTE(review): each FINGERPRINT_LODS cell is evaluated as R code. That is only
    # acceptable for trusted input files -- consider parsing the values explicitly.
    fingerprint_lods = list()
    for(i in 1:nrow(data)) {
        fingerprint_lods[[as.character(data$sample[i])]] <- eval(parse(text=data$FINGERPRINT_LODS[i]))
    }

    # Samples are shown in order of increasing median LOD.
    fingerprint_lod_order = order(unlist(lapply(fingerprint_lods,median),use.names=F))

    pdf(outputPDF)
    boxplot(fingerprint_lods[fingerprint_lod_order],las=3,main='Fingerprint LOD Scores By Sample',xlab='Sample',ylab='LOD Score Distribution',cex.axis=0.65)

    print(qplot(Sample,Selected_Bases_Pct,data=novel_with_highlights,color=selected_samples) + opts(title='On+Near Bait Bases/PF Bases Aligned per Sample'))
    print(qplot(Sample,Mean_Target_Coverage,data=novel_with_highlights,color=selected_samples) + opts(title='Mean Target Coverage per Sample'))
    print(qplot(Sample,Zero_Coverage_Targets_Pct,data=novel_with_highlights,color=selected_samples) + opts(title='% of Targets with <2x Coverage per Sample'))
    print(qplot(Sample,Fold_80_Base_Penalty,data=novel_with_highlights,color=selected_samples) + opts(title='Fold 80 Base Penalty per Sample'))
    print(qplot(Sample,Target_Bases_20x_Pct,data=novel_with_highlights,color=selected_samples) + opts(title='% Target Bases Achieving >20x Coverage per Sample'))
    print(qplot(Sample,PF_Reads_Pct,data=novel_with_highlights,color=selected_samples) + opts(title='% PF Reads Aligned per Sample'))
    print(qplot(Sample,PF_HQ_Error_Rate,data=novel_with_highlights,color=selected_samples) + opts(title='% HQ Bases mismatching the Reference per Sample'))
    print(qplot(Sample,Mean_Read_Length,data=novel_with_highlights,color=selected_samples) + opts(title='Median Read Length per Sample'))
    print(qplot(Sample,Bad_Cycles,data=novel_with_highlights,color=selected_samples) + opts(title='# Bad Cycles per Sample'))
    print(qplot(Sample,Strand_Balance_Pct,data=novel_with_highlights,color=selected_samples) + opts(title='% PF Reads Aligned to the + Strand per Sample'))
    print(qplot(Sample,Total_SNPs,data=novel_with_highlights,color=selected_samples) + opts(title='# SNPs called per Sample'))
    print(qplot(Sample,dbSNP_Pct,data=novel_with_highlights,color=selected_samples) + opts(title='% SNPs in dbSNP per Sample'))
    print(qplot(PCT_DBSNP,data=data,geom="histogram") + opts(title='% SNPs in dbSNP per Sample'))
    dev.off()
} else {
    print('Plotting command-line arguments')
    print(qplot(Sample,PF_Reads_Pct,data=novel_with_highlights,color=selected_samples) + opts(title='% PF Reads Aligned per Sample'))
}
|
||||
|
||||
#qplot(Sample,Library_Size_HS,data=novel_with_highlights,color=selected_samples) + opts(title='Hybrid Sequencing Library Size per Sample')
|
||||
#qplot(Sample,MEDIAN_INSERT_SIZE,data=novel_with_highlights,color=selected_samples) + opts(title='Median Insert Size per Sample')
|
||||
#qplot(Sample,PCT_CHIMERAS,data=novel_with_highlights,color=selected_samples) + opts(title='% Chimera Read Pairs per Sample')
|
||||
#qplot(Sample,PCT_ADAPTER,data=novel_with_highlights,color=selected_samples) + opts(title='% Unaligned Reads Matching an Adapter Sequence per Sample')
|
||||
#qplot(Sample,NOVEL_SNPS,data=novel_with_highlights,color=selected_samples) + opts(title='# Novel SNPs called per Sample')
|
||||
#qplot(Sample,DBSNP_TITV,data=novel_with_highlights,color=selected_samples) + opts(title='TiTv of SNPs in dbSNP per Sample')
|
||||
298
R/exomeQC.R
298
R/exomeQC.R
|
|
@ -1,298 +0,0 @@
|
|||
library("gsalib", lib.loc="/Users/depristo/Desktop/broadLocal/GATK/trunk/R/")
|
||||
require("ggplot2")
|
||||
require("gplots")
|
||||
|
||||
# TODOs:
|
||||
# Assumes you have indels in your call set. If not you will get errors
|
||||
# Create pre/post calling sections
|
||||
# Allow conditional use of the preQCFile (where it's not available)
|
||||
|
||||
# Script-level switches.
# LOAD_DATA forces the report inputs to be (re)read even when the script is
# sourced interactively; onCMDLine is TRUE when at least one trailing
# command-line argument was supplied.
LOAD_DATA <- TRUE
args <- commandArgs(TRUE)
onCMDLine <- !is.na(args[1])
|
||||
|
||||
# creates an array of c(sampleName1, ..., sampleNameN)
|
||||
parseHighlightSamples <- function(s) {
  # Split a comma-delimited string of sample names into a flat character vector.
  # The comma is matched literally (fixed = TRUE), not as a regular expression.
  pieces <- strsplit(s, ",", fixed = TRUE)
  unlist(pieces)
}
|
||||
|
||||
# Resolve the run parameters, either from the command line or from the
# in-development defaults below.
# Positional arguments: 1 = project name, 2 = VariantEval file root,
# 3 = output PDF, 4 = optional pre-QC metrics file,
# 5 = optional comma-separated list of samples to highlight.
preQCFile <- NA
if (onCMDLine) {
  ProjectName <- args[1]
  VariantEvalRoot <- args[2]
  outputPDF <- args[3]
  if (!is.na(args[4])) {
    preQCFile <- args[4]
  }
  if (is.na(args[5])) {
    highlightSamples <- c()
  } else {
    highlightSamples <- parseHighlightSamples(args[5])
  }
} else {
  # Interactive / development defaults.
  ProjectName <- "InDevelopmentInR"
  preQCFile <- NA # "~/Desktop/broadLocal/GATK/trunk/qcTestData/GoT2D_exomes_batch_005_per_sample_metrics.tsv"
  #VariantEvalRoot <- "qcTestData//ESPGO_Gabriel_NHLBI_eomi_june_2011_batch1"
  VariantEvalRoot <- "qcTestData/MC_Engle_11_Samples_06092011"
  outputPDF <- "bar.pdf"
  highlightSamples <- c() # parseHighlightSamples("29029,47243")
}
|
||||
|
||||
# Echo the effective configuration so it shows up in the run log.
# Each field is printed as "<label> <value>"; a vector value (e.g. several
# highlighted samples) produces one string per element.
echoSetting <- function(label, value) print(paste(label, value))

print("Report")
echoSetting("Project :", ProjectName)
echoSetting("VariantEvalRoot :", VariantEvalRoot)
echoSetting("outputPDF :", outputPDF)
echoSetting("preQCFile :", preQCFile)
echoSetting("highlightSamples :", highlightSamples)
|
||||
|
||||
expandVEReport <- function(d) {
  # Tidy a parsed VariantEval report (a list of tables) for display:
  #   * round the Ti/Tv ratio and the deletion/insertion ratio to 2 decimals
  #   * add CountVariants$nIndels = nInsertions + nDeletions
  # Returns the modified report.
  d$TiTvVariantEvaluator$tiTvRatio <- round(d$TiTvVariantEvaluator$tiTvRatio, digits = 2)

  counts <- d$CountVariants
  counts$deletionInsertionRatio <- round(counts$deletionInsertionRatio, digits = 2)
  counts$nIndels <- counts$nInsertions + counts$nDeletions
  d$CountVariants <- counts

  d
}
|
||||
|
||||
# -------------------------------------------------------
|
||||
# Utilities for displaying multiple plots per page
|
||||
# -------------------------------------------------------
|
||||
|
||||
# Viewport (layout 2 graphs top to bottom)
|
||||
distributePerSampleGraph <- function(distgraph, perSampleGraph, heights = c(2,1)) {
  # Draw two plots stacked vertically on a fresh grid page:
  # perSampleGraph goes in the top row, distgraph in the bottom row.
  # heights gives the relative heights of the (top, bottom) rows.
  grid.newpage()
  pushViewport(viewport(layout = grid.layout(nrow = 2, ncol = 1, heights = heights)))

  rowViewport <- function(row) viewport(layout.pos.row = row, layout.pos.col = 1)
  print(perSampleGraph, vp = rowViewport(1))
  print(distgraph, vp = rowViewport(2))
}
|
||||
|
||||
createMetricsBySites <- function(VariantEvalRoot, PreQCMetrics) {
  # Load the site-level VariantEval reports that share the path prefix
  # VariantEvalRoot and return them as list(bySite = ..., byAC = ...):
  #   bySite -> "<root>.summary.eval", passed through expandVEReport()
  #   byAC   -> "<root>.byAC.eval", with derived nIndels / nSNPs columns and an
  #             AC column aliasing AlleleCount
  # PreQCMetrics is accepted but never used by this function.
  # NOTE(review): gsa.read.gatkreport presumably returns a list of data frames
  # keyed by table name -- confirm against gsalib.
  readReport <- function(suffix) {
    gsa.read.gatkreport(paste(VariantEvalRoot, suffix, sep=""))
  }

  bySite <- expandVEReport(readReport(".summary.eval"))
  byAC <- readReport(".byAC.eval")

  byAC$CountVariants$nIndels <- byAC$CountVariants$nInsertions + byAC$CountVariants$nDeletions
  byAC$TiTvVariantEvaluator$nSNPs <- byAC$TiTvVariantEvaluator$nTi + byAC$TiTvVariantEvaluator$nTv
  byAC$CountVariants$AC <- byAC$CountVariants$AlleleCount
  byAC$TiTvVariantEvaluator$AC <- byAC$TiTvVariantEvaluator$AlleleCount

  list(bySite = bySite, byAC = byAC)
}
|
||||
|
||||
summaryTable <- function(metricsBySites, metricsBySample) {
  # Build the site-level SNP / indel summary: one row per Novelty value, taken
  # from the FunctionalClass == "all" rows of the merged CountVariants and
  # TiTvVariantEvaluator tables. metricsBySample is accepted but not used here.
  siteReport <- metricsBySites$bySite
  combined <- merge(siteReport$CountVariants, siteReport$TiTvVariantEvaluator)
  allClasses <- subset(combined, FunctionalClass=="all")

  measures <- c("nProcessedLoci", "nSNPs", "tiTvRatio", "nIndels", "deletionInsertionRatio")
  molten <- melt(allClasses, id.vars=c("Novelty"), measure.vars=measures)
  result <- cast(molten, Novelty ~ ...)

  # The display names follow the order of `measures` above.
  # (The original author noted that this renaming "doesn't work with textplot".)
  colnames(result) <- c("Novelty", "Target size (bp)", "No. SNPs", "Ti/Tv", "No. Indels", "deletion/insertion ratio")
  result
}
|
||||
|
||||
sampleSummaryTable <- function(metricsBySample) {
  # Per-sample averages of the SNP / indel summary statistics, one row per
  # Novelty value.
  #
  # metricsBySample: per-sample metrics table with Novelty, Sample and the
  #                  measure columns listed below.
  #
  # The previous implementation ignored this argument and always read the
  # global `metricsBySamples`. The argument is now honoured; the global is
  # used only as a fallback when no argument reaches this function (including
  # when a caller forwards its own missing argument), which keeps existing
  # one-argument call chains working.
  if (missing(metricsBySample)) {
    metricsBySample <- metricsBySamples
  }

  measures <- c("nProcessedLoci", "nSNPs", "tiTvRatio", "nIndels", "deletionInsertionRatio")
  raw <- melt(metricsBySample, id.vars=c("Novelty", "Sample"), measure.vars=measures)
  table <- cast(raw, Novelty ~ variable, mean)

  # Counts are shown as whole numbers, ratios with two decimals.
  table$nSNPs <- round(table$nSNPs, 0)
  table$nIndels <- round(table$nIndels, 0)
  table$tiTvRatio <- round(table$tiTvRatio, 2)
  table$deletionInsertionRatio <- round(table$deletionInsertionRatio, 2)

  colnames(table) <- c("Novelty", "Target size (bp)", "No. SNPs", "Ti/Tv", "No. Indels", "deletion/insertion ratio")
  return(table)
}
|
||||
|
||||
overallSummaryTable <- function(metricsBySites, metricsBySamples) {
  # Stack the site-level summary ("Sites") on top of the per-sample averages
  # ("Per-sample avg."), tagging each row with its Metric.Type.
  # metricsBySamples is only forwarded to the two helper functions; it is not
  # evaluated in this function itself.
  sitesSummary <- as.data.frame(summaryTable(metricsBySites, metricsBySamples))
  sitesSummary$Metric.Type <- "Sites"

  sampleSummary <- as.data.frame(sampleSummaryTable(metricsBySamples))
  sampleSummary$Metric.Type <- "Per-sample avg."

  stacked <- rbind(sitesSummary, sampleSummary)
  # Metric.Type is appended as column 7; move it to second position.
  stacked[, c(1, 7, 2:6)]
}
|
||||
|
||||
summaryPlots <- function(metricsBySites) {
  # Emit the site-level summary pages to the current graphics device:
  #   1. SNP and indel counts vs. allele count, by novelty (linear + log x)
  #   2. per-measure counts vs. allele count with a weighted linear fit
  #   3. Ti/Tv ratio vs. allele count (linear + log x)
  #   4. SNP-to-indel ratio vs. allele count
  #   5. SNP counts by functional class
  acAxisLabel <- "Allele count (AC)"
  variantAxisLabel <- "Number of variants"
  countsByAC <- metricsBySites$byAC$CountVariants

  # --- 1. counts by novelty and allele frequency ---------------------------
  name <- "SNP and Indel count by novelty and allele frequency"
  molten <- melt(subset(countsByAC, Novelty != "all" & AC > 0),
                 id.vars=c("Novelty", "AC"), measure.vars=c("nSNPs", "nIndels"))
  linearPlot <- ggplot(data=molten, aes(x=AC, y=value+1, color=Novelty, fill=Novelty), group=variable) +
    opts(title = name) +
    scale_y_log10(variantAxisLabel) +
    geom_point(alpha=0.5, size=3) +
    geom_line(size=1) +
    facet_grid(variable ~ ., scales="free") +
    scale_x_continuous(acAxisLabel)
  logPlot <- linearPlot + scale_x_log10(acAxisLabel) + opts(title = "")
  distributePerSampleGraph(logPlot, linearPlot, c(1,1))

  # --- 2. counts vs. allele frequency, one page per measure ----------------
  name <- "Variant counts by allele count"
  for ( measure in c("nSNPs", "nIndels") ) {
    molten <- melt(subset(countsByAC, AC > 0), id.vars=c("Novelty", "AC"), measure.vars=c(measure))
    countPlot <- ggplot(data=molten, aes(x=AC, y=value+1, color=Novelty), group=variable) +
      opts(title = paste(name, ":", measure)) +
      scale_y_log10(variantAxisLabel) +
      scale_x_log10(acAxisLabel) +
      geom_point(alpha=0.5, size=4) +
      geom_smooth(aes(weight=value), size=1, method="lm", formula = y ~ x) +
      facet_grid(Novelty ~ ., scales="free")
    print(countPlot)
  }

  # --- 3. Ti/Tv by allele count ---------------------------------------------
  name <- "Transition / transversion ratio by allele count"
  # nSNPs > 0 => requires that we have some data here, otherwise Ti/Tv is zero from VE
  minSNPsToInclude <- 0
  byACNoAll <- subset(metricsBySites$byAC$TiTvVariantEvaluator, Novelty != "all" & AC > 0 & nSNPs > minSNPsToInclude)
  linearPlot <- ggplot(data=byACNoAll, aes(x=AC, y=tiTvRatio, color=Novelty)) +
    scale_y_continuous("Transition / transversion ratio", limits=c(0,4)) +
    opts(title = name) +
    geom_smooth(size=2) +
    geom_point(aes(size=log10(nSNPs), weight=nSNPs), alpha=0.5) +
    scale_x_continuous(acAxisLabel)
  logPlot <- linearPlot + scale_x_log10(acAxisLabel) + opts(title = "")
  distributePerSampleGraph(logPlot, linearPlot, c(1,1))

  # --- 4. SNP to indel ratio by allele frequency ----------------------------
  name <- "SNPs to indels ratio by allele frequency"
  countsByAC$SNP.Indel.Ratio <- countsByAC$nSNPs / countsByAC$nIndels
  # Rows without any indel have no meaningful ratio.
  countsByAC$SNP.Indel.Ratio[countsByAC$nIndels == 0] <- NaN
  ratioPlot <- ggplot(data=subset(countsByAC, Novelty == "all" & nSNPs > 0), aes(x=AC, y=SNP.Indel.Ratio)) +
    opts(title = name) +
    scale_y_continuous("SNP to indel ratio") +
    geom_point(alpha=0.5, aes(size=log10(nIndels))) +
    geom_smooth(size=2, aes(weight=nIndels))
  print(ratioPlot)

  # --- 5. SNP counts by functional class ------------------------------------
  name <- "SNP counts by functional class"
  molten <- melt(subset(metricsBySites$bySite$CountVariants, Novelty != "all" & FunctionalClass != "all"),
                 id.vars=c("Novelty", "FunctionalClass"), measure.vars=c("nSNPs"))
  classPlot <- ggplot(data=molten, aes(x=FunctionalClass, y=value, fill=Novelty), group=FunctionalClass) +
    opts(title = name) +
    scale_y_log10("No. of SNPs") +
    geom_bar(position="dodge")
  print(classPlot)
}
|
||||
|
||||
addSection <- function(name) {
  # Start a blank page that carries only a section title.
  #
  # name: the title text.
  #
  # The margins must be set with the name = value form. The previous call
  # par("mar", c(5, 4, 4, 2)) passed the vector as an unnamed argument, which
  # par() does not treat as a setting, so the margins were never changed.
  par(mar = c(5, 4, 4, 2))
  frame()
  title(name, cex=2)
}
|
||||
|
||||
# -------------------------------------------------------
|
||||
# read functions
|
||||
# -------------------------------------------------------
|
||||
|
||||
createMetricsBySamples <- function(VariantEvalRoot) {
  # Load "<VariantEvalRoot>.bySample.eval" and return one merged per-sample
  # metrics table (TiTvVariantEvaluator + CountVariants + CompOverlap, plus the
  # pre-QC metrics when the global `preQCFile` is set).
  #
  # The returned table has
  #   * Sample as a factor whose levels are ordered by each sample's nSNPs in
  #     the Novelty == "all" rows,
  #   * highlight = TRUE for samples listed in the global `highlightSamples`,
  #   * the aggregate pseudo-sample "all" removed.
  bySampleEval <- expandVEReport(gsa.read.gatkreport(paste(VariantEvalRoot, ".bySample.eval", sep="")))
  r <- merge(bySampleEval$TiTvVariantEvaluator, bySampleEval$CountVariants)
  r <- merge(r, bySampleEval$CompOverlap)
  if ( ! is.na(preQCFile) ) {
    preQCMetrics <- read.table(preQCFile, header=T)
    r <- merge(r, preQCMetrics)
  }

  # order the samples by nSNPs -- it's the natural ordering.
  # The ordering is derived from the Novelty == "all" rows, but it is applied to
  # r's OWN Sample column. Assigning the shorter x$Sample vector (as was done
  # before) recycles it across all rows and can attach the wrong sample name to
  # a row. unique() guards against duplicated levels when a sample has several
  # "all" rows (e.g. one per comp track); samples without an "all" row keep
  # their name and sort last.
  x <- subset(r, Novelty=="all")
  orderedNames <- unique(as.character(x$Sample[order(x$nSNPs)]))
  allNames <- unique(c(orderedNames, as.character(r$Sample)))
  r$Sample <- factor(as.character(r$Sample), levels=allNames)

  # add highlight info
  r$highlight <- r$Sample %in% highlightSamples

  #r = merge(merge(preQCMetrics, byACEval$TiTvVariantEvaluator), byACEval$CountVariants)
  return(subset(r, Sample != "all"))
}
|
||||
|
||||
# -------------------------------------------------------
|
||||
# Per sample plots
|
||||
# -------------------------------------------------------
|
||||
|
||||
perSamplePlots <- function(metricsBySamples) {
  # Emit the per-sample pages to the current graphics device:
  #   * for each measure: a density of the measure (by novelty) below a
  #     per-sample plot in which every point is drawn as the sample's name
  #   * the same pair of plots for the dbSNP comp rate ("novelty rate")
  #   * for each novelty class: all measures faceted on a single page
  # Highlighted samples (metricsBySamples$highlight) get a larger text size.

  # 1 = regular, 2 = highlighted
  metricsBySamples$highlightTextSizes <- 1 + as.numeric(metricsBySamples$highlight)

  # Plot components shared by every per-sample plot.
  sampleTextLabel <- geom_text(aes(label=Sample, size=highlightTextSizes))
  sampleTextLabelScale <- scale_size("Highlighted samples", to=c(3,5), breaks=c(1,2), labels=c("regular", "highlighted"))
  # The formatter blanks out the individual tick labels.
  xAxis <- scale_x_discrete("Sample (ordered by nSNPs)", formatter=function(x) "")

  measures <- c("nSNPs", "tiTvRatio", "nSingletons", "nIndels", "deletionInsertionRatio")
  name <- "by sample"

  for ( measure in measures ) {
    molten <- melt(metricsBySamples, id.vars=c("Novelty", "Sample", "highlightTextSizes"), measure.vars=c(measure))

    # distribution
    distribution <- ggplot(data=molten, aes(x=value, group=Novelty, fill=Novelty)) +
      geom_density(alpha=0.5) +
      geom_rug(aes(y=NULL, color=Novelty, position="jitter")) +
      scale_x_continuous(measure)

    bySample <- ggplot(data=molten, aes(x=Sample, y=value, group=Novelty, color=Novelty), y=value) +
      opts(title = paste(measure, name)) +
      geom_smooth(alpha=0.5, aes(group=Novelty)) +
      sampleTextLabel + sampleTextLabelScale +
      facet_grid(Novelty ~ ., scales="free") +
      xAxis

    distributePerSampleGraph(distribution, bySample)
  }

  # known / novel ratio by sample
  # TODO -- would ideally not conflate SNPs and Indels
  d <- subset(metricsBySamples, Novelty == "all" & CompRod == "dbsnp")
  title <- opts(title = "Novelty rate by sample")
  dbsnpAxisLabel <- "Percent of variants in dbSNP"

  # distribution
  distribution <- ggplot(data=d, aes(x=compRate)) +
    geom_density(alpha=0.5) +
    geom_rug(aes(y=NULL, position="jitter")) +
    scale_x_continuous(dbsnpAxisLabel)

  bySample <- ggplot(data=d, aes(x=Sample, y=compRate)) +
    title +
    geom_smooth(alpha=0.5, aes(group=Novelty)) +
    sampleTextLabel + sampleTextLabelScale +
    geom_rug(aes(x=NULL, position="jitter")) +
    xAxis +
    scale_y_continuous(dbsnpAxisLabel)

  distributePerSampleGraph(distribution, bySample)

  for ( novelty in c("all", "known", "novel") ) {
    # TODO -- how can I color it as before?
    # TODO -- add marginal distributions?
    molten <- melt(subset(metricsBySamples, Novelty==novelty), id.vars=c("Sample", "highlightTextSizes"), measure.vars=measures)
    allMeasures <- ggplot(data=molten, aes(x=Sample, y=value)) +
      opts(title = paste(name, ":", novelty)) +
      sampleTextLabel + sampleTextLabelScale +
      facet_grid(variable ~ ., scales="free") +
      xAxis   # how do we remove the labels?
    print(allMeasures)
  }
}
|
||||
|
||||
# -------------------------------------------------------
|
||||
# Actually invoke the above plotting functions
|
||||
# -------------------------------------------------------
|
||||
|
||||
# load the data.
if ( onCMDLine || LOAD_DATA ) {
  metricsBySites <- createMetricsBySites(VariantEvalRoot)
  metricsBySamples <- createMetricsBySamples(VariantEvalRoot)
}

# Everything below is written to outputPDF when one is configured, otherwise
# to whatever graphics device is current.
if ( ! is.na(outputPDF) ) {
  pdf(outputPDF, height=8.5, width=11)
}

# Table of overall counts and quality
# Both metric sets are passed explicitly: overallSummaryTable() declares two
# parameters, and omitting the second only worked because the argument was
# never evaluated and a helper read the global of the same name instead.
textplot(overallSummaryTable(metricsBySites, metricsBySamples), show.rownames=F)
title(paste("Summary metrics for project", ProjectName), cex=3)
# textplot(as.data.frame(sampleSummaryTable(metricsBySamples)), show.rownames=F)
# title(paste("Summary metrics per sample for project", ProjectName), cex=3)

summaryPlots(metricsBySites)
perSamplePlots(metricsBySamples)

if ( ! is.na(outputPDF) ) {
  dev.off()
}
|
||||
|
||||
|
|
@ -1,45 +0,0 @@
|
|||
#########################################################################
# this script generates a plot of sample depth of coverage over the MHC.
# It's rather specific to that use case, but is a good example of getting
# Loess curve generation to work given a X/Y dataset.
#
# 12/9/2009
# -Aaron
#########################################################################

# setup our output PNG
png(filename="bySampleJPName.png",width=1500,height=700,bg="white")

# input our data set
tbl <- read.csv("docOutJP.csv",header=TRUE) # doc_JP_SN_totalled_clean.csv

par(las=1) # make all labels horizontal
par(xpd=T, mar=par()$mar+c(0,0,-2,4)) # adjust the margins to accommodate our legend

# do the initial plot of one column of data
plot(tbl[,1],tbl[,5],xlim=c(18517983,41461957),ylim=c(0,7),type="p",cex=0.2,axes=F,ylab="Average Read Depth Of Coverage",xlab="MHC Location",col=rgb(0,0,0,0.1))

# add the custom x and y axis, so we can control their layout
axis(1,pos=0,at=seq(18517983,42061957,by=500000),col.axis="black")
axis(2,pos=18517983,at=seq(0,7,by=1),col="black")

# setup two color schemes, both with the same colors. One has an alpha of 0.08 for the background points,
# and the other is alpha=1 for the lines (which we want to be vibrant in the foreground)
myColors <- rainbow(30,alpha=0.08)
myColors2 <- rainbow(30)

# add a legend. There is a better way to do this besides hard-coding it, but it wouldn't render correctly on my machine
legend(x=41000000,y=5,c("NA18940","NA18942","NA18943","NA18944","NA18945","NA18947","NA18948","NA18949","NA18951","NA18952","NA18953","NA18956","NA18959","NA18960","NA18961","NA18964","NA18965","NA18967","NA18968","NA18969","NA18970","NA18971","NA18972","NA18973","NA18974","NA18975","NA18976","NA18980","NA18981","NA19005"),horiz=FALSE,lty=c(1),col=c(myColors2),cex=0.8)

# loop over the remaining data sets, adding first the points to the graph, then calculating the loess points, and finally combining the points into a line
# the loess smoothing parts were inspired by: http://research.stowers-institute.org/efg/R/Statistics/loess.htm
# adjust the span value to adjust the sensitivity of curve to the local fit.
#
# The palettes hold 30 colors (indices 1..30) while the data columns run 4..33,
# so the palette index must be shifted. Indexing the palettes with the column
# number itself skipped the first three colors (disagreeing with the legend)
# and produced NA colors for the last three columns.
# NOTE(review): assumes column 4 corresponds to the first legend entry -- confirm against docOutJP.csv.
firstSampleColumn <- 4
for (i in 4:33) {
  colorIndex <- i - firstSampleColumn + 1
  points(tbl[,1],tbl[,i],col=myColors[colorIndex],cex=0.2)
  y.loess <- loess(y ~ x, span=0.05, data.frame(x=tbl[,1], y=tbl[,i]))
  y.predict <- predict(y.loess, data.frame(x=tbl[,1]))
  lines(tbl[,1],y.predict,col=myColors2[colorIndex])
}

# close our png
dev.off()
|
||||
|
|
@ -1,323 +0,0 @@
|
|||
# pOneSiteIsHom = p(top chromosome is ref AND bottom chromosome is ref) + p(top chromosome is var AND bottom chromosome is var)
|
||||
# = (1-theta)^2 + theta^2
|
||||
#
|
||||
# pOneSiteIsHet = p(top chromosome is ref AND bottom chromosome is var) + p(top chromosome is var AND bottom chromosome is ref)
|
||||
# = (1-theta)*theta + theta*(1-theta) = 2*theta*(1-theta)
|
||||
pOneSiteIsHet <- function(theta) {
  # Probability that a single diploid site is heterozygous when each
  # chromosome independently carries the variant with probability theta:
  # (ref, var) or (var, ref), i.e. twice theta * (1 - theta).
  pSpecificOrder <- theta * (1 - theta)
  pSpecificOrder * 2
}
|
||||
|
||||
# p = 2 * theta * (1 - theta)
|
||||
# and mean intra-het distance = 1/p, or d = 1/p
|
||||
# or: p = 1/d
|
||||
# or: 2 * theta * (1 - theta) = 1/d
|
||||
# theta * (1 - theta) = 1/2d
|
||||
# - theta^2 + theta - 1/2d = 0
|
||||
#
|
||||
# Using the quadratic equation:
|
||||
# (- b + (b^2 - 4*a*c)^0.5) / 2a
|
||||
# (-1 + (1 - 2/d)^0.5) / -2
|
||||
meanIntraHetDistanceToTheta <- function(d) {
  # Invert d = 1 / (2 * theta * (1 - theta)) for theta, taking the smaller root
  # of the quadratic: theta = (1 - sqrt(1 - 2/d)) / 2.
  # d: mean distance between consecutive het sites.
  discriminantRoot <- (1 - 2/d)^0.5
  (1 - discriminantRoot) / 2
}
|
||||
|
||||
# For consecutive diploid het sites x and y, P(distance(x,y) = k)
|
||||
# = P(site y is the first het site downstream of x at distance = k | het site x exists at its location).
|
||||
# That is, het site x already "exists", and we want to know what the probability that the NEXT het site (y) is k bases away.
|
||||
pHetPairAtDistance <- function(k, theta) {
  # Density of the distance k from a het site to the next het site downstream,
  # modelled as exponential with rate equal to the per-site het probability.
  dexp(k, rate = pOneSiteIsHet(theta))
}
|
||||
|
||||
# Since the geometric/exponential distribution is "memory-free", can simply multiply the (independent) probabilities for the distances:
|
||||
pHetPairsAtDistances <- function(dists, theta) {
  # Joint density of a sequence of consecutive het-het distances: the
  # distances are treated as independent, so the densities multiply.
  perDistanceDensity <- pHetPairAtDistance(dists, theta)
  prod(perDistanceDensity)
}
|
||||
|
||||
# Sample numDists distances from the intra-het distance distribution.
|
||||
# [since the geometric/exponential distribution is "memory-free", can simply **independently** sample from the distribution]:
|
||||
sampleIntraHetDistances <- function(numDists, theta) {
  # Draw numDists independent het-het distances from the exponential model.
  continuousDraws <- rexp(numDists, rate = pOneSiteIsHet(theta))
  # round up to get whole-number distances starting from 1
  ceiling(continuousDraws)
}
|
||||
|
||||
# For consecutive diploid het sites x and y, P(distance(x,y) <= k)
|
||||
pHetPairLteDistance <- function(k, theta) {
  # P(distance between consecutive het sites <= k), vectorized over k.
  #
  # Although the real minimum distance starts with 1 (geometric distribution),
  # the exponential distribution approximation starts with 0, so this is the
  # exponential CDF with rate pOneSiteIsHet(theta).
  #
  # The closed-form CDF replaces a per-element numerical integration of the
  # density. integrate()'s default tolerance is about 1.2e-4, which is too
  # coarse for tail probabilities that are compared against thresholds as small
  # as 1e-6 (computed as 1 - this value).
  pexp(k, rate = pOneSiteIsHet(theta))
}
|
||||
|
||||
# Probability (over locations of x on the read) that a paired-end read ALREADY covering site x [with 2 mates of length L reading a fragment of length F] will ALSO cover site y (k bases downstream of x):
|
||||
#
|
||||
# If read 1 in mate spans [s1, e1] and read 2 spans [s2, e2], where length(read 1) = e1 - s1 + 1 = length(read 2) = e2 - s2 + 1 = L, then i = s2 - e1 - 1 [BY DEFINITION of i].
|
||||
# i == "insert size" is DEFINED AS: F - 2 * L
|
||||
#
|
||||
#
|
||||
# FOR i >= 0:
|
||||
#
|
||||
# Assume that read is equally likely to cover x at any of the 2L positions, so uniform probability of 1/2L at each of them.
|
||||
# P(read r covers (x,y) | r covers x, r = [L,i,L], distance(x,y) = k)
|
||||
# = sum_p=1^p=L {1/2L * 1{k <= L-p OR L-p+i+1 <= k <= 2L+i-p}} + sum_p=1^p=L {1/2L * 1{k <= L-p}}
|
||||
# = 1/2L * [2 * sum_p=1^p=L {1{k <= L-p}} + sum_p=1^p=L {1{L-p+i+1 <= k <= 2L+i-p}}]
|
||||
# = 1/2L * [2 * max(0, L-k) + max(0, min(L, max(0, k-i)) - max(0, k-i-L))]
|
||||
#
|
||||
#
|
||||
pPairedEndReadsOfSpecificFragmentCanCoverHetPairAtDistance <- function(L, F, k) {
  # Probability that a paired-end read (mates of length L, fragment of length F)
  # that already covers site x also covers site y, k bases downstream, with x
  # equally likely to fall at any covered position.
  # Vectorized over its arguments via pmin/pmax; stops if any fragment is < 1.
  if (min(F) < 1) {
    stop("Cannot have fragments of size < 1")
  }

  # if F < L, then set the effective read length to be F:
  readLength <- pmin(L, F)
  rawInsert <- F - 2 * readLength

  # If the insert is negative (mates overlap), then ASSUMING the overlapping
  # region is identical, pretend to have two reads of length L and L+i with no
  # insert between them. Otherwise both mates keep length L.
  firstMate <- readLength
  secondMate <- readLength + pmin(0, rawInsert)
  insert <- pmax(0, rawInsert)

  # Positions of x for which y lands within the same mate.
  sameMate <- pmax(0, firstMate - k) + pmax(0, secondMate - k)

  # Positions of x in the first mate for which y lands in the second mate.
  highestPosition <- pmin(firstMate, firstMate + secondMate + insert - k)
  lowestPositionMinusOne <- pmax(0, firstMate - k + insert)
  differentMates <- pmax(0, highestPosition - lowestPositionMinusOne)

  (sameMate + differentMates) / (firstMate + secondMate)
}
|
||||
|
||||
# Probability of having a fragment of size fragmentSize, where the fragment sizes are normally distributed with mean Fm and standard deviation Fs:
|
||||
pFragmentSize <- function(fragmentSize, Fm, Fs) {
  # Normal density of a fragment size, given mean Fm and standard deviation Fs.
  density <- dnorm(fragmentSize, Fm, Fs)
  density
}
|
||||
|
||||
# Probability (over locations of x on the read, and fragment sizes) that there could exist a paired-end read [with 2 mates of length L covering a fragment] covers both sites x and y (at distance k):
|
||||
# Integral_from_0^to_INFINITY { pFragmentSize(s, Fm, Fs) * pPairedEndReadsOfSpecificFragmentCanCoverHetPairAtDistance(L, s, k) ds }
|
||||
pFragmentsReadsCanCoverHetPairAtDistance <- function(L, k, Fm, Fs) {
  # Coverage probability for a het pair at distance k, averaged over the
  # fragment-size distribution Normal(Fm, Fs).
  if (Fs == 0) {
    # All fragments are of size exactly Fm:
    return(pPairedEndReadsOfSpecificFragmentCanCoverHetPairAtDistance(L, Fm, k))
  }

  # Density-weighted coverage probability for one fragment size s.
  weightedCoverage <- function(s) {
    pFragmentSize(s, Fm, Fs) * pPairedEndReadsOfSpecificFragmentCanCoverHetPairAtDistance(L, s, k)
  }

  # Integrate over +/- MAX_NUM_SD standard deviations around the mean.
  MAX_NUM_SD <- 10
  halfWidth <- MAX_NUM_SD * Fs
  smallestFragment <- max(1, Fm - halfWidth) # NOT meaningful to have fragment size < 1
  largestFragment <- Fm + halfWidth

  integrate(weightedCoverage, lower=smallestFragment, upper=largestFragment)$value
}
|
||||
|
||||
# Probability (over locations of x on the read, fragment sizes, and read depths) that there exist at least nReadsToPhase paired-end reads covering both sites x and y (at distance k):
|
||||
# = Sum_from_d=0^to_d=2*meanDepth { p(having d reads | poisson with meanDepth) * p(there are at least nReadsToPhase that succeed in phasing x,y | given d reads in total) }
|
||||
# p(having d reads | poisson with meanDepth) = dpois(d, meanDepth)
|
||||
# p(there are at least nReadsToPhase that succeed in phasing x,y | given d reads in total) = pbinom(nReadsToPhase - 1, d, pFragmentsReadsCanCoverHetPairAtDistance(L, k, Fm, Fs), lower.tail = FALSE)
|
||||
pDirectlyPhaseHetPairAtDistanceUsingDepth_SINGLE_k <- function(meanDepth, nReadsToPhase, L, k, Fm, Fs) {
  # Probability that at least nReadsToPhase reads cover both sites of a het pair
  # at a single distance k, with read depth ~ Poisson(meanDepth) and each read
  # covering the pair independently with the fragment-averaged probability.
  # The sum over depths is truncated to 0 .. 2 * meanDepth.
  THRESH <- 10^-8
  p <- pFragmentsReadsCanCoverHetPairAtDistance(L, k, Fm, Fs)

  # deal with numerical issues: snap values within THRESH of 1 or 0.
  if (abs(1 - p) < THRESH) {
    p <- 1
  } else if (abs(p) < THRESH) {
    p <- 0
  }

  depths <- 0:(2 * meanDepth)
  pDepth <- dpois(depths, meanDepth)
  pAtLeastNreadsToPhaseGivenDepth <- pbinom(nReadsToPhase - 1, depths, p, lower.tail = FALSE)
  sum(pDepth * pAtLeastNreadsToPhaseGivenDepth)
}
|
||||
|
||||
pDirectlyPhaseHetPairAtDistanceUsingDepth <- function(meanDepth, nReadsToPhase, L, k, Fm, Fs) {
  # Vectorized-over-k wrapper around the single-distance computation.
  phaseAtOneDistance <- function(dist) {
    pDirectlyPhaseHetPairAtDistanceUsingDepth_SINGLE_k(meanDepth, nReadsToPhase, L, dist, Fm, Fs)
  }
  Vectorize(phaseAtOneDistance)(k)
}
|
||||
|
||||
pDirectlyPhaseHetPairAndDistanceUsingDepth <- function(meanDepth, nReadsToPhase, L, k, theta, Fm, Fs) {
  # Joint term: probability of directly phasing a het pair at distance k,
  # multiplied by the density of the pair being at that distance.
  pPhaseGivenDistance <- pDirectlyPhaseHetPairAtDistanceUsingDepth(meanDepth, nReadsToPhase, L, k, Fm, Fs)
  distanceDensity <- pHetPairAtDistance(k, theta)
  pPhaseGivenDistance * distanceDensity
}
|
||||
|
||||
# Probability (over locations of x on the read, fragment sizes, read depths, and het-het distances) that there exist at least nReadsToPhase paired-end reads covering both sites x and y (where the distance between x and y is as per the geometric/exponential distribution):
|
||||
pDirectlyPhaseHetPair <- function(meanDepth, nReadsToPhase, L, theta, Fm, Fs) {
  # Marginalize the direct-phasing probability over the het-het distance by
  # integrating the joint term over [0, Inf). A non-"OK" integration status is
  # printed as a warning line; the estimate is returned regardless.

  # Although the real minimum distance starts with 1 (geometric distribution),
  # the exponential distribution approximation starts with 0:
  MIN_DISTANCE <- 0
  MAX_DISTANCE <- Inf

  jointOverDistance <- function(k) {
    pDirectlyPhaseHetPairAndDistanceUsingDepth(meanDepth, nReadsToPhase, L, k, theta, Fm, Fs)
  }
  iRes <- integrate(jointOverDistance, lower=MIN_DISTANCE, upper=MAX_DISTANCE,
                    subdivisions=1000, stop.on.error = FALSE)

  integrationOk <- iRes$message == "OK"
  if (!integrationOk) {
    print(paste("DISTANCE INTEGRATION WARNING: ", iRes$message, sep=""))
  }
  iRes$value
}
|
||||
|
||||
# Probability (over locations of sites on reads, fragment sizes, and read depths) that paired-end reads can TRANSITIVELY phase phaseIndex relative to phaseIndex - 1, given a window of length(windowDistances)+1 het sites at distances given by windowDistances (where an edge in the transitive path requires at least nReadsToPhase reads):
|
||||
pPhaseHetPairAtDistanceUsingDepthAndWindow <- function(windowDistances, phaseIndex, meanDepth, nReadsToPhase, L, Fm, Fs, MIN_PATH_PROB = 10^-6) {
  # Probability that het site phaseIndex can be phased relative to site
  # phaseIndex - 1 through ANY chain of directly-phased pairs inside a window
  # of n = length(windowDistances) + 1 het sites.
  #
  # windowDistances[i] is the distance between sites i and i+1.
  # Paths whose running probability drops below MIN_PATH_PROB count as 0.
  n <- length(windowDistances) + 1 # the window size
  if (phaseIndex < 2 || phaseIndex > n) {
    stop("phaseIndex < 2 || phaseIndex > n")
  }

  # A. Pre-compute the symmetric matrix of direct-phasing probabilities for
  #    every pair of sites in the window.
  #    NOTE that the probabilities of phasing different pairs are NOT truly
  #    independent, but assume this for convenience...
  pPhasePair <- matrix(data = 0, nrow = n, ncol = n)
  for (i in seq_len(n - 1)) {
    for (j in (i + 1):n) {
      pairDistance <- distanceBetweenPair(i, j, windowDistances)
      pPhaseIandJ <- pDirectlyPhaseHetPairAtDistanceUsingDepth(meanDepth, nReadsToPhase, L, pairDistance, Fm, Fs)
      pPhasePair[i, j] <- pPhaseIandJ
      pPhasePair[j, i] <- pPhaseIandJ
    }
  }

  # B. Consider ALL possible paths from phaseIndex - 1 ---> phaseIndex:
  #    every ordering of every subset of the other sites may serve as the
  #    intermediate nodes. There are sum_i=0^to_n-2 {n-2 CHOOSE i * i!} such paths.
  startNode <- phaseIndex - 1
  endNode <- phaseIndex

  nodesBefore <- if (startNode > 1) seq(from=1, to=startNode-1, by=1) else vector()
  nodesAfter <- if (endNode < n) seq(from=endNode+1, to=n, by=1) else vector()
  possibleIntermediateNodes <- c(nodesBefore, nodesAfter)

  # Accumulate P(no path phases); the paths are treated as independent.
  pWindowNotPhasing <- 1
  library(gtools)
  for (chosen in powerSet(length(possibleIntermediateNodes))) {
    intermediates <- possibleIntermediateNodes[chosen]

    if (length(intermediates) == 0) {
      orderings <- c()
    } else {
      orderings <- permutations(length(intermediates), length(intermediates), v=intermediates)
    }
    # Add on the start and the end (one path per row):
    paths <- cbind(startNode, orderings, endNode)

    for (row in 1:nrow(paths)) {
      path <- paths[row,]
      pSpecificPathPhases <- 1
      for (step in seq_len(length(path) - 1)) {
        pSpecificPathPhases <- pSpecificPathPhases * pPhasePair[path[step], path[step+1]]
        # Do a "bounded" calculation [any path that is ALREADY of low probability can be discarded]:
        if (pSpecificPathPhases < MIN_PATH_PROB) {
          pSpecificPathPhases <- 0
          break
        }
      }
      pWindowNotPhasing <- pWindowNotPhasing * (1 - pSpecificPathPhases)
    }
  }

  1 - pWindowNotPhasing
}
|
||||
|
||||
# distance(i,j) = distance(i,i+1) + ... + distance(j-1,j), where distance(i,i+1) is given by windowDistances(i):
|
||||
distanceBetweenPair <- function(i, j, windowDistances) {
  # Distance between het sites i and j of a window, where windowDistances[m] is
  # the distance between sites m and m+1. The order of i and j does not matter.
  # Stops if either index falls outside 1 .. length(windowDistances) + 1.
  # Returns 0 when i == j.
  if (i > j) {
    tmp <- i
    i <- j
    j <- tmp
  }
  if (i < 1 || j > length(windowDistances) + 1) {
    stop(paste(i, " = i < 1 || ", j, " = j > length(windowDistances) + 1 = ", length(windowDistances) + 1, sep=""))
  }

  # seq(length.out = j - i) yields an empty index when i == j, so the sum is 0.
  # The i:(j-1) form counts DOWN in that case and adds up two unrelated entries.
  sum(windowDistances[seq(from = i, length.out = j - i)])
}
|
||||
|
||||
# n = size of set for which power set is to be returned
|
||||
powerSet <- function(n) {
  # Enumerate every subset of {1, ..., n} as a list of 2^n integer index vectors.
  #
  # Subset number i (0-based) contains position p exactly when the p-th binary
  # digit of i, counted from the MOST significant of n digits, is 1. So for
  # n = 2 the order is {}, {2}, {1}, {1, 2}.
  #
  # Implemented with base integer arithmetic, so no add-on package has to be
  # attached on each call; enumeration order and element types are unchanged.
  placeValues <- 2^(rev(seq_len(n)) - 1) # value of each digit, most significant first
  lapply(seq(from=0, to=(2^n)-1, by=1), function(i) {
    which((i %/% placeValues) %% 2 == 1)
  })
}
|
||||
|
||||
pPhaseHetPairAndDistancesUsingDepthAndWindow <- function(windowDistances, phaseIndex, meanDepth, nReadsToPhase, L, Fm, Fs, theta) {
  # Joint term for one window configuration: probability of (transitively)
  # phasing site phaseIndex given the window distances, multiplied by the
  # density of observing exactly those distances.
  pPhaseGivenWindow <- pPhaseHetPairAtDistanceUsingDepthAndWindow(windowDistances, phaseIndex, meanDepth, nReadsToPhase, L, Fm, Fs)
  windowDensity <- pHetPairsAtDistances(windowDistances, theta)
  pPhaseGivenWindow * windowDensity
}
|
||||
|
||||
# Probability (over locations of sites on reads, fragment sizes, and read depths) that paired-end reads can
# TRANSITIVELY phase phaseIndex relative to phaseIndex - 1, given a window of n het sites at distances
# distributed as determined by theta (where an edge in the transitive path requires at least nReadsToPhase reads).
#
# Computed by Monte Carlo integration of pPhaseHetPairAndDistancesUsingDepthAndWindow() over the
# n-1 distances between consecutive sites, each ranging over [MIN_DISTANCE, MAX_DISTANCE].
# Stops if n < 2.
pDirectlyPhaseHetPairUsingWindow <- function(meanDepth, nReadsToPhase, L, theta, Fm, Fs, n, phaseIndex) {
    if (n < 2) {
        stop("n < 2")
    }
    # n sites are separated by n-1 distances; those are the integration variables
    ndim = n-1

    integrandFunction <- function(windowDistances) {
        pPhaseHetPairAndDistancesUsingDepthAndWindow(windowDistances, phaseIndex, meanDepth, nReadsToPhase, L, Fm, Fs, theta)
    }

    MIN_DISTANCE = 0

    # Truncate the (unbounded) distance range: start at 7500 and keep doubling until the
    # probability of a het-het distance beyond MAX_DISTANCE is at most MAX_TAIL_PROB.
    # [For theta = 10^-3, only 3e-07 of the het-het pairs are at a distance > 7500]
    MAX_TAIL_PROB = 10^-6
    MAX_DISTANCE = 7500
    repeat {
        tailProb = 1 - pHetPairLteDistance(MAX_DISTANCE, theta)
        if (!(tailProb > MAX_TAIL_PROB)) {
            break
        }
        MAX_DISTANCE = MAX_DISTANCE * 2
    }

    lower = rep(MIN_DISTANCE, ndim)
    upper = rep(MAX_DISTANCE, ndim)

    # Number of Monte Carlo samples grows quadratically with the dimension
    N = 10^4 * ndim^2
    high_dimensional_integrate(ndim, lower, upper, integrandFunction, N, DEBUG = TRUE, PRINT_EVERY = 10^2)
}
|
||||
|
||||
# Use the simplest version of the Monte Carlo method to integrate over a high-dimensional function:
# draw N points uniformly from the box [lower, upper] and return (box volume) * (mean integrand value).
#
# ndim:             number of dimensions (>= 1)
# lower, upper:     vectors of length ndim with the per-dimension integration bounds
# integrandFunction: called with one numeric vector of length ndim, returns the integrand value
# N:                number of random samples (>= 1)
# DEBUG:            if TRUE, print the running estimate at iterations 1, PRINT_EVERY+1, 2*PRINT_EVERY+1, ...
#
# Stops on ndim < 1, N < 1, or bounds whose length differs from ndim (these previously
# produced NA samples, a wrong volume, or loops that counted down).
high_dimensional_integrate <- function(ndim, lower, upper, integrandFunction, N = 10^4, DEBUG = FALSE, PRINT_EVERY = 10^3) {
    if (ndim < 1 || length(lower) != ndim || length(upper) != ndim) {
        stop(paste("high_dimensional_integrate: require ndim >= 1 and length(lower) == length(upper) == ndim; got ndim= ", ndim, ", length(lower)= ", length(lower), ", length(upper)= ", length(upper), sep=""))
    }
    if (N < 1) {
        stop(paste("high_dimensional_integrate: require N >= 1; got N= ", N, sep=""))
    }

    rectangularVolume = prod(upper - lower)

    total = 0
    for (i in seq_len(N)) {
        # One uniform draw per dimension, each within its own [lower[j], upper[j]]
        randVals = runif(ndim, min=lower, max=upper)

        total = total + integrandFunction(randVals)

        if (DEBUG && (i-1) %% PRINT_EVERY == 0) {
            estimate = rectangularVolume * (total / i)
            print(paste("high_dimensional_integrate: iteration ", i, ", estimate= ", estimate, sep=""))
        }
    }
    rectangularVolume * (total / N)
}
|
||||
|
||||
# Index of the middle position of a window with windowSize positions:
# floor(windowSize/2) + 1, e.g. 2 -> 2, 3 -> 2, 10 -> 6.
middleOfWindowIndex <- function(windowSize) {
    halfWindow = windowSize %/% 2
    halfWindow + 1
}
|
||||
|
|
@ -1,47 +0,0 @@
|
|||
# For every consecutive pair of positions (i, i+1) and every window size in 2..MAX_WINDOW_SIZE,
# computes pPhaseHetPairAtDistanceUsingDepthAndWindow() of phasing position i+1 relative to i
# when the pair is embedded in a window of (up to) that many neighbouring positions.
#
# distances:       distances between consecutive positions (N-1 entries for N positions)
# MAX_WINDOW_SIZE: largest window size to try (>= 2)
# meanDepth, nReadsToPhase, L, Fm, Fs: forwarded to pPhaseHetPairAtDistanceUsingDepthAndWindow()
# FILE_NAME:       if not NULL, all local variables are saved to <FILE_NAME>.RData after each position
#
# Returns list(phaseProbsPositionWindow = matrix[position, window size index], WINDOW_SIZES = 2:MAX_WINDOW_SIZE).
# Local variable names are deliberately kept as they are: they are what ends up in the .RData checkpoint.
calcPhasingProbsForWindowDistances <- function(distances, MAX_WINDOW_SIZE, meanDepth, nReadsToPhase, L, Fm, Fs, FILE_NAME = NULL) {
    # 2:MAX_WINDOW_SIZE would count DOWN (2, 1, ...) for smaller values
    if (MAX_WINDOW_SIZE < 2) {
        stop("MAX_WINDOW_SIZE < 2")
    }
    WINDOW_SIZES = 2:MAX_WINDOW_SIZE

    phaseProbsPositionWindow = matrix(data = NA, nrow=length(distances), ncol=length(WINDOW_SIZES))

    # seq_along (not 1:length) so that an empty distances vector yields an empty result
    for (i in seq_along(distances)) {
        # Try to phase (i+1)-st position [relative to i] using varying window sizes:
        for (j in seq_along(WINDOW_SIZES)) {
            windowSize = WINDOW_SIZES[j]
            remainingSize = windowSize - 2 # exclude i, i+1

            # Number of positions available to the left of i and to the right of i+1
            numOnLeft = i - 1
            numOnRight = (length(distances) + 1) - (i + 2) + 1

            # Split the remaining window between both sides, first serving the side with
            # fewer available positions and giving whatever is left to the other side
            if (numOnLeft <= numOnRight) {
                halfToUse = floor(remainingSize / 2) # skimp on the left [floor], and be generous with the right side
                useOnLeft = min(halfToUse, numOnLeft)
                useOnRight = min(remainingSize - useOnLeft, numOnRight)
            }
            else {
                halfToUse = ceiling(remainingSize / 2) # be generous with the right side [ceiling]
                useOnRight = min(halfToUse, numOnRight)
                useOnLeft = min(remainingSize - useOnRight, numOnLeft)
            }
            startInd = i - useOnLeft # go left from position i
            stopInd = i + 1 + useOnRight # go right from position i + 1

            usePositionRange = seq(from=startInd, to=stopInd, by=1)
            useDistancesRange = seq(from=startInd, to=stopInd-1, by=1) # since there are N-1 distances between N consecutive positions

            # Index of position i+1 within the window
            phaseIndex = which(usePositionRange == i+1)
            if (length(phaseIndex) != 1) stop("NO phaseIndex!")
            windowDistances = distances[useDistancesRange]

            print(paste("Try to phase position ", i+1, " [relative to ", i, "] using positions: (", paste(usePositionRange, collapse=", "), "), windowDistances= (", paste(windowDistances, collapse=", "), "), [phaseIndex= ", phaseIndex, ", i=", i, "]", sep=""))
            p = pPhaseHetPairAtDistanceUsingDepthAndWindow(windowDistances, phaseIndex, meanDepth, nReadsToPhase, L, Fm, Fs)
            print(paste("phase prob: ", p, sep=""))
            phaseProbsPositionWindow[i, j] = p
        }

        # Checkpoint all local state after each position
        if (!is.null(FILE_NAME)) {
            save(list = ls(all=TRUE), file = paste(FILE_NAME, ".RData", sep=""))
        }
    }

    list(phaseProbsPositionWindow=phaseProbsPositionWindow, WINDOW_SIZES=WINDOW_SIZES)
}
|
||||
|
|
@ -1,54 +0,0 @@
|
|||
# Smoke test: evaluates each phasing-probability function once on fixed parameters and
# prints "<value> = <call with its arguments>" for manual inspection.

# Uncomment to turn warnings into errors / enter the debugger on error:
#options(warn=2)
#options(error=recover)

# Prints one result line in the form "<value> = <callDescription>".
printResult <- function(value, callDescription) {
    print(paste(value, " = ", callDescription, sep=""))
}

# Integrating the standard normal density over [-200, 0] checks the Monte Carlo integrator
HALF = high_dimensional_integrate(1, -200, 0, dnorm)
print(paste("Should be ~ HALF: ", HALF, sep=""))


k = 75
#theta = 10^-2
theta = 10^-3

p = pHetPairLteDistance(k, theta)
printResult(p, paste("pHetPairLteDistance(k= ", k, ", theta= ", theta, ")", sep=""))


L = 76
fragmentSize = 452

p = pPairedEndReadsOfSpecificFragmentCanCoverHetPairAtDistance(L, fragmentSize, k)
printResult(p, paste("pPairedEndReadsOfSpecificFragmentCanCoverHetPairAtDistance(L= ", L, ", fragmentSize= ", fragmentSize, ", k= ", k, ")", sep=""))


Fm = 392
Fs = 44

p = pFragmentSize(300, Fm, Fs)
printResult(p, paste("pFragmentSize(300, Fm= ", Fm, ", Fs= ", Fs, ")", sep=""))

p = pFragmentsReadsCanCoverHetPairAtDistance(L, k, Fm, Fs)
printResult(p, paste("pFragmentsReadsCanCoverHetPairAtDistance(L= ", L, ", k= ", k, ", Fm= ", Fm, ", Fs= ", Fs, ")", sep=""))


meanDepth = 65
nReadsToPhase = 1

p = pDirectlyPhaseHetPairAtDistanceUsingDepth(meanDepth, nReadsToPhase, L, k, Fm, Fs)
printResult(p, paste("pDirectlyPhaseHetPairAtDistanceUsingDepth(meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", L= ", L, ", k= ", k, ", Fm= ", Fm, ", Fs= ", Fs, ")", sep=""))

p = pDirectlyPhaseHetPair(meanDepth, nReadsToPhase, L, theta, Fm, Fs)
printResult(p, paste("pDirectlyPhaseHetPair(meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", L= ", L, ", theta= ", theta, ", Fm= ", Fm, ", Fs= ", Fs, ")", sep=""))


windowDistances = c(100, 100, 100, 100, 100)
phaseIndex = 2
p = pPhaseHetPairAtDistanceUsingDepthAndWindow(windowDistances, phaseIndex, meanDepth, nReadsToPhase, L, Fm, Fs)
printResult(p, paste("pPhaseHetPairAtDistanceUsingDepthAndWindow(windowDistances= (", paste(windowDistances, collapse=", "), "), phaseIndex= ", phaseIndex, ", meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", L= ", L, ", Fm= ", Fm, ", Fs= ", Fs, ")", sep=""))


# Show the last error's call stack and any accumulated warnings
traceback()
warnings()
|
||||
|
|
@ -1,13 +0,0 @@
|
|||
# Evaluates pHetPairLteDistance() on a grid of intra-het distances for a fixed theta,
# plots the resulting cumulative frequencies and saves the whole workspace.
# Variable names are part of the saved .RData and are therefore kept stable.
theta = 10^-3
params = paste0("theta= ", theta)

# Distance grid: MIN_DIST, MIN_DIST+BY_DIST, ... (not exceeding MAX_DIST+BY_DIST)
MIN_DIST = 1
BY_DIST = 10
MAX_DIST = 10^4
DISTANCES = seq(from=MIN_DIST, to=MAX_DIST+BY_DIST, by=BY_DIST)

freqAtLteDist = pHetPairLteDistance(DISTANCES, theta)

scatter(DISTANCES, freqAtLteDist, "intraHetDistancesDistrib", xlab="Intra-het distance", ylab="Cumulative Frequency", log="x", main=params)

save(list = ls(all=TRUE), file = "intraHetDistancesDistrib.RData")
|
||||
|
|
@ -1,39 +0,0 @@
|
|||
# Sweeps pDirectlyPhaseHetPair() over mean depths 0..65 for several read lengths,
# plots one curve per read length and saves the whole workspace.
# Variable names are part of the saved .RData and are therefore kept stable.
theta = 10^-3

Fs = 44
Fm_BASE = 392 - 2 * 101 # The mean insert size == 190

nReadsToPhase = 1

params = paste0("nReadsToPhase= ", nReadsToPhase, ", theta= ", theta, ", Fm_BASE= ", Fm_BASE, ", Fs= ", Fs)


MEAN_DEPTHS = 0:65
NUM_DEPTHS = length(MEAN_DEPTHS)

# Longest read length first
READ_LENGTHS = rev(c(18, 36, 76, 101, 125, 150, 175, 200, 400, 800, 1000))
NUM_READ_LENGTHS = length(READ_LENGTHS)

# One x-vector, y-vector and legend entry per read length
depthsX = list()
depthsY = list()
depthsLeg = vector()

for (i in 1:NUM_READ_LENGTHS) {
    # Mean fragment size = mean insert size + both reads
    Fm = Fm_BASE + 2 * READ_LENGTHS[i]
    pPhaseDepth = rep(-1, NUM_DEPTHS)
    for (j in 1:NUM_DEPTHS) {
        pPhaseDepth[j] = pDirectlyPhaseHetPair(MEAN_DEPTHS[j], nReadsToPhase, READ_LENGTHS[i], theta, Fm, Fs)
    }
    depthsX[i] = list(MEAN_DEPTHS)
    depthsY[i] = list(pPhaseDepth)
    depthsLeg[i] = paste0("L= ", READ_LENGTHS[i])
}

scatter(depthsX, depthsY, "testDepths", xlab="Mean depth", ylab="Phaseability", main=params, leg=depthsLeg, legPos="topleft", width=14, height=7, type="b")

save(list = ls(all=TRUE), file = "testDepths.RData")
|
||||
|
|
@ -1,47 +0,0 @@
|
|||
# Sweeps pDirectlyPhaseHetPair() over a grid of fragment-size means and standard deviations,
# draws the grid as a heat map and as a 3-D scatter plot (testFragments.pdf), reports the
# best combination and saves the whole workspace.
theta = 10^-3

L = 101

meanDepth = 65
nReadsToPhase = 1

params = paste0("meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", L= ", L, ", theta= ", theta)


MEAN_SIZES = seq(1,2000,20)
STD_SIZES = seq(0,200,5)

# Rows: mean fragment sizes; columns: standard deviations
testFragments = matrix(nrow=length(MEAN_SIZES), ncol=length(STD_SIZES))
for (i in 1:length(MEAN_SIZES)) {
    test_mean_fragment_size = MEAN_SIZES[i]
    print(paste0("test_mean_fragment_size: ", test_mean_fragment_size))
    for (j in 1:length(STD_SIZES)) {
        test_std_fragment_size = STD_SIZES[j]
        print(paste0("test_std_fragment_size: ", test_std_fragment_size))

        testFragments[i,j] = pDirectlyPhaseHetPair(meanDepth, nReadsToPhase, L, theta, test_mean_fragment_size, test_std_fragment_size)
    }
}


pdf('testFragments.pdf')

library(gplots)
heatmap.2(testFragments, ylab = "Mean fragment size", xlab = "Standard deviation fragment size", labRow = MEAN_SIZES, labCol = STD_SIZES, Rowv = NA, Colv = NA, dendrogram = "none", scale="none", revC = FALSE, density.info="none", trace="none", main=params)

library(scatterplot3d)
# Flatten the grid row by row: each mean is repeated once per standard deviation
xMeans = rep(MEAN_SIZES, each = length(STD_SIZES))
yStds = rep(STD_SIZES, times = length(MEAN_SIZES))
zPhaseRate = as.vector(t(testFragments))
scatterplot3d(xMeans, yStds, zPhaseRate, xlab = "Mean fragment size", ylab = "Standard deviation fragment size", zlab = "Phasing rate", main=params)

bestCombo = which.max(zPhaseRate)
print(paste("For ", params, ", BEST choice gives phaseability of ", zPhaseRate[bestCombo], " using mean fragment = ", xMeans[bestCombo], ", std. fragment = ", yStds[bestCombo], sep = ""))
dev.off()


save(list = ls(all=TRUE), file = "testFragments.RData")
|
||||
|
|
@ -1,25 +0,0 @@
|
|||
# Sweeps pDirectlyPhaseHetPair() over a range of mean intra-het distances (converted to theta
# by meanIntraHetDistanceToTheta), plots the curve and saves the whole workspace.
# Variable names are part of the saved .RData and are therefore kept stable.
L = 101

Fm = 392
Fs = 44

meanDepth = 65
nReadsToPhase = 1

params = paste0("meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", L= ", L, ", Fm= ", Fm, ", Fs= ", Fs)


MEAN_INTRA_HET_DISTANCES = seq(from=2, to=20002, by=50)
THETAS = meanIntraHetDistanceToTheta(MEAN_INTRA_HET_DISTANCES)
NUM_THETAS = length(THETAS)

# -1 marks entries that have not been computed
pPhaseTheta = rep(-1, NUM_THETAS)
for (i in 1:NUM_THETAS) {
    pPhaseTheta[i] = pDirectlyPhaseHetPair(meanDepth, nReadsToPhase, L, THETAS[i], Fm, Fs)
}
scatter(MEAN_INTRA_HET_DISTANCES, pPhaseTheta, "testIntraHetDistances", xlab="Mean intra-het distance", ylab="Phaseability", main=params, type="b")

save(list = ls(all=TRUE), file = "testIntraHetDistances.RData")
|
||||
|
|
@ -1,24 +0,0 @@
|
|||
# Sweeps pDirectlyPhaseHetPair() over read lengths 30..1000 (mean fragment size grows with
# the read length), plots the curve and saves the whole workspace.
theta = 10^-3

Fs = 44
Fm_BASE = 392 - 2 * 101 # The mean insert size == 190

meanDepth = 65
nReadsToPhase = 1

params = paste0("meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", theta= ", theta, ", Fm_BASE= ", Fm_BASE, ", Fs= ", Fs)


READ_LENGTHS = seq(from=30, to=1000, by=10)
NUM_READ_LENGTHS = length(READ_LENGTHS)

# -1 marks entries that have not been computed
pPhaseReadLength = rep(-1, NUM_READ_LENGTHS)
for (i in 1:NUM_READ_LENGTHS) {
    # Mean fragment size = mean insert size + both reads
    Fm = Fm_BASE + 2 * READ_LENGTHS[i]
    pPhaseReadLength[i] = pDirectlyPhaseHetPair(meanDepth, nReadsToPhase, READ_LENGTHS[i], theta, Fm, Fs)
}
scatter(READ_LENGTHS, pPhaseReadLength, "testReadLengths", xlab="Read length", ylab="Phaseability", main=params, type="b")

save(list = ls(all=TRUE), file = "testReadLengths.RData")
|
||||
|
|
@ -1,19 +0,0 @@
|
|||
# Evaluates pDirectlyPhaseHetPairAtDistanceUsingDepth() at every intra-het distance 0..1000,
# plots the curve and saves the whole workspace.
# Variable names are part of the saved .RData and are therefore kept stable.
L = 101

Fm = 392
Fs = 44

meanDepth = 65
nReadsToPhase = 1

params = paste0("meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", L= ", L, ", Fm= ", Fm, ", Fs= ", Fs)


DISTANCES = 0:1000
# The whole distance vector is passed in a single call
pPhaseHetPairAtDistWithRead = pDirectlyPhaseHetPairAtDistanceUsingDepth(meanDepth, nReadsToPhase, L, DISTANCES, Fm, Fs)

scatter(DISTANCES, pPhaseHetPairAtDistWithRead, "testSpecificDistances", xlab="Intra-het distance", ylab="Phaseability", main=params)

save(list = ls(all=TRUE), file = "testSpecificDistances.RData")
|
||||
|
|
@ -1,8 +0,0 @@
|
|||
# Plots pPairedEndReadsOfSpecificFragmentCanCoverHetPairAtDistance() for read length L and
# het-pair distance k over fragment sizes 2*L .. 2*L+100.
L = 76
k = 75
params = paste0("L= ", L, ", k= ", k)

FRAGMENT_SIZES = 2 * L + 0:100
pCoverHetPairWithRead = pPairedEndReadsOfSpecificFragmentCanCoverHetPairAtDistance(L, FRAGMENT_SIZES, k)

scatter(FRAGMENT_SIZES, pCoverHetPairWithRead, "testSpecificFragments", xlab="Fragment size", ylab="Probability of covering het pair", main=params)
|
||||
|
|
@ -1,32 +0,0 @@
|
|||
# Computes pDirectlyPhaseHetPairUsingWindow() for window sizes 2..MAX_WINDOW_SIZE, phasing the
# middle position of each window, and plots the results. The workspace is saved after every
# window size, so partial results (entries still at -1 are not done) survive an interrupted run.
theta = 10^-3

Fm = 392
Fs = 44

L = 101

meanDepth = 65
nReadsToPhase = 1

params = paste0("meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", L= ", L, ", theta= ", theta, ", Fm= ", Fm, ", Fs= ", Fs)

# Uncomment to turn warnings into errors / enter the debugger on error:
#options(warn=2)
#options(error=recover)

MAX_WINDOW_SIZE = 10

WINDOW_SIZES = 2:MAX_WINDOW_SIZE
NUM_WINDOW_SIZES = length(WINDOW_SIZES)

# -1 marks window sizes that have not been computed yet
pPhaseWindow = rep(-1, NUM_WINDOW_SIZES)
for (i in 1:NUM_WINDOW_SIZES) {
    n = WINDOW_SIZES[i]
    phaseIndex = middleOfWindowIndex(n)
    pPhaseWindow[i] = pDirectlyPhaseHetPairUsingWindow(meanDepth, nReadsToPhase, L, theta, Fm, Fs, n, phaseIndex)

    # Checkpoint after every window size
    save(list = ls(all=TRUE), file = "testWindows.RData")
}

scatter(WINDOW_SIZES, pPhaseWindow, "testWindows", xlab="Window size", ylab="Phaseability", main=params, type="b")
|
||||
|
|
@ -1,28 +0,0 @@
|
|||
# Samples MAX_NUM_DISTS intra-het distances from the theoretical distribution (theta), computes
# the phasing probability of every consecutive pair for window sizes 2..MAX_WINDOW_SIZE, and
# plots the mean phasing probability per window size.
L = 101

Fm = 392
Fs = 44

meanDepth = 65
nReadsToPhase = 1

theta = 10^-3

params = paste("meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", theta= ", theta, ", L= ", L, ", Fm= ", Fm, ", Fs= ", Fs, sep="")


MAX_NUM_DISTS = 10^4
distances = sampleIntraHetDistances(MAX_NUM_DISTS, theta)
print(paste("Using ", MAX_NUM_DISTS, " THEORETICAL distances...", sep=""))


MAX_WINDOW_SIZE = 10
# Also used as the checkpoint file name (<FILE_NAME>.RData) inside calcPhasingProbsForWindowDistances
FILE_NAME = "theoretical_window"

phaseWindowResult = calcPhasingProbsForWindowDistances(distances, MAX_WINDOW_SIZE, meanDepth, nReadsToPhase, L, Fm, Fs, FILE_NAME)
phaseProbsPositionWindow = phaseWindowResult$phaseProbsPositionWindow
WINDOW_SIZES = phaseWindowResult$WINDOW_SIZES

# Average over positions, one value per window size
phaseProbsWindow = colMeans(phaseProbsPositionWindow)

# The distances here are sampled from the theoretical distribution, not empirical ones
# (the axis label used to be a copy of the one in the empirical-distances script)
scatter(WINDOW_SIZES, phaseProbsWindow, FILE_NAME, xlab="Window size", ylab="Mean theoretical phasing rate on sampled theoretical distances", main=params, type="b")
|
||||
|
|
@ -1,30 +0,0 @@
|
|||
# Reads empirical intra-het distances (at most the first MAX_NUM_DISTS of them), computes the
# theoretical phasing probability of every consecutive pair for window sizes 2..MAX_WINDOW_SIZE,
# and plots the mean phasing probability per window size.
L = 101

Fm = 392
Fs = 44

meanDepth = 65
nReadsToPhase = 1

params = paste0("meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", L= ", L, ", Fm= ", Fm, ", Fs= ", Fs)


# One numeric distance per record
distances = scan("~fromer/storage/phase.NA12878/COMPLETE_LIST.het_distances.txt", what=list(dist=0))
distances = distances$dist

MAX_NUM_DISTS = 10^4
NUM_DISTS_TO_USE = min(MAX_NUM_DISTS, length(distances))
distances = distances[1:NUM_DISTS_TO_USE]
print(paste0("Using ", NUM_DISTS_TO_USE, " EMPIRICAL distances..."))


MAX_WINDOW_SIZE = 10
# Also used as the checkpoint file name (<FILE_NAME>.RData) inside calcPhasingProbsForWindowDistances
FILE_NAME = "theoretical_window_on_empirical"

phaseWindowResult = calcPhasingProbsForWindowDistances(distances, MAX_WINDOW_SIZE, meanDepth, nReadsToPhase, L, Fm, Fs, FILE_NAME)
WINDOW_SIZES = phaseWindowResult$WINDOW_SIZES
phaseProbsPositionWindow = phaseWindowResult$phaseProbsPositionWindow

# Average over positions, one value per window size
phaseProbsWindow = colMeans(phaseProbsPositionWindow)

scatter(WINDOW_SIZES, phaseProbsWindow, FILE_NAME, xlab="Window size", ylab="Mean theoretical phasing rate on empirical distances", main=params, type="b")
|
||||
|
|
@ -1,181 +0,0 @@
|
|||
# Overlays empirical phasing results with the theoretical curves stored in the *.RData files
# written by the theoretical sweep scripts. Every section load()s one workspace, which brings
# that script's variables (e.g. DISTANCES, READ_LENGTHS, WINDOW_SIZES) into scope and may
# overwrite same-named variables here -- so the order of statements around load() matters.

NUM_chr1_HET_SITES = as.integer(system("grep -c 'chr1:' ~fromer/storage/phase.NA12878/COMPLETE_LIST.het_sites.interval_list", intern=TRUE))
NUM_chr1_PHASEABLE_HET_SITES = NUM_chr1_HET_SITES - 1 # since can't phase the first het site

# Window sizes for which empirical results are read
#USE_EMPIRICAL_WINDOWS = c(10, 2)
USE_EMPIRICAL_WINDOWS = c(2)

TWO_COLORS = c("red", "darkgreen")


######################################################################
# Phasing as a function of SPECIFIC intra-het distances:
######################################################################
load("testSpecificDistances.RData")

MAX_DISTANCE = 10^3
PQ_PHASING_THRESH = 10.0

distances = list()
phaseRateDistances = list()
distancesLeg = vector()

for (nextIndex in 1:length(USE_EMPIRICAL_WINDOWS)) {
    n = USE_EMPIRICAL_WINDOWS[nextIndex]
    n_locDistancePQReadsWindow <- scan(paste("~fromer/storage/phase.NA12878/phase_all_chr.n_", n, ".NA12878", ".locus_distance_PQ_numReads_windowSize.txt", sep=""), what=list(loci="", distance=0, PQ=0, reads=0, window=0))
    n_PQ <- n_locDistancePQReadsWindow$PQ
    n_distance <- n_locDistancePQReadsWindow$distance

    # Distinct observed distances, ascending, capped at MAX_DISTANCE
    distanceVector = sort(unique(n_distance))
    distanceVector = distanceVector[distanceVector <= MAX_DISTANCE]
    numDists = length(distanceVector)

    phasedFractionVector = rep(-1, numDists)

    print(paste0("numDists= ", numDists))
    print(paste(distanceVector, collapse=", "))

    for (i in 1:numDists) {
        d = distanceVector[i]
        print(paste0("d= ", d))

        # Fraction of sites at exactly this distance whose PQ reaches the threshold
        dInds = which(n_distance == d)
        phasedFractionVector[i] = sum(n_PQ[dInds] >= PQ_PHASING_THRESH, na.rm=TRUE) / length(dInds)
    }

    distances[nextIndex] = list(distanceVector)
    phaseRateDistances[nextIndex] = list(phasedFractionVector)
    distancesLeg[nextIndex] = paste0("HiSeq (window = ", n, ")")
}

# Append the theoretical curve after the empirical ones
nextIndex = nextIndex+1
distances[nextIndex] = list(DISTANCES)
phaseRateDistances[nextIndex] = list(pPhaseHetPairAtDistWithRead)
distancesLeg[nextIndex] = "Theoretical (window = 2)" # params

scatter(distances, phaseRateDistances, "specific_distances.theoretical_empirical", xlab="Intra-het distance", ylab="Phaseability", leg=distancesLeg, legPos="topright", width=14, height=7, type="b", col=TWO_COLORS)


######################################################################
# Phasing as a function of depth:
######################################################################
load("testDepths.RData")

depths = list()
phaseRateDepths = list()
depthsLeg = vector()

for (nextIndex in 1:length(USE_EMPIRICAL_WINDOWS)) {
    n = USE_EMPIRICAL_WINDOWS[nextIndex]
    RGdocPhasedConsistentSwitch = scan(paste("~fromer/storage/downsampled_phasing.NA12878.HiSeq/RG.DoC_phased_consistent_switch.chr1.n_", n, ".txt", sep=""), what=list(RGdoc=0, phased=0, consistentPhased=0, switch=0.0))
    depths[nextIndex] = list(RGdocPhasedConsistentSwitch$RGdoc)
    # Phased counts are normalized by the number of phaseable chr1 het sites
    phaseRateDepths[nextIndex] = list(RGdocPhasedConsistentSwitch$phased / NUM_chr1_PHASEABLE_HET_SITES)
    depthsLeg[nextIndex] = paste0("Down-sampled HiSeq (window = ", n, ")")
}

# Append the theoretical curve for read length 101
nextIndex = nextIndex+1
useLength = which(READ_LENGTHS == 101)
depths[nextIndex] = depthsX[useLength]
phaseRateDepths[nextIndex] = depthsY[useLength]
depthsLeg[nextIndex] = "Theoretical (window = 2)" # params

scatter(depths, phaseRateDepths, "depths.theoretical_empirical", xlab="Mean depth", ylab="Phaseability", leg=depthsLeg, legPos="topleft", width=14, height=7, type="b", col=TWO_COLORS)


######################################################################
# Distribution of intra-het distances:
######################################################################
load("intraHetDistancesDistrib.RData")

empiricalIntraHetDistances = read.table("~fromer/storage/phase.NA12878/COMPLETE_LIST.het_distances.txt")$V1
# Clamp large distances to MAX_DIST so that every value falls inside the histogram breaks
empiricalIntraHetDistances[which(empiricalIntraHetDistances >= MAX_DIST)] = MAX_DIST

empiricalIntraHetDistancesHist = hist(empiricalIntraHetDistances, breaks=DISTANCES, plot=FALSE)
empiricalIntraHetDistancesCumulativeFrequencies = cumsum(empiricalIntraHetDistancesHist$counts) / length(empiricalIntraHetDistances)

scatter(list(empiricalIntraHetDistancesHist$mids, DISTANCES), list(empiricalIntraHetDistancesCumulativeFrequencies, freqAtLteDist), "intraHetDistancesDistrib.theoretical_empirical", xlab="Intra-het distance", ylab="Cumulative Frequency", log="x", leg=c("NA12878 HiSeq", "Theoretical"), legPos="topleft", type="b", col=TWO_COLORS)


######################################################################
# Phasing as a function of MEAN intra-het distance:
######################################################################
load("testIntraHetDistances.RData")

hetDistances = list()
phaseRateHetDistances = list()
hetDistancesLeg = vector()

for (nextIndex in 1:length(USE_EMPIRICAL_WINDOWS)) {
    n = USE_EMPIRICAL_WINDOWS[nextIndex]
    meanHetDistNumSitesPhasedConsistentSwitch = scan(paste("~fromer/storage/remove_het_sites.NA12878.HiSeq/meanHetDist_numSites_phased_consistent_switch.chr1.n_", n, ".txt", sep=""), what=list(meanHetDist=0.0, numSites=0, phased=0, consistentPhased=0, switch=0.0))

    hetDistances[nextIndex] = list(meanHetDistNumSitesPhasedConsistentSwitch$meanHetDist)
    # NOTE(review): the phased column is plotted as-is here (no division by a site count) -- confirm the file already holds a rate
    phaseRateHetDistances[nextIndex] = list(meanHetDistNumSitesPhasedConsistentSwitch$phased)
    hetDistancesLeg[nextIndex] = paste0("Removed hets from HiSeq (window = ", n, ")")
}

# Append the theoretical curve
nextIndex = nextIndex+1
hetDistances[nextIndex] = list(MEAN_INTRA_HET_DISTANCES)
phaseRateHetDistances[nextIndex] = list(pPhaseTheta)
hetDistancesLeg[nextIndex] = "Theoretical (window = 2)" # params

scatter(hetDistances, phaseRateHetDistances, "intraHetDistances.theoretical_empirical", xlab="Mean intra-het distance", ylab="Phaseability", leg=hetDistancesLeg, legPos="topright", type="b", col=TWO_COLORS)

scatter(hetDistances, phaseRateHetDistances, "intraHetDistances.log.theoretical_empirical", xlab="Mean intra-het distance", ylab="Phaseability", leg=hetDistancesLeg, legPos="topright", type="b", col=TWO_COLORS, log="y", xlim=c(1, 20000))


######################################################################
# Phasing as a function of window size:
######################################################################
load("theoretical_window_on_empirical.RData")

windows = list()
phaseRateWindows = list()
windowsLeg = vector()

NUM_HET_SITES = as.integer(system("cat ~fromer/storage/phase.NA12878/COMPLETE_LIST.het_sites.interval_list | wc -l", intern=TRUE))
NUM_CHR = as.integer(system("cat ~fromer/storage/phase.NA12878/COMPLETE_LIST.het_sites.interval_list | cut -f1 -d':' | sort | uniq | wc -l", intern=TRUE))
NUM_PHASEABLE_HET_SITES = NUM_HET_SITES - NUM_CHR # since can't phase the first het site of each chromosome

# Series 1 (empirical) stays fixed; series 2 (theoretical) is replaced before each plot below
windowPhasedConsistent = scan("~fromer/storage/phase.NA12878/window_phased_consistent.txt", what=list(window=0, phased=0, consistentPhased=0))
windows[1] = list(windowPhasedConsistent$window)
phaseRateWindows[1] = list(windowPhasedConsistent$phased / NUM_PHASEABLE_HET_SITES)
windowsLeg[1] = "HiSeq"

# Theoretical model evaluated on the empirical distances (rows containing NA are dropped)
windows[2] = list(WINDOW_SIZES)
phaseRateWindows[2] = list(colMeans(na.omit(phaseProbsPositionWindow)))
windowsLeg[2] = "Theoretical" # params

scatter(windows, phaseRateWindows, "windows.theoretical_empirical", xlab="Window size", ylab="Phaseability", leg=windowsLeg, legPos="topleft", width=14, height=7, type="b", col=TWO_COLORS)


# Use numerical integration over theoretical distances distribution:
load("testWindows.RData")

# Entries still at -1 were never computed
doneInds = which(pPhaseWindow != -1)

windows[2] = list(WINDOW_SIZES[doneInds])
phaseRateWindows[2] = list(pPhaseWindow[doneInds])
windowsLeg[2] = "Theoretical" # params

scatter(windows, phaseRateWindows, "theoretical_distances.windows.theoretical_empirical", xlab="Window size", ylab="Phaseability", leg=windowsLeg, legPos="topleft", width=14, height=7, type="b", col=TWO_COLORS)


# Use theoretical sampling of distances:
load("theoretical_window.RData")

windows[2] = list(WINDOW_SIZES)
phaseRateWindows[2] = list(colMeans(na.omit(phaseProbsPositionWindow)))
windowsLeg[2] = "Theoretical" # params

scatter(windows, phaseRateWindows, "sampled_theoretical_distances.windows.theoretical_empirical", xlab="Window size", ylab="Phaseability", leg=windowsLeg, legPos="topleft", width=14, height=7, type="b", col=TWO_COLORS)
|
||||
|
|
@ -1,190 +0,0 @@
|
|||
#!/bin/env Rscript
#
# Plots, for one binned variant annotation, the Ti/Tv ratio, the dbSNP / true-positive rate and
# the number of variants per bin against the annotation value, each with a linear and with a
# log-scaled x-axis. Six PDFs named <input><suffix> are written.
#
# Command-line arguments:
#   1: input table (with header); columns used: value, titv, numVariants, category, dbsnp, truePositive
#   2: annotation name, used as the x-axis label
#   3: only bins with numVariants > this cutoff are used
#   4: "true" to place the 10/25/50/75/90 markers by cumulative variant fraction;
#      anything else (or omitted) places only 25/50/75 markers from weighted means

args <- commandArgs(TRUE)
verbose = TRUE

input = args[1]
annotationName = args[2]
minBinCutoff = as.numeric(args[3])
medianNumVariants = args[4]
# isTRUE() makes an omitted 4th argument (NA) behave like "false" instead of aborting in if()
useMedianNumVariants = isTRUE(medianNumVariants == "true")

bins <- read.table(input, header=T)

all = bins[bins$numVariants>minBinCutoff & bins$category=="all",]
novel = bins[bins$numVariants>minBinCutoff & bins$category=="novel",]
dbsnp = bins[bins$numVariants>minBinCutoff & bins$category=="dbsnp",]
truth = bins[bins$numVariants>minBinCutoff & bins$category=="truth",]

# Truth data is plotted only when at least one "all" bin has a non-zero truePositive value
hasTruth = sum(all$truePositive==0) != length(all$truePositive)

#
# Calculate min, max, medians
#

d = bins[bins$numVariants>minBinCutoff,]
titvRange = c(min(d$titv), max(d$titv))
xmin = min(d$value)
xmax = max(d$value)

# Default markers: weighted mean of all bins, and of the bins above / below that mean
m = weighted.mean(all$value,all$numVariants/sum(all$numVariants))
ma = all[all$value > m,]
mb = all[all$value < m,]
m75 = weighted.mean(ma$value,ma$numVariants/sum(ma$numVariants))
m25 = weighted.mean(mb$value,mb$numVariants/sum(mb$numVariants))
if (useMedianNumVariants) {
    # Cumulative fraction of variants per bin, in the row order of the input
    # (assumes bins are ordered by annotation value -- TODO confirm against the producer of the table)
    vc = cumsum( all$numVariants/sum(all$numVariants) )

    # Value of the last bin whose cumulative fraction is <= q; falls back to the first bin when
    # even that one exceeds q (max(which(...)) would be -Inf and the lookup would fail)
    valueAtOrBelow <- function(q) {
        hits = which(vc <= q)
        if (length(hits) == 0) all$value[1] else all$value[max(hits)]
    }
    # Value of the first bin whose cumulative fraction is >= q
    valueAtOrAbove <- function(q) {
        all$value[ min(which(vc >= q)) ]
    }

    m10 = valueAtOrBelow(0.10)
    m25 = valueAtOrBelow(0.25)
    m = valueAtOrBelow(0.5)
    m75 = valueAtOrAbove(0.75)
    m90 = valueAtOrAbove(0.90)
}

#
# Shared plotting helpers
#

# Opens <input><suffix> as a 7x7 PDF device and sets the text scaling.
openPdf <- function(suffix) {
    pdf(paste(input, suffix, sep=""), height=7, width=7)
    par(cex=1.1)
}

# Draws the x-axis with non-scientific tick labels (plots are created with xaxt="n").
drawValueAxis <- function() {
    axis(1,axTicks(1), format(axTicks(1), scientific=F))
}

# Draws the vertical marker lines and their labels, the labels at height yText.
drawMarkers <- function(yText) {
    abline(v=m,lty=2,col="red")
    abline(v=m75,lty=3)
    abline(v=m25,lty=3)
    text(m, yText, "50", col="red", cex=0.6)
    text(m75, yText, "75", col="black", cex=0.6)
    text(m25, yText, "25", col="black", cex=0.6)
    if (useMedianNumVariants) {
        abline(v=m90,lty=3)
        abline(v=m10,lty=3)
        text(m10, yText, "10", col="black", cex=0.6)
        text(m90, yText, "90", col="black", cex=0.6)
    }
}

# Ti/Tv ratio of all / novel / dbsnp (/ truth) bins; logAxes is "" or "x".
plotTiTv <- function(suffix, logAxes) {
    openPdf(suffix)
    plot(all$value,all$titv,xlab=annotationName,log=logAxes,ylab="Ti/Tv Ratio",pch=20,ylim=titvRange,xaxt="n",ps=14)
    drawValueAxis()
    drawMarkers(titvRange[1])
    points(novel$value,novel$titv,col="green",pch=20)
    points(dbsnp$value,dbsnp$titv,col="blue",pch=20)
    if (hasTruth) {
        points(truth$value,truth$titv,col="magenta",pch=20)
        legend("topleft", c("all","novel","dbsnp","truth"),col=c("black","green","blue","magenta"),pch=c(20,20,20,20))
    } else {
        legend("topleft", c("all","novel","dbsnp"),col=c("black","green","blue"),pch=c(20,20,20))
    }
    dev.off()
}

# dbSNP rate (and true-positive rate when truth data is present); logAxes is "" or "x".
plotTruthRate <- function(suffix, logAxes, yLabel, yRange) {
    openPdf(suffix)
    plot(all$value,all$dbsnp,xlab=annotationName,log=logAxes,ylab=yLabel,pch=20,ylim=yRange,xaxt="n",ps=14)
    drawValueAxis()
    drawMarkers(yRange[1])
    if (hasTruth) {
        points(truthBins$value,truthBins$truePositive,col="magenta",pch=20)
        legend("topleft", c("dbsnp","truth"),col=c("black","magenta"),pch=c(20,20))
    }
    dev.off()
}

# Number of variants per bin as vertical bars; logAxes is "" or "x".
plotHistogram <- function(suffix, logAxes) {
    openPdf(suffix)
    plot(all$value,all$numVariants,xlab=annotationName,log=logAxes,ylab="Num variants in bin",type="h",xaxt="n",ps=14,lwd=4)
    drawValueAxis()
    dev.off()
}

#
# Plot TiTv ratio as a function of the annotation (linear, then log scale on the x-axis)
#

plotTiTv(".TiTv.pdf", "")
plotTiTv(".TiTv_log.pdf", "x")

#
# Plot dbsnp and true positive rate as a function of the annotation (linear, then log scale on the x-axis)
#

truthRateRange = c(min(all$dbsnp), max(all$dbsnp))
if (hasTruth) {
    # Bins with a positive true-positive value; the y-range must cover them as well
    truthBins = all[all$truePositive>0,]
    truthRateRange = c(min(min(all$dbsnp),min(truthBins$truePositive)), max(max(all$dbsnp),max(truthBins$truePositive)))
}
plotTruthRate(".truthRate.pdf", "", if (hasTruth) "DBsnp/True Positive Rate" else "DBsnp Rate", truthRateRange)
plotTruthRate(".truthRate_log.pdf", "x", if (hasTruth) "DBsnp/Truth Rate" else "DBsnp Rate", truthRateRange)

#
# Plot histogram of the annotation's value (linear, then log scale on the x-axis)
#

plotHistogram(".Histogram.pdf", "")
plotHistogram(".Histogram_log.pdf", "x")
|
||||
|
|
@ -1,19 +0,0 @@
|
|||
#!/bin/env Rscript

# Draws the distribution of known and novel SNPs over the values of one
# annotation and saves it as "<input>.ClusterReport.pdf".
#
# Command line: <input csv> <annotation name>
# The CSV is read with a header row; the columns annotationValue, knownDist
# and novelDist are the ones plotted.

cliArgs <- commandArgs(TRUE)
verbose <- TRUE

csvPath <- cliArgs[1]
annotationLabel <- cliArgs[2]

clusterTable <- read.table(csvPath, sep = ",", header = TRUE)

# First colour is used for the known series, second for the novel series.
seriesColors <- c("orange", "blue")

pdf(paste(csvPath, ".ClusterReport.pdf", sep = ""), height = 7, width = 8)

# Both series share one y-axis, so scale it to the taller of the two.
yTop <- max(clusterTable$knownDist, clusterTable$novelDist)

plot(clusterTable$annotationValue, clusterTable$knownDist,
     type = "b", lwd = 2, col = seriesColors[1],
     ylim = c(0, yTop),
     xlab = annotationLabel, ylab = "fraction of SNPs")
points(clusterTable$annotationValue, clusterTable$novelDist,
       type = "b", lwd = 2, col = seriesColors[2])
legend("topright", c("knowns", "novels"), lwd = 2, col = seriesColors)
dev.off()
|
||||
|
|
@ -1,67 +0,0 @@
|
|||
args = commandArgs(TRUE);
|
||||
|
||||
RUNME = F
|
||||
onCMDLine = ! is.na(args[1])
|
||||
DATA_FILE = args[1]
|
||||
DESCRIPTION = args[2]
|
||||
#OUTPUT_PDF = paste(DATA_FILE, ".pdf", sep="")
|
||||
|
||||
MAX_POINTS = 100000
|
||||
|
||||
if ( onCMDLine ) {
|
||||
print(paste("Reading data from", DATA_FILE))
|
||||
d = read.table(DATA_FILE, header=T)
|
||||
}
|
||||
|
||||
#if ( onCMDLine ) pdf(OUTPUT_PDF)
|
||||
|
||||
# Turn a running total into per-step increments.
#
# Args:
#   x: vector of cumulative values.
# Returns:
#   A vector of the same length as x: x[1] followed by the successive
#   differences x[i] - x[i-1]. An empty input is returned unchanged.
vec.margin <- function(x) {
    # Indexing with 1:(l-1) expands to c(1, 0) for a single element (which
    # appended a spurious NA) and to a mixed-sign index for empty input (which
    # raised an error); diff() handles both lengths correctly.
    if ( length(x) == 0 ) {
        return(x)
    }
    c(x[1], diff(x))
}
|
||||
|
||||
# Thin a table so that roughly n rows remain.
#
# Args:
#   x: data frame or matrix (anything with dim() and two-index subsetting).
#   n: target maximum number of rows.
# Returns:
#   The rows of x whose index is a multiple of m = ceiling(nrow / n). When x
#   has no more than n rows, m is 1 and every row is kept.
# Side effect:
#   Prints the stride m.
everyNth <- function(x, n) {
    nRows <- dim(x)[1]
    stride <- ceiling(nRows / n)
    print(stride)
    # seq_len() rather than 1:nRows: for a zero-row input 1:0 is c(1, 0), which
    # produced an NA selector and therefore NA-filled rows.
    keep <- seq_len(nRows) %% stride == 0
    x[keep,]
}
|
||||
|
||||
l = length(d$units.processed)
|
||||
d$units.processed.margin = vec.margin(d$units.processed)
|
||||
#prev = 0
|
||||
#for ( i in 1:l ) {
|
||||
# cur = d$units.processed[i]
|
||||
# d[i,]$units.processed.margin = cur - prev
|
||||
# prev = cur
|
||||
#}
|
||||
|
||||
# Draw the two progress plots for one timing table.
#
# Args:
#   d: table with columns elapsed.time, processing.speed and
#      units.processed.margin.
#
# Reads the script-level settings onCMDLine, DATA_FILE, DESCRIPTION and
# MAX_POINTS. When onCMDLine is TRUE each plot goes to its own PNG
# ("<DATA_FILE>.speed.png" and "<DATA_FILE>.marginal.png"); otherwise the
# plots are drawn on whatever graphics device is current.
generateOneReport <- function(d) {
    # 1st, 50th and 99th percentile of processing speed: the outer two bound
    # the y-axis, the median is drawn as a dashed reference line.
    speedQuantiles <- quantile(d$processing.speed, probs = c(0.01, 0.5, 0.99))

    # ---- plot 1: processing speed over elapsed time ----
    if ( onCMDLine ) {
        png(paste(DATA_FILE, ".speed.png", sep = ""), width = 1080, height = 1080)
    }
    thinned <- everyNth(d, MAX_POINTS)
    plot(thinned$elapsed.time, thinned$processing.speed,
         type = "b", lwd = 2, col = "cornflowerblue",
         main = DESCRIPTION,
         xlab = "Elapsed time (sec)",
         ylab = "Processing speed (seconds per 1M units)",
         ylim = c(speedQuantiles[1], speedQuantiles[3]))
    abline(h = speedQuantiles[2], lty = 2)
    if ( onCMDLine ) {
        dev.off()
    }

    # ---- plot 2: units processed per timing interval ----
    if ( onCMDLine ) {
        png(paste(DATA_FILE, ".marginal.png", sep = ""), width = 1080, height = 1080)
    }
    medianWindow <- 101
    pointColor <- "#0000AA99"
    # The running median is computed on the full table, not the thinned one.
    smoothed <- runmed(d$units.processed.margin, medianWindow)
    plot(thinned$elapsed.time, thinned$units.processed.margin,
         type = "p", cex = 0.75, col = pointColor,
         main = DESCRIPTION,
         xlab = "Elapsed time (sec)",
         ylab = "Units processed in last timing interval")
    lines(d$elapsed.time, smoothed, lwd = 3, col = "red")
    legend("topleft", c("Observations", "101-elt running median"), fill = c(pointColor, "red"))
    if ( onCMDLine ) dev.off()
}
|
||||
|
||||
if ( RUNME ) {
|
||||
generateOneReport(d)
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
|
@ -1,36 +0,0 @@
|
|||
#!/bin/env Rscript

# Plots Ti/Tv ratio (left axis) and variant counts (right axis) against the
# quality cut, for known and novel variants, into
# "<input>.optimizationCurve.pdf".
#
# Command line: <input csv> <target Ti/Tv>
# The CSV is read with a header row; the columns pCut, numKnown, numNovel,
# knownTITV and novelTITV are used.

cliArgs <- commandArgs(TRUE)
verbose <- TRUE

csvPath <- cliArgs[1]
targetTITV <- as.numeric(cliArgs[2])

# -----------------------------------------------------------------------------------------------
# optimization curve
# -----------------------------------------------------------------------------------------------
cuts <- read.table(csvPath, sep = ",", header = TRUE)

# Top of the count axis.
countTop <- max(cuts$numKnown, cuts$numNovel)

# Top of the Ti/Tv axis: largest finite ratio among rows with more than 2000
# variants (or the target itself), capped at target + 1.
knownUsable <- is.finite(cuts$knownTITV) & cuts$numKnown > 2000
novelUsable <- is.finite(cuts$novelTITV) & cuts$numNovel > 2000
titvTop <- max(cuts$knownTITV[knownUsable], cuts$novelTITV[novelUsable], targetTITV)
titvTop <- min(titvTop, targetTITV + 1)

# Bottom of the Ti/Tv axis: the ratios in the last row, or the target.
lastKnownTITV <- cuts$knownTITV[length(cuts$knownTITV)]
lastNovelTITV <- cuts$novelTITV[length(cuts$novelTITV)]
titvBottom <- min(lastKnownTITV, lastNovelTITV, targetTITV)

# The x-axis stops at the largest cut that still retains any variant.
hasVariants <- cuts$numKnown > 0 | cuts$numNovel > 0
cutRange <- c(0, max(cuts$pCut[hasVariants]))

pdf(paste(csvPath, ".optimizationCurve.pdf", sep = ""), height = 7, width = 8)
par(mar = c(4, 4, 1, 4), cex = 1.3)

# Layer 1: Ti/Tv ratios against the left-hand axis.
plot(cuts$pCut, cuts$knownTITV,
     axes = FALSE, xlab = "Keep variants with QUAL >= X", ylab = "",
     ylim = c(titvBottom, titvTop), xlim = cutRange,
     col = "Blue", pch = 20)
points(cuts$pCut, cuts$novelTITV, col = "DarkBlue", pch = 20)
abline(h = targetTITV, lty = 3, col = "Blue")
axis(side = 2, col = "DarkBlue")
axis(side = 1)
mtext("Ti/Tv Ratio", side = 2, line = 2, col = "blue", cex = 1.4)
legend("left", c("Known Ti/Tv", "Novel Ti/Tv"), col = c("Blue", "DarkBlue"), pch = c(20, 20), cex = 0.7)

# Layer 2: variant counts drawn over the same region against the right-hand axis.
par(new = TRUE)
plot(cuts$pCut, cuts$numKnown,
     axes = FALSE, xlab = "", ylab = "",
     ylim = c(0, countTop), xlim = cutRange,
     col = "Green", pch = 20)
points(cuts$pCut, cuts$numNovel, col = "DarkGreen", pch = 20)
axis(side = 4, col = "DarkGreen")
mtext("Number of Variants", side = 4, line = 2, col = "DarkGreen", cex = 1.4)
legend("topright", c("Known", "Novel"), col = c("Green", "DarkGreen"), pch = c(20, 20), cex = 0.7)
dev.off()
|
||||
|
|
@ -1,87 +0,0 @@
|
|||
#!/bin/env Rscript
|
||||
|
||||
args <- commandArgs(TRUE)
|
||||
verbose = TRUE
|
||||
|
||||
tranchesFile = args[1]
|
||||
targetTITV = as.numeric(args[2])
|
||||
targetSensitivity = as.numeric(args[3])
|
||||
suppressLegend = ! is.na(args[4])
|
||||
|
||||
# -----------------------------------------------------------------------------------------------
|
||||
# Useful general routines
|
||||
# -----------------------------------------------------------------------------------------------
|
||||
|
||||
# Floor applied to every false-positive estimate (1 in 1000).
MIN_FP_RATE = 0.001

# Estimate a false-positive rate from an observed Ti/Tv ratio.
#
# Args:
#   titvExpected: Ti/Tv ratio expected for true variants.
#   titvObserved: Ti/Tv ratio actually observed.
# Returns:
#   1 - (observed - 0.5) / (expected - 0.5), limited to at most 1 and at
#   least MIN_FP_RATE. The result is a single number: min()/max() collapse a
#   vector argument.
titvFPEst <- function(titvExpected, titvObserved) {
    shortfall <- 1 - (titvObserved - 0.5) / (titvExpected - 0.5)
    capped <- min(shortfall, 1)
    max(capped, MIN_FP_RATE)
}
|
||||
|
||||
# Element-wise version of titvFPEst.
#
# Args:
#   titvExpected: Ti/Tv ratio expected for true variants.
#   titvs: vector of observed Ti/Tv ratios.
# Returns:
#   Numeric vector with one estimate per element of titvs.
titvFPEstV <- function(titvExpected, titvs) {
    # vapply() instead of sapply(): sapply() returns list() for empty input,
    # which cannot be used in arithmetic; vapply() always yields a numeric vector.
    vapply(titvs, function(x) titvFPEst(titvExpected, x), numeric(1))
}
|
||||
|
||||
# Split a variant count into true and false positives.
#
# Args:
#   nVariants: number of variants.
#   FDR: false discovery rate, given as a percentage (it is divided by 100).
# Returns:
#   list(TP = ..., FP = ...) with the two shares of nVariants.
nTPFP <- function(nVariants, FDR) {
    truePositives <- nVariants * (1 - FDR/100)
    falsePositives <- nVariants * (FDR / 100)
    list(TP = truePositives, FP = falsePositives)
}
|
||||
|
||||
# Shift a vector one position to the left.
#
# Args:
#   x: input vector.
#   leftValue: value placed in the vacated last position (default 0).
# Returns:
#   A vector of the same length as x holding x[2], ..., x[n], leftValue.
#   An empty input yields an empty result.
leftShift <- function(x, leftValue = 0) {
    n <- length(x)
    if ( n == 0 ) {
        return(rep(leftValue, 0))
    }
    # The previous loop ran over 1:(n-1); for n == 1 that is c(1, 0), which
    # overwrote the only slot with x[2] (NA) instead of leaving leftValue, and
    # for n == 0 it raised an error.
    c(unname(x[-1]), leftValue)
}
|
||||
|
||||
# -----------------------------------------------------------------------------------------------
|
||||
# Tranches plot
|
||||
# -----------------------------------------------------------------------------------------------
|
||||
# Reads the tranches table (CSV with header) and writes "<tranchesFile>.pdf":
# a stacked horizontal bar chart of novel variants per tranche and, when the
# table has a truthSensitivity column, a second page plotting novel Ti/Tv
# against truth sensitivity.
tranches <- read.table(tranchesFile, sep = ",", header = TRUE)
tranches <- tranches[order(tranches$novelTiTv, decreasing = FALSE), ]

# One entry per stacked segment, in the row order of the bar matrix below.
barColors <- c("cornflowerblue", "cornflowerblue", "darkorange", "darkorange")
barDensity <- c(20, -1, -1, 20)

pdf(paste(tranchesFile, ".pdf", sep = ""), height = 5, width = 8)
par(mar = c(5, 5, 4, 2) + 0.1)

# The Ti/Tv column is picked up under either spelling of its name.
novelTiTv <- c(tranches$novelTITV, tranches$novelTiTv)

# Share of novel calls counted as good: 1 minus the estimate from titvFPEstV.
goodFraction <- 1 - titvFPEstV(targetTITV, novelTiTv)
novelCount <- tranches$numNovel
goodCount <- round(goodFraction * novelCount)
badCount <- novelCount - goodCount

# Per tranche, the count carried by the next row and the remainder.
prevGood <- leftShift(goodCount, 0)
prevBad <- leftShift(badCount, 0)
newGood <- goodCount - prevGood
newBad <- badCount - prevBad

barHeights <- matrix(c(prevGood, newGood, newBad, prevBad), 4, byrow = TRUE)
barplot(barHeights / 1000, horiz = TRUE, col = barColors, space = 0.2,
        xlab = "Number of Novel Variants (1000s)",
        density = barDensity, cex.axis = 1.25, cex.lab = 1.25)

nTranches <- length(tranches$targetTruthSensitivity)
if ( ! suppressLegend ) {
    legend(3, nTranches / 3 + 1,
           c('Cumulative TPs','Tranch-specific TPs', 'Tranch-specific FPs', 'Cumulative FPs' ),
           fill = barColors, density = barDensity, bg = 'white', cex = 1.25)
}

# Column headings and per-bar labels in the left margin.
headingHeight <- nTranches * 1.2
barPositions <- 0.7 + (0:(nTranches - 1)) * 1.2
mtext("Ti/Tv", 2, line = 2.25, at = headingHeight, las = 1, cex = 1)
mtext("truth", 2, line = 0, at = headingHeight, las = 1, cex = 1)
axis(2, line = -1, at = barPositions, tick = FALSE, labels = tranches$targetTruthSensitivity, las = 1, cex.axis = 1.0)
axis(2, line = 1, at = barPositions, tick = FALSE, labels = round(novelTiTv, 3), las = 1, cex.axis = 1.0)

# plot sensitivity vs. specificity (novel Ti/Tv stands in for specificity)
truthSensitivity <- tranches$truthSensitivity
if ( ! is.null(truthSensitivity) ) {
    plot(truthSensitivity, novelTiTv, type = "b", col = "cornflowerblue",
         xlab = "Tranche truth sensitivity", ylab = "Specificity (Novel Ti/Tv ratio)")
    abline(h = targetTITV, lty = 2)
    abline(v = targetSensitivity, lty = 2)
}

dev.off()
|
||||
|
|
@ -1,108 +0,0 @@
|
|||
#!/bin/env Rscript

# For one covariate, plots (1) empirical indel quality, (2) mean reported
# quality and (3) the number of bases seen per covariate value.
#
# Command line: <covariate table> <covariate name>
# The table is read with a header row; the columns Covariate, nBases,
# Qempirical and Qreported are used and rows are ordered by the first column.
# Output files:
#   <input>.indelQual_v_<name>.pdf
#   <input>.reported_qual_v_<name>.pdf
#   <input>.<name>_hist.pdf

cliArgs <- commandArgs(TRUE)
verbose <- TRUE

tablePath <- cliArgs[1]
covariateLabel <- cliArgs[2]

# Base-count-weighted RMS difference between empirical and reported quality.
# as.numeric() keeps the sums out of integer range.
weightedRmse <- function(tbl) {
    sqrt( sum(as.numeric((tbl$Qempirical - tbl$Qreported)^2 * tbl$nBases)) / sum(as.numeric(tbl$nBases)) )
}

# Covariate/base-count pairs of a table, without the rows that have zero bases.
nonEmptyBins <- function(tbl) {
    subset(data.frame(Covariate = tbl$Covariate, nBases = tbl$nBases), nBases != 0)
}

pdf(paste(tablePath, ".indelQual_v_", covariateLabel, ".pdf", sep = ""), height = 7, width = 7)
par(cex = 1.1)
covTable <- read.table(tablePath, header = TRUE)
covTable <- covTable[sort.list(covTable[, 1]), ]

#
# Plot qual as a function of the covariate
#

# Rows with at least 1000 bases are drawn in blue, the rest in a lighter blue.
wellSupported <- covTable[covTable$nBases >= 1000, ]
sparse <- covTable[covTable$nBases < 1000, ]

rmseGood <- weightedRmse(wellSupported)
rmseAll <- weightedRmse(covTable)
if ( length(wellSupported$nBases) == length(covTable$nBases) ) {
    # Every row is well supported, so a single figure is enough.
    plotTitle <- paste("RMSE =", round(rmseAll, digits = 3))
} else {
    plotTitle <- paste("RMSE_good =", round(rmseGood, digits = 3), ", RMSE_all =", round(rmseAll, digits = 3))
}

nValues <- length(covTable$Covariate)
numericCovariate <- is.numeric(covTable$Covariate)
# With few values the points are joined by lines.
markerType <- if ( nValues <= 20 ) "o" else "p"

if ( numericCovariate ) {
    covariateRange <- c(min(covTable$Covariate), max(covTable$Covariate))
    plot(wellSupported$Covariate, wellSupported$Qempirical, type = markerType, main = plotTitle,
         ylab = "Empirical Indel Quality", xlab = covariateLabel, col = "blue", pch = 20,
         ylim = c(0, 50), xlim = covariateRange)
    points(sparse$Covariate, sparse$Qempirical, type = markerType, col = "cornflowerblue", pch = 20)
} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
    plot(covTable$Covariate, covTable$Qempirical, type = "l", main = plotTitle,
         ylab = "Empirical Indel Quality", xlab = covariateLabel, col = "blue", ylim = c(0, 50))
    points(sparse$Covariate, sparse$Qempirical, type = "l", col = "cornflowerblue")
}
dev.off()

#
# Plot mean quality versus the covariate
#

pdf(paste(tablePath, ".reported_qual_v_", covariateLabel, ".pdf", sep = ""), height = 7, width = 7)
par(cex = 1.1)
plotTitle <- paste("Quality By", covariateLabel)
if ( numericCovariate ) {
    covariateRange <- c(min(covTable$Covariate), max(covTable$Covariate))
    plot(wellSupported$Covariate, wellSupported$Qreported, type = markerType, main = plotTitle,
         ylab = "Mean Reported Quality", xlab = covariateLabel, col = "blue", pch = 20,
         ylim = c(0, 40), xlim = covariateRange)
    points(sparse$Covariate, sparse$Qreported, type = markerType, col = "cornflowerblue", pch = 20)
} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
    plot(covTable$Covariate, covTable$Qreported, type = "l", main = plotTitle,
         ylab = "Mean Reported Quality", xlab = covariateLabel, col = "blue", ylim = c(0, 40))
    points(sparse$Covariate, sparse$Qreported, type = "l", col = "cornflowerblue")
}
dev.off()

#
# Plot histogram of the covariate
#

pdf(paste(tablePath, ".", covariateLabel, "_hist.pdf", sep = ""), height = 7, width = 7)
goodBins <- nonEmptyBins(wellSupported)
sparseBins <- nonEmptyBins(sparse)

# Thicker bars when there are only a few covariate values.
barWidth <- if ( nValues <= 20 ) 7 else if ( nValues <= 70 ) 4 else 2
histTitle <- paste(covariateLabel, "histogram")

if ( numericCovariate ) {
    covariateRange <- c(min(covTable$Covariate), max(covTable$Covariate))
    if ( length(goodBins$Covariate) == 0 ) {
        # No well-supported rows: the sparse rows become the main series.
        plot(sparseBins$Covariate, sparseBins$nBases, type = "h", lwd = barWidth, col = "cornflowerblue",
             main = histTitle, ylim = c(0, max(sparseBins$nBases)),
             xlab = covariateLabel, ylab = "Count", yaxt = "n", xlim = covariateRange)
    } else {
        plot(goodBins$Covariate, goodBins$nBases, type = "h", lwd = barWidth,
             main = histTitle, xlab = covariateLabel, ylim = c(0, max(goodBins$nBases)),
             ylab = "Number of Bases", yaxt = "n", xlim = covariateRange)
        points(sparseBins$Covariate, sparseBins$nBases, type = "h", lwd = barWidth, col = "cornflowerblue")
    }
    axis(2, axTicks(2), format(axTicks(2), scientific = FALSE))
} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
    allBins <- nonEmptyBins(covTable)
    nBins <- length(allBins$Covariate)
    plot(1:nBins, allBins$nBases, type = "h", lwd = barWidth, main = histTitle,
         ylim = c(0, max(allBins$nBases)), xlab = covariateLabel, ylab = "Number of Bases",
         yaxt = "n", xaxt = "n")
    # Label every second bar once there are more than nine of them.
    tickStep <- if ( nBins > 9 ) 2 else 1
    tickAt <- seq(1, nBins, tickStep)
    axis(1, at = tickAt, labels = allBins$Covariate[tickAt])
    axis(2, axTicks(2), format(axTicks(2), scientific = FALSE))
}
dev.off()
|
||||
|
|
@ -1,108 +0,0 @@
|
|||
#!/bin/env Rscript

# For one covariate, plots (1) empirical minus reported quality, (2) mean
# reported quality and (3) the number of bases seen per covariate value.
#
# Command line: <covariate table> <covariate name>
# The table is read with a header row; the columns Covariate, nBases,
# Qempirical and Qreported are used and rows are ordered by the first column.
# Output files:
#   <input>.qual_diff_v_<name>.pdf
#   <input>.reported_qual_v_<name>.pdf
#   <input>.<name>_hist.pdf

cliArgs <- commandArgs(TRUE)
verbose <- TRUE

tablePath <- cliArgs[1]
covariateLabel <- cliArgs[2]

# Base-count-weighted RMS difference between empirical and reported quality.
# as.numeric() keeps the sums out of integer range.
weightedRmse <- function(tbl) {
    sqrt( sum(as.numeric((tbl$Qempirical - tbl$Qreported)^2 * tbl$nBases)) / sum(as.numeric(tbl$nBases)) )
}

# Empirical minus reported quality, limited to [-10, 10] so that no point
# falls off the edge of the plot.
clippedResidual <- function(tbl) {
    pmax(pmin(tbl$Qempirical - tbl$Qreported, 10), -10)
}

# Covariate/base-count pairs of a table, without the rows that have zero bases.
nonEmptyBins <- function(tbl) {
    subset(data.frame(Covariate = tbl$Covariate, nBases = tbl$nBases), nBases != 0)
}

pdf(paste(tablePath, ".qual_diff_v_", covariateLabel, ".pdf", sep = ""), height = 7, width = 7)
par(cex = 1.1)
covTable <- read.table(tablePath, header = TRUE)
covTable <- covTable[sort.list(covTable[, 1]), ]

#
# Plot residual error as a function of the covariate
#

# Rows with at least 1000 bases are drawn in blue, the rest in a lighter blue.
wellSupported <- covTable[covTable$nBases >= 1000, ]
sparse <- covTable[covTable$nBases < 1000, ]

rmseGood <- weightedRmse(wellSupported)
rmseAll <- weightedRmse(covTable)
if ( length(wellSupported$nBases) == length(covTable$nBases) ) {
    # Every row is well supported, so a single figure is enough.
    plotTitle <- paste("RMSE =", round(rmseAll, digits = 3))
} else {
    plotTitle <- paste("RMSE_good =", round(rmseGood, digits = 3), ", RMSE_all =", round(rmseAll, digits = 3))
}

wellSupported$residualError <- clippedResidual(wellSupported)
sparse$residualError <- clippedResidual(sparse)
covTable$residualError <- clippedResidual(covTable)

nValues <- length(covTable$Covariate)
numericCovariate <- is.numeric(covTable$Covariate)
# With few values the points are joined by lines.
markerType <- if ( nValues <= 20 ) "o" else "p"

if ( numericCovariate ) {
    covariateRange <- c(min(covTable$Covariate), max(covTable$Covariate))
    plot(wellSupported$Covariate, wellSupported$residualError, type = markerType, main = plotTitle,
         ylab = "Empirical - Reported Quality", xlab = covariateLabel, col = "blue", pch = 20,
         ylim = c(-10, 10), xlim = covariateRange)
    points(sparse$Covariate, sparse$residualError, type = markerType, col = "cornflowerblue", pch = 20)
} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
    plot(covTable$Covariate, covTable$residualError, type = "l", main = plotTitle,
         ylab = "Empirical - Reported Quality", xlab = covariateLabel, col = "blue", ylim = c(-10, 10))
    points(sparse$Covariate, sparse$residualError, type = "l", col = "cornflowerblue")
}
dev.off()

#
# Plot mean quality versus the covariate
#

pdf(paste(tablePath, ".reported_qual_v_", covariateLabel, ".pdf", sep = ""), height = 7, width = 7)
par(cex = 1.1)
plotTitle <- paste("Quality By", covariateLabel)
if ( numericCovariate ) {
    covariateRange <- c(min(covTable$Covariate), max(covTable$Covariate))
    plot(wellSupported$Covariate, wellSupported$Qreported, type = markerType, main = plotTitle,
         ylab = "Mean Reported Quality", xlab = covariateLabel, col = "blue", pch = 20,
         ylim = c(0, 40), xlim = covariateRange)
    points(sparse$Covariate, sparse$Qreported, type = markerType, col = "cornflowerblue", pch = 20)
} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
    plot(covTable$Covariate, covTable$Qreported, type = "l", main = plotTitle,
         ylab = "Mean Reported Quality", xlab = covariateLabel, col = "blue", ylim = c(0, 40))
    points(sparse$Covariate, sparse$Qreported, type = "l", col = "cornflowerblue")
}
dev.off()

#
# Plot histogram of the covariate
#

pdf(paste(tablePath, ".", covariateLabel, "_hist.pdf", sep = ""), height = 7, width = 7)
goodBins <- nonEmptyBins(wellSupported)
sparseBins <- nonEmptyBins(sparse)

# Thicker bars when there are only a few covariate values.
barWidth <- if ( nValues <= 20 ) 7 else if ( nValues <= 70 ) 4 else 2
histTitle <- paste(covariateLabel, "histogram")

if ( numericCovariate ) {
    covariateRange <- c(min(covTable$Covariate), max(covTable$Covariate))
    if ( length(goodBins$Covariate) == 0 ) {
        # No well-supported rows: the sparse rows become the main series.
        plot(sparseBins$Covariate, sparseBins$nBases, type = "h", lwd = barWidth, col = "cornflowerblue",
             main = histTitle, ylim = c(0, max(sparseBins$nBases)),
             xlab = covariateLabel, ylab = "Count", yaxt = "n", xlim = covariateRange)
    } else {
        plot(goodBins$Covariate, goodBins$nBases, type = "h", lwd = barWidth,
             main = histTitle, xlab = covariateLabel, ylim = c(0, max(goodBins$nBases)),
             ylab = "Number of Bases", yaxt = "n", xlim = covariateRange)
        points(sparseBins$Covariate, sparseBins$nBases, type = "h", lwd = barWidth, col = "cornflowerblue")
    }
    axis(2, axTicks(2), format(axTicks(2), scientific = FALSE))
} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
    allBins <- nonEmptyBins(covTable)
    nBins <- length(allBins$Covariate)
    plot(1:nBins, allBins$nBases, type = "h", lwd = barWidth, main = histTitle,
         ylim = c(0, max(allBins$nBases)), xlab = covariateLabel, ylab = "Number of Bases",
         yaxt = "n", xaxt = "n")
    # Label every second bar once there are more than nine of them.
    tickStep <- if ( nBins > 9 ) 2 else 1
    tickAt <- seq(1, nBins, tickStep)
    axis(1, at = tickAt, labels = allBins$Covariate[tickAt])
    axis(2, axTicks(2), format(axTicks(2), scientific = FALSE))
}
dev.off()
|
||||
|
|
@ -1,70 +0,0 @@
|
|||
#!/bin/env Rscript

# Plots empirical against reported quality scores and the base-count
# histograms of both.
#
# Command line: <table> <Qcutoff> <maxQ> <maxHist>
#   Qcutoff  rows with Qreported below this are drawn separately (maroon)
#   maxQ     upper limit of the quality axes
#   maxHist  fixed top of the histogram y-axis; 0 means "use the data maximum"
# The table is read with a header row; the columns nBases, Qreported and
# Qempirical are used.
# Output files: <input>.quality_emp_v_stated.pdf, <input>.quality_emp_hist.pdf,
# <input>.quality_rep_hist.pdf

cliArgs <- commandArgs(TRUE)

tablePath <- cliArgs[1]
Qcutoff <- as.numeric(cliArgs[2])
maxQ <- as.numeric(cliArgs[3])
maxHist <- as.numeric(cliArgs[4])

qualTable <- read.table(tablePath, header = TRUE)

#
# Plot of reported quality versus empirical quality
#

pdf(paste(tablePath, ".quality_emp_v_stated.pdf", sep = ""), height = 7, width = 7)

# Rows at or above the cutoff are split by how many bases back them.
atOrAboveCutoff <- qualTable$Qreported >= Qcutoff
baseCount <- qualTable$nBases
deepRows <- qualTable[baseCount >= 10000 & atOrAboveCutoff, ]
shallowRows <- qualTable[baseCount < 1000 & atOrAboveCutoff, ]
middleRows <- qualTable[baseCount < 10000 & baseCount >= 1000 & atOrAboveCutoff, ]
belowCutoff <- qualTable[qualTable$Qreported < Qcutoff, ]
keptRows <- rbind(deepRows, shallowRows, middleRows)

# Base-count-weighted RMS difference; as.numeric() keeps the sums out of integer range.
weightedRmse <- function(tbl) {
    sqrt( sum(as.numeric((tbl$Qempirical - tbl$Qreported)^2 * tbl$nBases)) / sum(as.numeric(tbl$nBases)) )
}
rmseGood <- weightedRmse(deepRows)
rmseAll <- weightedRmse(keptRows)

if ( length(qualTable$nBases) - length(belowCutoff$nBases) == length(deepRows$nBases) ) {
    # Everything that passed the cutoff is in the deepest class.
    plotTitle <- paste("RMSE =", round(rmseAll, digits = 3))
} else {
    plotTitle <- paste("RMSE_good =", round(rmseGood, digits = 3), ", RMSE_all =", round(rmseAll, digits = 3))
}

plot(deepRows$Qreported, deepRows$Qempirical, type = "p", col = "blue", main = plotTitle,
     xlim = c(0, maxQ), ylim = c(0, maxQ), pch = 16,
     xlab = "Reported quality score", ylab = "Empirical quality score")
points(shallowRows$Qreported, shallowRows$Qempirical, type = "p", col = "lightblue", pch = 16)
points(middleRows$Qreported, middleRows$Qempirical, type = "p", col = "cornflowerblue", pch = 16)
points(belowCutoff$Qreported, belowCutoff$Qempirical, type = "p", col = "maroon1", pch = 16)
abline(0, 1, lty = 2)
dev.off()

# Writes one base-count histogram to "<input><suffix>".
#   keptScores / belowScores  quality scores of the kept rows and of the rows
#                             below the cutoff (same order as their tables)
#   titlePrefix               text placed before the entropy value in the title
#   axisLabel                 x-axis label
# The entropy in the title is that of the base counts of the kept rows.
drawScoreHistogram <- function(suffix, keptScores, belowScores, titlePrefix, axisLabel) {
    pdf(paste(tablePath, suffix, sep = ""), height = 7, width = 7)
    keptBins <- subset(data.frame(score = keptScores, nBases = keptRows$nBases), nBases != 0)
    belowBins <- subset(data.frame(score = belowScores, nBases = belowCutoff$nBases), nBases != 0)
    baseShare <- keptBins$nBases / sum(as.numeric(keptBins$nBases))
    entropy <- -sum(log2(baseShare) * baseShare)
    yTop <- max(keptBins$nBases)
    if ( maxHist != 0 ) {
        yTop <- maxHist
    }
    plot(keptBins$score, keptBins$nBases, type = "h", lwd = 4, xlim = c(0, maxQ), ylim = c(0, yTop),
         main = paste(titlePrefix, round(entropy, digits = 3)),
         xlab = axisLabel, ylab = "Number of Bases", yaxt = "n")
    points(belowBins$score, belowBins$nBases, type = "h", lwd = 4, col = "maroon1")
    axis(2, axTicks(2), format(axTicks(2), scientific = FALSE))
    dev.off()
}

#
# Plot Q empirical histogram
#

drawScoreHistogram(".quality_emp_hist.pdf", keptRows$Qempirical, belowCutoff$Qempirical,
                   "Empirical quality score histogram, entropy = ", "Empirical quality score")

#
# Plot Q reported histogram
#

drawScoreHistogram(".quality_rep_hist.pdf", keptRows$Qreported, belowCutoff$Qreported,
                   "Reported quality score histogram, entropy = ", "Reported quality score")
|
||||
|
|
@ -1,21 +0,0 @@
|
|||
#!/bin/env Rscript

# Draws one ROC-style curve per group of three columns of the input CSV and
# saves the figure as "<input>.variantROCCurve.pdf".
#
# Command line: <input csv>
# Within group k (1-based) the columns are used as follows:
#   column 3k-2  its header supplies the legend label
#   column 3k-1  y values (sensitivity axis)
#   column 3k    x values ("1 - Specificity" axis)

cliArgs <- commandArgs(TRUE)
verbose <- TRUE

csvPath <- cliArgs[1]

rocTable <- read.table(csvPath, sep = ",", header = TRUE)
numCurves <- (length(rocTable) - 1)/3
curveIds <- 1:numCurves
xColumns <- curveIds * 3
yColumns <- (curveIds - 1) * 3 + 2
labelColumns <- (0:(numCurves - 1)) * 3 + 1

# The x-axis ends at the largest x value of any curve.
xTop <- max(rocTable[, xColumns])

pdf(paste(csvPath, ".variantROCCurve.pdf", sep = ""), height = 7, width = 7)

par(cex = 1.3)
# Empty frame first; the curves are added one by one below.
plot(rocTable$specificity1, rocTable$sensitivity1, type = "n",
     xlim = c(0, xTop), ylim = c(0, 1),
     xlab = "1 - Specificity", ylab = "Sensitivity")
for ( k in curveIds ) {
    points(rocTable[, xColumns[k]], rocTable[, yColumns[k]], lwd = 3, type = "l", col = k)
}
legend("bottomright", names(rocTable)[labelColumns], col = 1:numCurves, lwd = 3)
dev.off()
|
||||
|
|
@ -1,97 +0,0 @@
|
|||
#!/bin/env Rscript
|
||||
|
||||
args <- commandArgs(TRUE)
|
||||
fileToRead <- args[1]
|
||||
functionToRun <- args[2]
|
||||
functionSpecificArgs <- args[3]
|
||||
|
||||
## load the function to run
|
||||
|
||||
# The condition previously read the misspelled, undefined name "funtionToRun",
# which stopped the script with an "object not found" error before either
# plotting branch could run.
if ( functionToRun == "PlotInterleavedRows" ) {
    ### PLOT INTERLEAVED ROWS FUNCTION ###
    # - expects a file of the form
    #
    #   sample_a \t 0.8 \t 0.6 \t 0.5
    #   sample_a \t 0   \t 1   \t 3
    #   sample_b \t 0.5 \t 0.3 \t 0.1
    #   sample_b \t 1   \t 2   \t 4
    #
    #   i.e. rows come in pairs: the first row of a pair holds the function
    #   values, the second the points they were evaluated at
    #
    # and an argument string
    #   x_label;y_label;plot_title;base_name_for_pdf
    #
    # Writes "<base_name_for_pdf>_rplot.pdf".
    ### PLOT INTERLEAVED ROWS FUNCTION ###
    PlotInterleavedRows <- function(inFile,args) {
        arglist = unlist(strsplit(args,";"))
        xlabel = arglist[1]
        ylabel = arglist[2]
        title = arglist[3]
        outFileBase = arglist[4]

        allPoints <- as.matrix(read.table(inFile))
        # set up colors
        colors = rainbow(ncol(allPoints)-1,s=0.8,v=0.8,gamma=0.6,start=0.0,end=0.9)
        styles = c(rep(1,ncol(allPoints)-1))
        evalPoints = matrix(nrow=nrow(allPoints)/2,ncol=ncol(allPoints))
        funcVal = matrix(nrow=nrow(allPoints)/2,ncol=ncol(allPoints))
        # convert to two matrices by de-interleaving and transposing:
        # even rows are the evaluation points, odd rows the function values
        for ( i in 1:(nrow(allPoints)/2) ) {
            evalPoints[i,] <- allPoints[2*i,]
            funcVal[i,] <- allPoints[2*i-1,]
        }

        evalPoints <- t(evalPoints)
        funcVal <- t(funcVal)
        # plot and put legend on; the legend text is the first row of the
        # transposed value matrix, i.e. the first column of the input file
        pdf(paste(outFileBase,"_rplot",".pdf",sep=""))
        matplot(evalPoints,funcVal,col=colors,lty=styles,"l",xlab=xlabel,ylab=ylabel)
        legend("topright",funcVal[1,],lty=styles,col=colors)
        title(main=title,outer=TRUE)
        # save
        dev.off()
    }

    PlotInterleavedRows(fileToRead,functionSpecificArgs)

}
|
||||
|
||||
if ( functionToRun == "PlotHeatmap" ) {
    ### PLOT HEATMAP FUNCTION ###
    #
    # "Heatmap" here means a plain image of the matrix: heatmap() is called
    # with Rowv=NA and Colv=NA, so rows and columns are not re-ordered and no
    # dendrograms are drawn.
    #
    # - expects a file of the form
    #
    #   rentry1 \t rentry2 \t rentry3 \t ...
    #   colentry1 \t 0.7 \t 0.9 \t 0.4 \t ...
    #   colentry2 \t 0.8 \t 0.7 \t 0.6 \t ...
    #   ...
    #   (the header row is one entry shorter than the data rows; read.table
    #   copes with that)
    #
    # - and an argument string with five ';'-separated fields:
    #   row_label;column_label;data_rescale_factor;plot_title;base_name_for_pdf
    #
    # Writes "<base_name_for_pdf>_rplot.pdf".
    ### PLOT HEATMAP FUNCTION ###
    PlotHeatmap <- function(inFile,args) {
        fields <- unlist(strsplit(args, split = ";"))
        rowLabel <- fields[1]
        columnLabel <- fields[2]
        # Parsed from the third field but not applied anywhere below.
        rescaleFactor <- as.numeric(fields[3])
        plotTitle <- fields[4]
        pdfBase <- fields[5]

        heatValues <- as.matrix(read.table(inFile))

        ## Reversed rainbow so that the "cool" colors cover the low end:
        ## e.g. red ~ near 1, yellow ~ near .75, green ~ near .5,
        ## teal ~ near .25, blue ~ near 0
        heatColors <- rev(rainbow(32, start = 0, end = 0.6, s = 0.9, v = 0.9, gamma = 0.8))

        pdf(paste(pdfBase, "_rplot", ".pdf", sep = ""))
        heatmap(heatValues, Rowv = NA, Colv = NA,
                ylab = rowLabel, xlab = columnLabel, col = heatColors)
        title(main = plotTitle, outer = TRUE)
        dev.off()
    }

    PlotHeatmap(fileToRead,functionSpecificArgs)

}
|
||||
|
|
@ -1,61 +0,0 @@
|
|||
# Number of allele-count bins kept by normHist.
MAX_AC = 10000

# Normalised histogram of the true.ac column.
#
# Args:
#   d: table with a true.ac column (values must lie within 1..20000, the
#      range of the unit-width breaks).
#   m: not used.
# Returns:
#   Vector of length MAX_AC: the counts of the first MAX_AC unit-width bins
#   divided by their sum.
normHist <- function(d, m) {
    binned <- hist(d$true.ac, breaks=1:20000, plot=F)
    kept <- binned$counts[1:MAX_AC]
    total <- sum(kept)
    kept / total
}
|
||||
|
||||
# Plot, for each requested sub-population allele count, the normalised
# distribution of true allele counts in the full population.
#
# Args:
#   d: table with columns small.ac, true.ac and true.an.
#   acs: vector of sub-population allele counts; one curve is drawn per entry.
#
# Draws on the current graphics device (log-scaled x-axis) and adds a legend.
f <- function(d, acs) {
    cols = rainbow(length(acs), alpha=0.75)
    # Subset the table that was passed in. The body used to read the global
    # "afs" here, silently ignoring the d argument.
    y = normHist(subset(d, small.ac == acs[1]))
    x = 1:length(y) / max(d$true.an)
    plot(x, y, type="l", col=cols[1], xlab="True MAF in full population", ylab="Frequency", lwd=3, log="x")
    # seq_along(acs)[-1] is empty for a single AC, whereas 2:length(acs)
    # would count down through 2 and 1.
    for (i in seq_along(acs)[-1]) {
        points(x, normHist(subset(d, small.ac == acs[i])), type="l", col=cols[i], lwd=3)
    }

    legend("topright", legend=paste("AC =", acs), fill=cols, title="Sub-population")
}
|
||||
|
||||
# Expected, normalised distribution over population allele frequencies.
#
# Args:
#   maxAN: number of frequency points; point k is k / maxAN.
#   N: enters the density through the factor 2 * N and the exponent 2 * N - 1.
#   eps: small offset used inside the density (1 - eps appears in both the
#        coefficient and the base of the power).
#   ac1scale: when TRUE, extra mass is added to the first point (see below).
# Returns:
#   list(ps = frequency points, pr = density at those points, normalised to
#   sum to 1 before any ac1scale adjustment).
expected <- function(maxAN, N, eps, ac1scale = F) {
    # Resolution multiplier of the finer grid used for the ac1scale adjustment.
    refinement = 10

    # Un-normalised density at the frequencies p.
    density <- function(p, N) {
        coefficient = 2 * N / ( 1 - eps )
        coefficient * ((1 - p)/(1-eps))^(2 * N - 1)
    }

    # These are the points that are reported. The first one is treated
    # specially below because the equation describes an infinite population,
    # not a discrete one with a maximum number of chromosomes.
    freqs = 1:maxAN / maxAN
    probs = density(freqs, N)
    probs = probs / sum(probs)

    if ( ac1scale ) {
        # Re-evaluate on a grid `refinement` times finer and add the share of
        # its mass that lies in the first `refinement` points to the first
        # reported point.
        fineFreqs = seq(1, maxAN*refinement) / (maxAN * refinement)
        fineDensity = density(fineFreqs, N)
        massBelowFirst = sum(fineDensity[1:refinement] / sum(fineDensity))
        probs[1] = probs[1] + massBelowFirst
    }

    list(ps = freqs, pr = probs)
}
|
||||
|
||||
f(afs, c(1,2,3,5,10,50))
|
||||
|
||||
if ( F ) {
|
||||
scale = 100
|
||||
ex1 = expected(200000, 1000, 1e-8)
|
||||
ex2 = expected(200000*scale, 1000, 1e-8)
|
||||
i = 1:(200000*scale) %% scale == 1
|
||||
plot(ex2$ps[i], cumsum(ex1$pr), type="l",lty=3,lwd=3, log="x", col="red")
|
||||
points(ex2$ps[i], cumsum(ex2$pr)[i], type="l",lty=3,lwd=3, log="x")
|
||||
}
|
||||
|
||||
ex = expected(200000, 1000, 1e-8, T)
|
||||
points(ex$ps, ex$pr, type="l",lty=3,lwd=3)
|
||||
|
||||
|
|
@ -1,10 +0,0 @@
|
|||
Package: gsalib
|
||||
Type: Package
|
||||
Title: Utility functions
|
||||
Version: 1.0
|
||||
Date: 2010-10-02
|
||||
Author: Kiran Garimella
|
||||
Maintainer: Kiran Garimella <kiran@broadinstitute.org>
|
||||
Description: Utility functions for GATK NGS analyses
|
||||
License: BSD
|
||||
LazyLoad: yes
|
||||
|
|
@ -1,12 +0,0 @@
|
|||
# Report a fatal error and stop.
#
# Emits a banner containing the error text through gsa.message(), prints
# traceback(), and then raises the error with stop() (without the call).
#
# Args:
#   message: text of the error.
gsa.error <- function(message) {
    banner <- "Error: **********"
    detail <- sprintf("Error: %s", message)

    # The parameter shadows the name "message", but R still resolves
    # message(...) to the base function because the parameter is not a function.
    message("")
    gsa.message(banner)
    gsa.message(detail)
    gsa.message(banner)
    message("")

    traceback()

    message("")
    stop(message, call. = FALSE)
}
|
||||
|
|
@ -1,116 +0,0 @@
|
|||
# Print a usage statement for a script that uses gsa.getargs(), then halt.
#
#   argspec - named list of argument specifications; each entry is either a
#             list with 'value' (the default) and 'doc' elements, or a bare
#             default value
#   doc     - one-line description of the script, or NA for none
#
# Does not return: ends with stop(call. = FALSE).
.gsa.getargs.usage <- function(argspec, doc) {
    cargs = commandArgs();

    usage = "Usage:";

    # Rscript passes the script path as --file=<path>; use it as the program name
    fileIndex = grep("--file=", cargs);
    if (length(fileIndex) > 0) {
        progname = gsub("--file=", "", cargs[fileIndex[1]]);

        usage = sprintf("Usage: Rscript %s [arguments]", progname);

        if (!is.na(doc)) {
            message(sprintf("%s: %s\n", progname, doc));
        }
    }

    message(usage);

    for (argname in names(argspec)) {
        spec = argspec[[argname]];

        if (is.list(spec)) {
            defaultValue = spec$value;
            argdoc = spec$doc;
        } else {
            # a bare value is its own default, matching what gsa.getargs() does
            defaultValue = spec;
            argdoc = "";
        }

        if (is.null(argdoc)) {
            argdoc = "";
        }

        # collapse so that NULL or vector defaults still give exactly one line
        defaultText = paste(as.character(defaultValue), collapse=",");

        message(sprintf(" -%-10s\t[default: %s]\t%s", argname, defaultText, argdoc));
    }

    message("");

    stop(call. = FALSE);
}
|
||||
|
||||
# Collect script arguments, either from the command line or interactively.
#
#   argspec - named list; each entry is either list(value=<default>, doc=<text>)
#             or a bare default value. A default of NA marks the argument as
#             required.
#   doc     - optional one-line description of the script, shown in the usage
#
# Interactive sessions: a required argument is taken from an existing `cmdargs`
# object if one is visible, otherwise the user is prompted for it.
# Command-line mode: arguments are read as "-key value" pairs; with no
# arguments, or with -h / -help, the usage is printed and execution stops.
#
# Returns a list of argument values keyed by argument name. Calls gsa.error()
# if any required argument is still unset.
gsa.getargs <- function(argspec, doc = NA) {
    # TRUE when a value counts as "not supplied": NULL, zero-length, or a single NA.
    # Uses || and length checks so NULL and vector values never reach if() as a
    # zero-length or multi-element condition.
    isUnset = function(v) {
        is.null(v) || length(v) == 0 || (length(v) == 1 && is.na(v));
    }

    argsenv = new.env();

    # start from the defaults given in argspec
    for (argname in names(argspec)) {
        spec = argspec[[argname]];

        if (is.list(spec)) {
            assign(argname, spec$value, envir=argsenv);
        } else {
            assign(argname, spec, envir=argsenv);
        }
    }

    if (interactive()) {
        for (argname in names(argspec)) {
            if (isUnset(get(argname, envir=argsenv))) {
                if (exists("cmdargs")) {
                    assign(argname, cmdargs[[argname]], envir=argsenv);
                } else {
                    assign(argname, readline(sprintf("Please enter a value for '%s': ", argname)), envir=argsenv);
                }
            }
        }
    } else {
        cargs = commandArgs(TRUE);

        if (length(cargs) == 0) {
            .gsa.getargs.usage(argspec, doc);
        }

        i = 1;
        while (i <= length(cargs)) {
            if (length(grep("^-", cargs[i], ignore.case=TRUE)) > 0) {
                key = gsub("-", "", cargs[i]);
                value = cargs[i+1];

                if (key == "h" | key == "help") {
                    .gsa.getargs.usage(argspec, doc);
                }

                if (length(grep("^[\\d\\.e\\+\\-]+$", value, perl=TRUE, ignore.case=TRUE)) > 0) {
                    # Only treat the token as a number if it really converts;
                    # strings such as "e" or "-" match the pattern but are not numbers.
                    numericValue = suppressWarnings(as.numeric(value));

                    if (!is.na(numericValue)) {
                        value = numericValue;

                        # The value has been consumed: step over it so that a
                        # negative number is not re-read as a "-key" next time round.
                        i = i + 1;
                    }
                }

                assign(key, value, envir=argsenv);
            }

            i = i + 1;
        }
    }

    args = as.list(argsenv);

    missingArgs = c();

    for (arg in names(argspec)) {
        if (isUnset(args[[arg]])) {
            gsa.warn(sprintf("Value for required argument '-%s' was not specified", arg));

            missingArgs = c(missingArgs, arg);
        }
    }

    if (length(missingArgs) > 0) {
        gsa.error(
            paste(
                "Missing required arguments: -",
                paste(missingArgs, collapse=" -"),
                ". Specify -h or -help to this script for a list of available arguments.",
                sep=""
            )
        );
    }

    args;
}
|
||||
|
|
@ -1,3 +0,0 @@
|
|||
# Emit a diagnostic message prefixed with "[gsalib] ".
#
#   message - the text to emit
gsa.message <- function(message) {
    tagged = sprintf("[gsalib] %s", message);

    message(tagged);
}
|
||||
|
|
@ -1,50 +0,0 @@
|
|||
# Plot a proportional two- or three-way Venn diagram.
#
# The diagram is generated by the Google Chart API (downloaded with wget into a
# temporary PNG) and then drawn into a fresh, empty plotting frame.
#
#   a, b, c                   - sizes of the three circles (c = 0 for a two-way venn)
#   a_and_b, a_and_c, b_and_c - sizes of the pairwise overlaps
#   col                       - colours of the circles
#   pos                       - c(xleft, ybottom, xright, ytop) of the image in
#                               the unit plot frame
#   debug                     - if 1, print the download command
#
# Called for its side effect (the plot); returns NULL invisibly.
gsa.plot.venn <-
function(a, b, c=0, a_and_b, a_and_c=0, b_and_c=0,
         col=c("#FF6342", "#63C6DE", "#ADDE63"),
         pos=c(0.20, 0.20, 0.80, 0.82),
         debug=0
        ) {
    library(png);
    library(graphics);

    # Set up properties: the chart API wants colours as bare RRGGBB hex strings
    for (i in seq_along(col)) {
        rgbcol = col2rgb(col[i]);
        col[i] = sprintf("%02X%02X%02X", rgbcol[1], rgbcol[2], rgbcol[3]);
    }

    chco = paste(col[1], col[2], col[3], sep=",");
    chd = paste(a, b, c, a_and_b, a_and_c, b_and_c, sep=",");

    props = c(
        'cht=v',
        'chs=525x525',
        'chds=0,10000000000',
        paste('chco=', chco, sep=""),
        paste('chd=t:', chd, sep="")
    );
    proplist = paste(props[1], props[2], props[3], props[4], props[5], sep='&');

    # Get the venn diagram (as a temporary file); remove it even if a later step fails
    filename = tempfile("venn");
    on.exit(unlink(filename), add=TRUE);

    cmd = paste("wget -O ", filename, " 'http://chart.apis.google.com/chart?", proplist, "' > /dev/null 2>&1", sep="");

    if (debug == 1) {
        print(cmd);
    }
    system(cmd);

    # Render the temp png file into a plotting frame. The image gets its own
    # variable so the `a` argument is still the circle size in the test below.
    vennImage = readPNG(filename);

    plot(0, 0, type="n", xaxt="n", yaxt="n", bty="n", xlim=c(0, 1), ylim=c(0, 1), xlab="", ylab="");
    if (c == 0 || a >= b) {
        rasterImage(vennImage, pos[1], pos[2], pos[3], pos[4]);
    } else {
        # three-way venn with a < b: draw the image rotated by 180 degrees
        rasterImage(vennImage, 0.37+pos[1], 0.37+pos[2], 0.37+pos[3], 0.37+pos[4], angle=180);
    }

    invisible(NULL);
}
|
||||
|
||||
|
|
@ -1,83 +0,0 @@
|
|||
# Read a CSV file (with a header row, '#' starting a comment) if it exists and
# is larger than 500 bytes.
#
#   filename - path of the CSV file
#
# Returns the data frame from read.csv(), or NA when the file is absent or
# not larger than 500 bytes.
.gsa.attemptToLoadFile <- function(filename) {
    isLoadable = file.exists(filename) & file.info(filename)$size > 500;

    if (!isLoadable) {
        return(NA);
    }

    read.csv(filename, header=TRUE, comment.char="#");
}
|
||||
|
||||
# Read the set of VariantEval CSV tables that share the path prefix `evalRoot`.
#
#   evalRoot - path prefix; each table is read from evalRoot + suffix, e.g.
#              paste(evalRoot, ".Count_Variants.csv", sep="")
#
# Returns a list with one element per table (a data frame, or NA when
# .gsa.attemptToLoadFile() did not load the file) plus three name vectors
# derived from the jexl_expression column of the Ti/Tv table: CallsetNames,
# CallsetOnlyNames and CallsetFilteredNames. If the Ti/Tv table could not be
# loaded, those three are left as NULL instead of raising an error.
gsa.read.eval <-
function(evalRoot) {
    # list element name -> file suffix, in the order of the returned list
    tableSuffixes = c(
        AlleleCountStats                = ".AlleleCountStats.csv",
        CompOverlap                     = ".Comp_Overlap.csv",
        CountVariants                   = ".Count_Variants.csv",
        GenotypeConcordance             = ".Genotype_Concordance.csv",
        MetricsByAc                     = ".MetricsByAc.csv",
        MetricsBySample                 = ".MetricsBySample.csv",
        Quality_Metrics_by_allele_count = ".Quality_Metrics_by_allele_count.csv",
        QualityScoreHistogram           = ".QualityScoreHistogram.csv",
        SampleStatistics                = ".Sample_Statistics.csv",
        SampleSummaryStatistics         = ".Sample_Summary_Statistics.csv",
        SimpleMetricsBySample           = ".SimpleMetricsBySample.csv",
        TiTv                            = ".Ti_slash_Tv_Variant_Evaluator.csv",
        TiTvStats                       = ".TiTvStats.csv",
        Variant_Quality_Score           = ".Variant_Quality_Score.csv"
    );

    eval = lapply(tableSuffixes, function(suffix) {
        .gsa.attemptToLoadFile(paste(evalRoot, suffix, sep=""));
    });

    eval = c(eval, list(CallsetNames = NULL, CallsetOnlyNames = NULL, CallsetFilteredNames = NULL));

    # Without the Ti/Tv table there are no jexl expressions to derive names
    # from; `$` on the NA placeholder would be an error.
    if (!is.data.frame(eval$TiTv)) {
        return(eval);
    }

    uniqueJexlExpressions = unique(eval$TiTv$jexl_expression);

    eval$CallsetOnlyNames = as.vector(uniqueJexlExpressions[grep("FilteredIn|Intersection|none", uniqueJexlExpressions, invert=TRUE, ignore.case=TRUE)]);
    eval$CallsetNames = as.vector(gsub("-only", "", eval$CallsetOnlyNames));

    # "name" -> "InName": prefix "In" and upper-case the first character
    inName = function(name) {
        gsub("^(\\w)", "In\\U\\1", name, perl=TRUE);
    }

    first = eval$CallsetNames[1];
    second = eval$CallsetNames[2];

    eval$CallsetFilteredNames = as.vector(c(
        paste(inName(first), "-Filtered", inName(second), sep=""),
        paste(inName(second), "-Filtered", inName(first), sep="")
    ));

    # Fall back to the un-capitalised spelling when the capitalised one is not
    # among the expressions actually present in the table.
    if (!(eval$CallsetFilteredNames[1] %in% uniqueJexlExpressions)) {
        eval$CallsetFilteredNames[1] = paste("In", first, "-FilteredIn", second, sep="");
    }

    if (!(eval$CallsetFilteredNames[2] %in% uniqueJexlExpressions)) {
        eval$CallsetFilteredNames[2] = paste("In", second, "-FilteredIn", first, sep="");
    }

    eval;
}
|
||||
|
||||
|
|
@ -1,64 +0,0 @@
|
|||
# Load a table into the specified environment. Make sure that each new table
# gets a unique name (this allows one to cat a bunch of tables with the same
# name together and load them into R without each table overwriting the last).
#
#   tableName   - name of the table
#   tableHeader - character vector of column names
#   tableRows   - character matrix with one row per table row, or NULL for a
#                 table that has a header but no rows
#   tableEnv    - environment the data frame is assigned into
#
# Columns whose every value converts to a number are stored as numeric. The
# first table with a given name keeps that name; later ones are stored as
# "<name>.1", "<name>.2", ...
.gsa.assignGATKTableToEnvironment <- function(tableName, tableHeader, tableRows, tableEnv) {
    # a header-only table becomes a zero-row frame with the right columns
    if (is.null(tableRows)) {
        tableRows = matrix(character(0), nrow=0, ncol=length(tableHeader));
    }

    d = data.frame(tableRows, row.names=NULL, stringsAsFactors=FALSE);

    if (ncol(d) > 0) {
        colnames(d) = tableHeader;
    }

    for (i in seq_len(ncol(d))) {
        v = suppressWarnings(as.numeric(d[,i]));

        if (!any(is.na(v))) {
            d[,i] = v;
        }
    }

    # Count tables already stored under exactly this name or "<name>.<number>".
    # Literal comparison is used so that e.g. "Count" does not collide with
    # "CountVariants" and regex metacharacters in the name are harmless.
    existing = ls(envir=tableEnv);
    prefix = paste(tableName, ".", sep="");
    hasPrefix = substr(existing, 1, nchar(prefix)) == prefix;
    hasNumericSuffix = grepl("^[0-9]+$", substring(existing, nchar(prefix) + 1));
    numUsed = sum(existing == tableName | (hasPrefix & hasNumericSuffix));

    if (numUsed > 0) {
        tableName = paste(tableName, ".", numUsed, sep="");
    }

    assign(tableName, d, envir=tableEnv);
}
|
||||
|
||||
# Load all GATKReport tables from a file.
#
#   filename - path to the GATKReport file
#
# A line starting with "##:GATKReport.v0.1" opens a new table, whose name is
# the second whitespace-separated field of that line. The first following
# non-blank, non-comment line is the column header and each later one is a
# row. Blank lines and other lines starting with '#' are ignored.
#
# Returns (invisibly) a list of data frames keyed by table name.
gsa.read.gatkreport <- function(filename) {
    # readLines() on a path opens and closes the connection itself, so nothing
    # is left open if reading fails
    lines = readLines(filename);

    tableEnv = new.env();

    tableName = NA;
    tableHeader = c();
    tableRows = c();

    for (line in lines) {
        if (length(grep("^##:GATKReport\\.v0\\.1[[:space:]]+", line, ignore.case=TRUE)) > 0) {
            headerFields = unlist(strsplit(line, "[[:space:]]+"));

            # a new table starts: store the one collected so far
            if (!is.na(tableName)) {
                .gsa.assignGATKTableToEnvironment(tableName, tableHeader, tableRows, tableEnv);
            }

            tableName = headerFields[2];
            tableHeader = c();
            tableRows = c();
        } else if (length(grep("^[[:space:]]*$", line)) > 0 || length(grep("^[[:space:]]*#", line)) > 0) {
            # do nothing
        } else if (!is.na(tableName)) {
            row = unlist(strsplit(line, "[[:space:]]+"));

            if (length(tableHeader) == 0) {
                tableHeader = row;
            } else {
                tableRows = rbind(tableRows, row);
            }
        }
    }

    # store the final table
    if (!is.na(tableName)) {
        .gsa.assignGATKTableToEnvironment(tableName, tableHeader, tableRows, tableEnv);
    }

    invisible(as.list(tableEnv));
}
|
||||
|
|
@ -1,28 +0,0 @@
|
|||
# Read the metrics of one sequencing project from the reporting database.
#
#   project - project identifier, matched against the "Project" column
#   bylane  - if TRUE query ILLUMINA_PICARD_METRICS, otherwise (default)
#             ILLUMINA_SAMPLE_STATUS_AGG
#
# Returns the fetched rows, restricted to those whose Project equals `project`.
#
# NOTE(review): the connection string, including credentials, is hard-coded
# here; consider moving it to configuration.
gsa.read.squidmetrics = function(project, bylane = FALSE) {
    suppressMessages(library(ROracle));

    drv = dbDriver("Oracle");
    # release the driver even when connecting or querying fails
    on.exit(oraCloseDriver(drv), add=TRUE);

    con = dbConnect(drv, "REPORTING/REPORTING@ora01:1521/SEQPROD");

    if (bylane) {
        tableName = "ILLUMINA_PICARD_METRICS";
    } else {
        tableName = "ILLUMINA_SAMPLE_STATUS_AGG";
    }

    # Double any single quote so the project name cannot terminate the SQL
    # string literal it is embedded in.
    quotedProject = gsub("'", "''", project, fixed=TRUE);

    statement = paste("SELECT * FROM ", tableName, " WHERE \"Project\" = '", quotedProject, "'", sep="");
    print(statement);

    rs = dbSendQuery(con, statement = statement);
    d = fetch(rs, n=-1);
    dbClearResult(rs);

    subset(d, Project == project);
}
|
||||
|
|
@ -1,3 +0,0 @@
|
|||
# Emit a warning through gsa.message(), prefixed with "Warning: ".
#
#   message - the warning text
gsa.warn <- function(message) {
    warningText = sprintf("Warning: %s", message);

    gsa.message(warningText);
}
|
||||
|
|
@ -1,9 +0,0 @@
|
|||
* Edit the help file skeletons in 'man', possibly combining help files
|
||||
for multiple functions.
|
||||
* Put any C/C++/Fortran code in 'src'.
|
||||
* If you have compiled code, add a .First.lib() function in 'R' to load
|
||||
the shared library.
|
||||
* Run R CMD build to build the package tarball.
|
||||
* Run R CMD check to check the package tarball.
|
||||
|
||||
Read "Writing R Extensions" for more information.
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 49 KiB |
|
|
@ -1,49 +0,0 @@
|
|||
\name{gsa.error}
|
||||
\alias{gsa.error}
|
||||
\title{
|
||||
GSA error
|
||||
}
|
||||
\description{
|
||||
Write an error message to standard error with the prefix '[gsalib] Error:', print a traceback, and halt execution by raising an R error.
|
||||
}
|
||||
\usage{
|
||||
gsa.error(message)
|
||||
}
|
||||
%- maybe also 'usage' for other objects documented here.
|
||||
\arguments{
|
||||
\item{message}{
|
||||
The error message to write.
|
||||
}
|
||||
}
|
||||
\details{
|
||||
%% ~~ If necessary, more details than the description above ~~
|
||||
}
|
||||
\value{
|
||||
%% ~Describe the value returned
|
||||
%% If it is a LIST, use
|
||||
%% \item{comp1 }{Description of 'comp1'}
|
||||
%% \item{comp2 }{Description of 'comp2'}
|
||||
%% ...
|
||||
}
|
||||
\references{
|
||||
%% ~put references to the literature/web site here ~
|
||||
}
|
||||
\author{
|
||||
Kiran Garimella
|
||||
}
|
||||
\note{
|
||||
%% ~~further notes~~
|
||||
}
|
||||
|
||||
%% ~Make other sections like Warning with \section{Warning }{....} ~
|
||||
|
||||
\seealso{
|
||||
%% ~~objects to See Also as \code{\link{help}}, ~~~
|
||||
}
|
||||
\examples{
|
||||
gsa.error("This is a message");
|
||||
}
|
||||
% Add one or more standard keywords, see file 'KEYWORDS' in the
|
||||
% R documentation directory.
|
||||
\keyword{ ~kwd1 }
|
||||
\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line
|
||||
|
|
@ -1,57 +0,0 @@
|
|||
\name{gsa.getargs}
|
||||
\alias{gsa.getargs}
|
||||
\title{
|
||||
Get script arguments
|
||||
}
|
||||
\description{
|
||||
Get script arguments given a list object specifying arguments and documentation. Can be used in command-line or interactive mode. This is helpful when developing scripts in interactive mode that will eventually become command-line programs. If no arguments are specified or help is requested in command-line mode, the script will print out a usage statement with available arguments and exit.
|
||||
}
|
||||
\usage{
|
||||
gsa.getargs(argspec, doc = NA)
|
||||
}
|
||||
\arguments{
|
||||
\item{argspec}{
|
||||
A list object. Each key is an argument name. The value is another list object with 'value' and 'doc' keys. For example:
|
||||
\preformatted{argspec = list(
|
||||
arg1 = list(value=10, doc="Info for optional arg1"),
|
||||
arg2 = list(value=NA, doc="Info for required arg2")
|
||||
);
|
||||
}
|
||||
|
||||
If the value provided is NA, the argument is considered required and must be specified when the script is invoked. For command-line mode, this means the argument must be specified on the command-line. In interactive mode, there are two ways of specifying these arguments. First, if a properly formatted list argument called 'cmdargs' is present in the current environment (i.e. the object returned by gsa.getargs() from a previous invocation), the value is taken from this object. Otherwise, the argument is prompted for.
|
||||
}
|
||||
|
||||
\item{doc}{
|
||||
An optional string succinctly documenting the purpose of the script.
|
||||
}
|
||||
}
|
||||
\details{
|
||||
Interactive scripts typically make use of hardcoded filepaths and parameter settings. This makes testing easy, but generalization to non-interactive mode more difficult. This utility provides a mechanism for writing scripts that work properly in both interactive and command-line modes.
|
||||
|
||||
To use this method, specify a list with key-value pairs representing the arguments as specified above. In command-line mode, if no arguments are specified or the user specifies '-h' or '-help' anywhere on the command string, a help message listing the available arguments, their default values, and their documentation is printed.
|
||||
}
|
||||
\value{
|
||||
Returns a list with keys matching the argspec and values representing the specified arguments.
|
||||
|
||||
\item{arg1 }{Value for argument 1}
|
||||
\item{arg2 }{Value for argument 2}
|
||||
...etc.
|
||||
}
|
||||
\references{
|
||||
%% ~put references to the literature/web site here ~
|
||||
}
|
||||
\author{
|
||||
Kiran Garimella
|
||||
}
|
||||
\examples{
|
||||
argspec = list(
|
||||
file = list(value="/my/test.vcf", doc="VCF file"),
|
||||
verbose = list(value=0, doc="If 1, set verbose mode"),
|
||||
test2 = list(value=2.3e9, doc="Another argument that does stuff")
|
||||
);
|
||||
|
||||
cmdargs = gsa.getargs(argspec, doc="My test program");
|
||||
|
||||
print(cmdargs$file); # will print '[1] "/my/test.vcf"'
|
||||
}
|
||||
\keyword{ ~kwd1 }
|
||||
|
|
@ -1,44 +0,0 @@
|
|||
\name{gsa.message}
|
||||
\alias{gsa.message}
|
||||
\title{
|
||||
GSA message
|
||||
}
|
||||
\description{
|
||||
Write a message to standard error with the prefix '[gsalib]'.
|
||||
}
|
||||
\usage{
|
||||
gsa.message(message)
|
||||
}
|
||||
\arguments{
|
||||
\item{message}{
|
||||
The message to write.
|
||||
}
|
||||
}
|
||||
\details{
|
||||
%% ~~ If necessary, more details than the description above ~~
|
||||
}
|
||||
\value{
|
||||
%% ~Describe the value returned
|
||||
%% If it is a LIST, use
|
||||
%% \item{comp1 }{Description of 'comp1'}
|
||||
%% \item{comp2 }{Description of 'comp2'}
|
||||
%% ...
|
||||
}
|
||||
\references{
|
||||
%% ~put references to the literature/web site here ~
|
||||
}
|
||||
\author{
|
||||
Kiran Garimella
|
||||
}
|
||||
\note{
|
||||
%% ~~further notes~~
|
||||
}
|
||||
|
||||
\seealso{
|
||||
%% ~~objects to See Also as \code{\link{help}}, ~~~
|
||||
}
|
||||
\examples{
|
||||
## Write message to stderr
|
||||
gsa.message("This is a message");
|
||||
}
|
||||
\keyword{ ~kwd1 }
|
||||
|
|
@ -1,75 +0,0 @@
|
|||
\name{gsa.plot.venn}
|
||||
\alias{gsa.plot.venn}
|
||||
\title{
|
||||
Plot a proportional venn diagram
|
||||
}
|
||||
\description{
|
||||
Plot a proportional venn diagram (two or three-way venns allowed)
|
||||
}
|
||||
\usage{
|
||||
gsa.plot.venn(a, b, c = 0, a_and_b, a_and_c = 0, b_and_c = 0, col = c("#FF6342", "#63C6DE", "#ADDE63"), pos = c(0.2, 0.2, 0.8, 0.82), debug = 0)
|
||||
}
|
||||
\arguments{
|
||||
\item{a}{
|
||||
size of 'a' circle
|
||||
}
|
||||
\item{b}{
|
||||
size of 'b' circle
|
||||
}
|
||||
\item{c}{
|
||||
size of 'c' circle
|
||||
}
|
||||
\item{a_and_b}{
|
||||
size of a and b overlap
|
||||
}
|
||||
\item{a_and_c}{
|
||||
size of a and c overlap
|
||||
}
|
||||
\item{b_and_c}{
|
||||
size of b and c overlap
|
||||
}
|
||||
\item{col}{
|
||||
vector of colors for each venn piece
|
||||
}
|
||||
\item{pos}{
|
||||
vector of positional elements
|
||||
}
|
||||
\item{debug}{
|
||||
if 1, set debug mode and print useful information
|
||||
}
|
||||
}
|
||||
\details{
|
||||
Plots a two-way or three-way proportional Venn diagram. Internally, this method uses the Google Chart API to generate the diagram, then renders it into the plot window where it can be annotated in interesting ways.
|
||||
}
|
||||
\value{
|
||||
%% ~Describe the value returned
|
||||
%% If it is a LIST, use
|
||||
%% \item{comp1 }{Description of 'comp1'}
|
||||
%% \item{comp2 }{Description of 'comp2'}
|
||||
%% ...
|
||||
}
|
||||
\references{
|
||||
}
|
||||
\author{
|
||||
Kiran Garimella
|
||||
}
|
||||
\note{
|
||||
%% ~~further notes~~
|
||||
}
|
||||
|
||||
%% ~Make other sections like Warning with \section{Warning }{....} ~
|
||||
|
||||
\seealso{
|
||||
%% ~~objects to See Also as \code{\link{help}}, ~~~
|
||||
}
|
||||
\examples{
|
||||
## Plot a two-way Venn diagram
|
||||
gsa.plot.venn(1000, 750, 0, 400);
|
||||
|
||||
## Plot a three-way Venn diagram
|
||||
gsa.plot.venn(1000, 750, 900, 400, 650, 500);
|
||||
}
|
||||
% Add one or more standard keywords, see file 'KEYWORDS' in the
|
||||
% R documentation directory.
|
||||
\keyword{ ~kwd1 }
|
||||
\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line
|
||||
|
|
@ -1,111 +0,0 @@
|
|||
\name{gsa.read.eval}
|
||||
\alias{gsa.read.eval}
|
||||
\title{
|
||||
Read a VariantEval file
|
||||
}
|
||||
\description{
|
||||
Read a VariantEval file that's output in R format.
|
||||
}
|
||||
\usage{
|
||||
gsa.read.eval(evalRoot)
|
||||
}
|
||||
%- maybe also 'usage' for other objects documented here.
|
||||
\arguments{
|
||||
\item{evalRoot}{
|
||||
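Path prefix shared by the VariantEval CSV output files. Each table is read from the file named by appending a suffix such as \code{.Count_Variants.csv} to this prefix.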
|
||||
}
|
||||
}
|
||||
\details{
|
||||
%% ~~ If necessary, more details than the description above ~~
|
||||
}
|
||||
\value{
|
||||
%% ~Describe the value returned
|
||||
%% If it is a LIST, use
|
||||
%% \item{comp1 }{Description of 'comp1'}
|
||||
%% \item{comp2 }{Description of 'comp2'}
|
||||
%% ...
|
||||
}
|
||||
\references{
|
||||
%% ~put references to the literature/web site here ~
|
||||
}
|
||||
\author{
|
||||
%% ~~who you are~~
|
||||
}
|
||||
\note{
|
||||
%% ~~further notes~~
|
||||
}
|
||||
|
||||
%% ~Make other sections like Warning with \section{Warning }{....} ~
|
||||
|
||||
\seealso{
|
||||
%% ~~objects to See Also as \code{\link{help}}, ~~~
|
||||
}
|
||||
\examples{
|
||||
##---- Should be DIRECTLY executable !! ----
|
||||
##-- ==> Define data, use random,
|
||||
##-- or do help(data=index) for the standard data sets.
|
||||
|
||||
## The function is currently defined as
|
||||
function(evalRoot) {
|
||||
fileAlleleCountStats = paste(evalRoot, ".AlleleCountStats.csv", sep="");
|
||||
fileCompOverlap = paste(evalRoot, ".Comp_Overlap.csv", sep="");
|
||||
fileCountVariants = paste(evalRoot, ".Count_Variants.csv", sep="");
|
||||
fileGenotypeConcordance = paste(evalRoot, ".Genotype_Concordance.csv", sep="");
|
||||
fileMetricsByAc = paste(evalRoot, ".MetricsByAc.csv", sep="");
|
||||
fileMetricsBySample = paste(evalRoot, ".MetricsBySample.csv", sep="");
|
||||
fileQuality_Metrics_by_allele_count = paste(evalRoot, ".Quality_Metrics_by_allele_count.csv", sep="");
|
||||
fileQualityScoreHistogram = paste(evalRoot, ".QualityScoreHistogram.csv", sep="");
|
||||
fileSampleStatistics = paste(evalRoot, ".Sample_Statistics.csv", sep="");
|
||||
fileSampleSummaryStatistics = paste(evalRoot, ".Sample_Summary_Statistics.csv", sep="");
|
||||
fileSimpleMetricsBySample = paste(evalRoot, ".SimpleMetricsBySample.csv", sep="");
|
||||
fileTi_slash_Tv_Variant_Evaluator = paste(evalRoot, ".Ti_slash_Tv_Variant_Evaluator.csv", sep="");
|
||||
fileTiTvStats = paste(evalRoot, ".TiTvStats.csv", sep="");
|
||||
fileVariant_Quality_Score = paste(evalRoot, ".Variant_Quality_Score.csv", sep="");
|
||||
|
||||
eval = list(
|
||||
AlleleCountStats = NA,
|
||||
CompOverlap = NA,
|
||||
CountVariants = NA,
|
||||
GenotypeConcordance = NA,
|
||||
MetricsByAc = NA,
|
||||
MetricsBySample = NA,
|
||||
Quality_Metrics_by_allele_count = NA,
|
||||
QualityScoreHistogram = NA,
|
||||
SampleStatistics = NA,
|
||||
SampleSummaryStatistics = NA,
|
||||
SimpleMetricsBySample = NA,
|
||||
TiTv = NA,
|
||||
TiTvStats = NA,
|
||||
Variant_Quality_Score = NA,
|
||||
|
||||
CallsetNames = c(),
|
||||
CallsetOnlyNames = c(),
|
||||
CallsetFilteredNames = c()
|
||||
);
|
||||
|
||||
eval$AlleleCountStats = .attemptToLoadFile(fileAlleleCountStats);
|
||||
eval$CompOverlap = .attemptToLoadFile(fileCompOverlap);
|
||||
eval$CountVariants = .attemptToLoadFile(fileCountVariants);
|
||||
eval$GenotypeConcordance = .attemptToLoadFile(fileGenotypeConcordance);
|
||||
eval$MetricsByAc = .attemptToLoadFile(fileMetricsByAc);
|
||||
eval$MetricsBySample = .attemptToLoadFile(fileMetricsBySample);
|
||||
eval$Quality_Metrics_by_allele_count = .attemptToLoadFile(fileQuality_Metrics_by_allele_count);
|
||||
eval$QualityScoreHistogram = .attemptToLoadFile(fileQualityScoreHistogram);
|
||||
eval$SampleStatistics = .attemptToLoadFile(fileSampleStatistics);
|
||||
eval$SampleSummaryStatistics = .attemptToLoadFile(fileSampleSummaryStatistics);
|
||||
eval$SimpleMetricsBySample = .attemptToLoadFile(fileSimpleMetricsBySample);
|
||||
eval$TiTv = .attemptToLoadFile(fileTi_slash_Tv_Variant_Evaluator);
|
||||
eval$TiTvStats = .attemptToLoadFile(fileTiTvStats);
|
||||
eval$Variant_Quality_Score = .attemptToLoadFile(fileVariant_Quality_Score);
|
||||
|
||||
uniqueJexlExpressions = unique(eval$TiTv$jexl_expression);
|
||||
eval$CallsetOnlyNames = as.vector(uniqueJexlExpressions[grep("FilteredIn|Intersection|none", uniqueJexlExpressions, invert=TRUE, ignore.case=TRUE)]);
|
||||
eval$CallsetNames = as.vector(gsub("-only", "", eval$CallsetOnlyNames));
|
||||
eval$CallsetFilteredNames = as.vector(c());
|
||||
eval;
|
||||
}
|
||||
}
|
||||
% Add one or more standard keywords, see file 'KEYWORDS' in the
|
||||
% R documentation directory.
|
||||
\keyword{ ~kwd1 }
|
||||
\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line
|
||||
|
|
@ -1,55 +0,0 @@
|
|||
\name{gsa.read.gatkreport}
|
||||
\alias{gsa.read.gatkreport}
|
||||
\title{
|
||||
gsa.read.gatkreport
|
||||
}
|
||||
\description{
|
||||
Reads a GATKReport file - a multi-table document - and loads each table as a separate data.frame object in a list.
|
||||
}
|
||||
\usage{
|
||||
gsa.read.gatkreport(filename)
|
||||
}
|
||||
\arguments{
|
||||
\item{filename}{
|
||||
The path to the GATKReport file.
|
||||
}
|
||||
}
|
||||
\details{
|
||||
The GATKReport format replaces the multi-file output format used by many GATK tools and provides a single, consolidated file format. This format accommodates multiple tables and is still R-loadable through this function.
|
||||
|
||||
The file format looks like this:
|
||||
\preformatted{##:GATKReport.v0.1 TableName : The description of the table
|
||||
col1 col2 col3
|
||||
0 0.007451835696110506 25.474613284804366
|
||||
1 0.002362777171937477 29.844949954504095
|
||||
2 9.087604507451836E-4 32.87590975254731
|
||||
3 5.452562704471102E-4 34.498999090081895
|
||||
4 9.087604507451836E-4 35.14831665150137
|
||||
}
|
||||
|
||||
}
|
||||
\value{
|
||||
Returns a list object, where each key is the TableName and the value is the data.frame object with the contents of the table. If multiple tables with the same name exist, each one after the first will be given names of "TableName.1", "TableName.2", ..., "TableName.N".
|
||||
%% ~Describe the value returned
|
||||
%% If it is a LIST, use
|
||||
%% \item{comp1 }{Description of 'comp1'}
|
||||
%% \item{comp2 }{Description of 'comp2'}
|
||||
%% ...
|
||||
}
|
||||
\references{
|
||||
%% ~put references to the literature/web site here ~
|
||||
}
|
||||
\author{
|
||||
Kiran Garimella
|
||||
}
|
||||
\note{
|
||||
%% ~~further notes~~
|
||||
}
|
||||
|
||||
\seealso{
|
||||
%% ~~objects to See Also as \code{\link{help}}, ~~~
|
||||
}
|
||||
\examples{
|
||||
report = gsa.read.gatkreport("/path/to/my/output.gatkreport");
|
||||
}
|
||||
\keyword{ ~kwd1 }
|
||||
|
|
@ -1,48 +0,0 @@
|
|||
\name{gsa.read.squidmetrics}
|
||||
\alias{gsa.read.squidmetrics}
|
||||
\title{
|
||||
gsa.read.squidmetrics
|
||||
}
|
||||
\description{
|
||||
Reads metrics for a specified SQUID project into a dataframe.
|
||||
}
|
||||
\usage{
|
||||
gsa.read.squidmetrics(project, bylane = FALSE)
|
||||
}
|
||||
\arguments{
|
||||
\item{project}{
|
||||
The project for which metrics should be obtained.
|
||||
}
|
||||
\item{bylane}{
|
||||
If TRUE, obtains per-lane metrics rather than the default per-sample metrics.
|
||||
}
|
||||
}
|
||||
\details{
|
||||
%% ~~ If necessary, more details than the description above ~~
|
||||
}
|
||||
\value{
|
||||
%% ~Describe the value returned
|
||||
%% If it is a LIST, use
|
||||
%% \item{comp1 }{Description of 'comp1'}
|
||||
%% \item{comp2 }{Description of 'comp2'}
|
||||
%% ...
|
||||
Returns a data frame with samples (or lanes) as the row and the metric as the column.
|
||||
}
|
||||
\references{
|
||||
%% ~put references to the literature/web site here ~
|
||||
}
|
||||
\author{
|
||||
Kiran Garimella
|
||||
}
|
||||
\note{
|
||||
This method will only work within the Broad Institute internal network.
|
||||
}
|
||||
|
||||
\seealso{
|
||||
%% ~~objects to See Also as \code{\link{help}}, ~~~
|
||||
}
|
||||
\examples{
|
||||
## Obtain metrics for project C315.
|
||||
d = gsa.read.squidmetrics("C315");
|
||||
}
|
||||
\keyword{ ~kwd1 }
|
||||
|
|
@ -1,46 +0,0 @@
|
|||
\name{gsa.warn}
|
||||
\alias{gsa.warn}
|
||||
\title{
|
||||
GSA warn
|
||||
}
|
||||
\description{
|
||||
Write a warning message to standard error with the prefix '[gsalib] Warning:'.
|
||||
}
|
||||
\usage{
|
||||
gsa.warn(message)
|
||||
}
|
||||
%- maybe also 'usage' for other objects documented here.
|
||||
\arguments{
|
||||
\item{message}{
|
||||
The warning message to write.
|
||||
}
|
||||
}
|
||||
\details{
|
||||
%% ~~ If necessary, more details than the description above ~~
|
||||
}
|
||||
\value{
|
||||
%% ~Describe the value returned
|
||||
%% If it is a LIST, use
|
||||
%% \item{comp1 }{Description of 'comp1'}
|
||||
%% \item{comp2 }{Description of 'comp2'}
|
||||
%% ...
|
||||
}
|
||||
\references{
|
||||
%% ~put references to the literature/web site here ~
|
||||
}
|
||||
\author{
|
||||
Kiran Garimella
|
||||
}
|
||||
\note{
|
||||
%% ~~further notes~~
|
||||
}
|
||||
|
||||
\seealso{
|
||||
%% ~~objects to See Also as \code{\link{help}}, ~~~
|
||||
}
|
||||
\examples{
|
||||
## Write warning message to stderr
|
||||
gsa.warn("This is a warning message");
|
||||
}
|
||||
\keyword{ ~kwd1 }
|
||||
\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line
|
||||
|
|
@ -1,68 +0,0 @@
|
|||
\name{gsalib-package}
|
||||
\alias{gsalib-package}
|
||||
\alias{gsalib}
|
||||
\docType{package}
|
||||
\title{
|
||||
GATK utility analysis functions
|
||||
}
|
||||
\description{
|
||||
Utility functions for analyzing GATK-processed NGS data
|
||||
}
|
||||
\details{
|
||||
This package contains functions for working with GATK-processed NGS data. These functions include a command-line parser that also allows a script to be used in interactive mode (good for developing scripts that will eventually be automated), a proportional Venn diagram generator, convenience methods for parsing VariantEval output, and more.
|
||||
}
|
||||
\author{
|
||||
Genome Sequencing and Analysis Group
|
||||
|
||||
Medical and Population Genetics Program
|
||||
|
||||
Maintainer: Kiran Garimella
|
||||
}
|
||||
\references{
|
||||
GSA wiki page: http://www.broadinstitute.org/gsa/wiki
|
||||
|
||||
GATK help forum: http://www.getsatisfaction.com/gsa
|
||||
}
|
||||
\examples{
|
||||
## get script arguments in interactive and non-interactive mode
|
||||
cmdargs = gsa.getargs( list(
|
||||
requiredArg1 = list(
|
||||
value = NA,
|
||||
doc = "Documentation for requiredArg1"
|
||||
),
|
||||
|
||||
optionalArg1 = list(
|
||||
value = 3e9,
|
||||
doc = "Documentation for optionalArg1"
|
||||
)
|
||||
) );
|
||||
|
||||
## plot a proportional Venn diagram
|
||||
gsa.plot.venn(500, 250, 0, 100);
|
||||
|
||||
## read a GATKReport file
|
||||
report = gsa.read.gatkreport("/path/to/my/output.gatkreport");
|
||||
|
||||
## emit a message
|
||||
gsa.message("This is a message");
|
||||
|
||||
## emit a warning message
|
||||
gsa.warn("This is a warning message");
|
||||
|
||||
## emit an error message
|
||||
gsa.error("This is an error message");
|
||||
|
||||
## read the SQUID metrics for a given sequencing project (internal to the Broad only)
|
||||
s = gsa.read.squidmetrics("C427");
|
||||
|
||||
## read command-line arguments
|
||||
cmdargs = gsa.getargs(
|
||||
list(
|
||||
file = list(value="/my/test.vcf", doc="VCF file"),
|
||||
verbose = list(value=0, doc="If 1, set verbose mode"),
|
||||
test2 = list(value=2.3e9, doc="Another argument that does stuff")
|
||||
),
|
||||
doc="My test program"
|
||||
);
|
||||
}
|
||||
\keyword{ package }
|
||||
245
R/tearsheet.r
245
R/tearsheet.r
|
|
@ -1,245 +0,0 @@
|
|||
#Before executing this file, save the squid files as csv, then as tab-delimited files whose header row contains only the column names, and change the format of all cells to numbers. Assign the paths to these files to "samples" and "lanes" respectively.
|
||||
#testcomment
|
||||
# Positional command-line arguments. Indexing past the end of args yields NA,
# which the sections below use to decide whether an input was supplied.
args<-commandArgs(trailingOnly=TRUE)
lanes<-args[1]        # per-lane metrics table
samples<-args[2]      # per-sample metrics table
sample_sets<-args[3]  # prefix for every output file name
eval<-args[4]         # error-rate-per-cycle table
noveltitv<-args[5]    # Ti/Tv table for novel calls
knowntitv<-args[6]    # Ti/Tv table for known calls
DOC<-args[7]          # depth-of-coverage table

# Without a sample-set name there is nothing to name the outputs with, so ask.
if(is.na(sample_sets)){
    print("Please specify sample set for file naming and press enter.")
    sample_sets<-scan("stdin", what="character", n=1)
    print("Thanks!")
}
|
||||
|
||||
if(is.na(lanes) == FALSE && is.na(samples)==FALSE){
    # Build the lane/sample QC summary from the two metrics tables: three SNP-count
    # PDFs, a fingerprint PDF, and a printed list of summary statistics.
    bylane<-read.delim(file=lanes, header=TRUE)
    bysample<-read.delim(file=samples, header=TRUE)

    #Calc by lane metrics
    # attach() exposes the bylane columns as bare names until detach(bylane) below.
    attach(bylane)
    callable.target<-HS_TARGET_TERRITORY[1]
    singlelanes<-length(which(Lane.Type=="Single"))
    pairedlanes<-length(which(Lane.Type=="Paired"))
    mean.read.lane<-signif(mean(AL_TOTAL_READS, na.rm=TRUE))
    sd.read.lane<-signif(sd(AL_TOTAL_READS, na.rm=TRUE))
    mean.ub.lane<-signif(mean(HS_ON_TARGET_BASES, na.rm=TRUE))
    sd.ub.lane<-signif(sd(HS_ON_TARGET_BASES, na.rm=TRUE))
    mean.cov.lane<-round(mean(HS_MEAN_TARGET_COVERAGE, na.rm=TRUE))
    sd.cov.lane<-round(sd(HS_MEAN_TARGET_COVERAGE, na.rm=TRUE))
    mean.10x.lane<-round(mean(HS_PCT_TARGET_BASES_10X, na.rm=TRUE))
    mean.20x.lane<-round(mean(HS_PCT_TARGET_BASES_20X, na.rm=TRUE))
    mean.30x.lane<-round(mean(HS_PCT_TARGET_BASES_30X, na.rm=TRUE))
    sd.10x.lane<-round(sd(HS_PCT_TARGET_BASES_10X, na.rm=TRUE))
    sd.20x.lane<-round(sd(HS_PCT_TARGET_BASES_20X, na.rm=TRUE))
    sd.30x.lane<-round(sd(HS_PCT_TARGET_BASES_30X, na.rm=TRUE))

    # Axis label for each lane: "<Project> <External.ID>-<Lane>"
    names<-paste(Project, " ", External.ID, "-", Lane, sep="")

    library(graphics)
    n.lanes<-length(SNP_TOTAL_SNPS)

    # Right-hand panel shared by the three SNP PDFs: a boxplot of the per-lane
    # SNP counts, captioned with how many lanes boxplot.stats() flags as outliers.
    snp.boxplot.panel<-function(){
        boxplot(SNP_TOTAL_SNPS, main="SNPs Called in Lane", ylab="SNPs Called")
        n.outliers<-length(boxplot.stats(SNP_TOTAL_SNPS)$out)
        if(n.outliers==0){
            mtext("No outliers", side=1, line=4)
        }else{
            mtext(paste("Outlier SNP call counts in ", n.outliers, " lanes", sep=""), side=1, line=4)
        }
    }

    #makes a plot of the number of SNPS called per lane
    pdf(file=paste(sample_sets, "_SNPS.pdf", sep=""), width=0.2*n.lanes, height=0.1*n.lanes)
    layout(matrix(c(1,1 , 2), 1, 3, byrow=FALSE), respect=TRUE)
    plot(1:n.lanes, SNP_TOTAL_SNPS, main="SNPs Called in Each Lane", xlab="", ylab="SNPs Called in Lane", xaxt="n", pch=16, col="blue")
    axis(side=1, at=(1:n.lanes), labels=names, cex.axis=0.75, las=2)
    snp.boxplot.panel()
    dev.off()

    #makes SNP plot in log scale
    pdf(file=paste(sample_sets, "_SNPS_log.pdf", sep=""), width=0.2*n.lanes, height=0.1*n.lanes)
    layout(matrix(c(1,1 , 2), 1, 3, byrow=FALSE), respect=TRUE)
    plot(1:n.lanes, log(SNP_TOTAL_SNPS), main="SNPs Called in Each Lane", xlab="", ylab="Log(SNPs Called in Lane)", xaxt="n", pch=16, col="blue")
    par(ylog=TRUE)
    axis(side=1, at=(1:n.lanes), labels=names, cex.axis=0.75, las=2)
    snp.boxplot.panel()
    dev.off()

    #makes a plot of snp calls ordered by lane
    # The counts here are plotted untransformed, so the y label must not say "Log".
    lane.order<-order(Lane)
    pdf(file=paste(sample_sets, "_SNPS_lane.pdf", sep=""), width=0.2*n.lanes, height=0.1*n.lanes)
    layout(matrix(c(1,1 , 2), 1, 3, byrow=FALSE), respect=TRUE)
    plot(1:n.lanes, SNP_TOTAL_SNPS[lane.order], main="SNPs Called in Each Lane", xlab="", ylab="SNPs Called in Lane", xaxt="n", pch=16, col="blue")
    axis(side=1, at=(1:n.lanes), labels=names[lane.order], cex.axis=0.75, las=2)
    snp.boxplot.panel()
    dev.off()

    #makes a plot of fingerprint calls and labels them good or bad
    # NOTE(review): this used to be the union of this one condition with itself;
    # a second criterion (perhaps on FP_CONFIDENT_CALLS) may have been intended - confirm.
    badsnps<-which(FP_CONFIDENT_MATCHING_SNPS<15)

    colors<-c(rep("Blue", length(FP_CONFIDENT_CALLS)))
    colors[badsnps]<-"Red"

    pdf(file=paste(sample_sets, "_Fingerprints.pdf", sep=""), width=.2*length(FP_CONFIDENT_CALLS), height=.1*length(FP_CONFIDENT_CALLS))
    par(mar=c(6, 4, 5, 4))
    # Filled points are matching calls, open points are all confident calls.
    plot(1:length(FP_CONFIDENT_MATCHING_SNPS), FP_CONFIDENT_MATCHING_SNPS, pch=16, ylim=c(0,24), ylab="Fingerprint calls", xlab="", xaxt="n", col=colors, main="Fingerprint Calling and Matching")
    points(1:length(FP_CONFIDENT_MATCHING_SNPS), FP_CONFIDENT_CALLS, col=colors)
    axis(side=1, at=(1:length(FP_CONFIDENT_CALLS)), labels=names, cex.axis=0.75, las=2)

    if(length(badsnps)>0){
        legend("bottomright", legend=c("Confident calls at fingerprint sites by lane", "Confident matching calls at fingerprint sites by lane", "Confident calls in bad lanes", "Confident matching calls in bad lanes"), pch=c(1, 16, 1, 16), col=c("Blue", "Blue", "Red", "Red"))
        mtext("Some problematic fingerprint sites", side=3)
    }else{
        legend("bottomright", legend=c("Confident calls at fingerprint sites by lane", "Confident matching calls at fingerprint sites by lane"), pch=c(1, 16), col="Blue")
    }

    dev.off()

    detach(bylane)

    #Calc by sample metrics
    attach(bysample)
    mean.lanes.samp<-signif(mean(X..Lanes.included.in.aggregation, na.rm=TRUE))
    sd.lanes.samp<-signif(sd(X..Lanes.included.in.aggregation, na.rm=TRUE))
    mean.mrl.samp<-signif(mean(Mean.Read.Length, na.rm=TRUE))
    sd.mrl.samp<-signif(sd(Mean.Read.Length, na.rm=TRUE))
    mean.read.samp<-signif(mean(Total.Reads, na.rm=TRUE))
    sd.read.samp<-signif(sd(Total.Reads, na.rm=TRUE))
    mean.ub.samp<-signif(mean(On.Target.Bases..HS., na.rm=TRUE))
    sd.ub.samp<-signif(sd(On.Target.Bases..HS., na.rm=TRUE))
    mean.cov.samp<-round(mean(Mean.Target.Coverage..HS., na.rm=TRUE))
    sd.cov.samp<-round(sd(Mean.Target.Coverage..HS., na.rm=TRUE))
    mean.10x.samp<-round(mean(PCT.Target.Bases.10x..HS., na.rm=TRUE))
    mean.20x.samp<-round(mean(PCT.Target.Bases.20x..HS., na.rm=TRUE))
    mean.30x.samp<-round(mean(PCT.Target.Bases.30x..HS., na.rm=TRUE))
    sd.10x.samp<-round(sd(PCT.Target.Bases.10x..HS., na.rm=TRUE))
    sd.20x.samp<-round(sd(PCT.Target.Bases.20x..HS., na.rm=TRUE))
    sd.30x.samp<-round(sd(PCT.Target.Bases.30x..HS., na.rm=TRUE))
    detach(bysample)

    #print all of this stuff out in R.
    print(paste("Callable Target: ", callable.target, " bases", sep=""), quote=FALSE)
    print(paste("Used Lanes per Sample: ", mean.lanes.samp, " +/- ", sd.lanes.samp, sep=""), quote=FALSE)
    print(paste("Parities: ", singlelanes, " single lanes, ", pairedlanes, " paired lanes", sep=""), quote=FALSE)
    print(paste("Read Lengths: ", mean.mrl.samp, " +/- ", sd.mrl.samp, sep=""), quote=FALSE)
    print(paste("Reads per lane: ", mean.read.lane, " +/- ", sd.read.lane, sep=""), quote=FALSE)
    print(paste("Reads per sample: ", mean.read.samp, " +/- ", sd.read.samp, sep=""), quote=FALSE)
    print(paste("Used bases per lane: ", mean.ub.lane, " +/- ", sd.ub.lane, sep=""), quote=FALSE)
    print(paste("Used bases per sample: ", mean.ub.samp, " +/- ", sd.ub.samp, sep=""), quote=FALSE)
    print(paste("Average target coverage per lane: ", mean.cov.lane, " +/- ", sd.cov.lane, sep=""), quote=FALSE)
    print(paste("Average target coverage per sample: ", mean.cov.samp, " +/- ", sd.cov.samp, sep=""), quote=FALSE)
    print(paste("% loci covered to 10x per lane: ", mean.10x.lane, "% +/- ", sd.10x.lane, "%", sep=""), quote=FALSE)
    print(paste("% loci covered to 10x per sample: ", mean.10x.samp, "% +/- ", sd.10x.samp, "%", sep=""), quote=FALSE)
    print(paste("% loci covered to 20x per lane: ", mean.20x.lane, "% +/- ", sd.20x.lane, "%", sep=""), quote=FALSE)
    print(paste("% loci covered to 20x per sample: ", mean.20x.samp, "% +/- ", sd.20x.samp, "%", sep=""), quote=FALSE)
    print(paste("% loci covered to 30x per lane: ", mean.30x.lane, "% +/- ", sd.30x.lane, "%", sep=""), quote=FALSE)
    print(paste("% loci covered to 30x per sample: ", mean.30x.samp, "% +/- ", sd.30x.samp, "%", sep=""), quote=FALSE)

}else{
    print("Lane and Sample metrics file paths not provided")
}
|
||||
|
||||
|
||||
|
||||
#Makes Error Rate percycle graph
|
||||
if(is.na(eval)==FALSE){
    # Parse the table once; every column after the first is drawn as its own line.
    errtable<-read.delim(eval, header=TRUE)
    errpercycle<-errtable[2:ncol(errtable)]

    pdf(paste(sample_sets, "_errorrate_per_cycle.pdf", sep=""), width=6, height=5)

    # Columns whose value in row 75 exceeds 0.3 are highlighted.
    crazies<-which(errpercycle[75,]>0.3) #this can be changed to any kind of filter for particular lanes

    # Muted colours and thin lines by default; vivid colours and thick lines for the flagged columns.
    colors<-rainbow(ncol(errpercycle), s=0.5, v=0.5)
    colors[crazies]<-rainbow(length(crazies))
    weights<-rep(1, ncol(errpercycle))
    weights[crazies]<-2

    matplot(errpercycle, type="l", lty="solid", col=colors, lwd=weights, main="Error Rate per Cycle", ylab="Error Rate", xlab="Cycle", ylim=c(0, 0.7))

    if(length(crazies)>0){
        legend("topleft", title="Unusual Lanes", legend=colnames(errpercycle)[crazies], lty="solid", lwd=2, col=colors[crazies], xjust=0.5)
    }else{
        legend("topleft", legend="No unusual lanes.", bty="n")
    }

    dev.off()

}else{
    print("Error Rate Per Cycle file paths not provided")
}
|
||||
|
||||
#Makes TI/TV known v novel graph
|
||||
if(is.na(noveltitv)==FALSE && is.na(knowntitv) == FALSE){
    # Read both tables before opening the device, so an unreadable input
    # cannot leave a half-written PDF device open.
    novels<-read.table(file=noveltitv, header=FALSE)
    knowns<-read.table(file=knowntitv, header=FALSE)

    # The output prefix is sample_sets (set from the third command-line argument).
    pdf(paste(sample_sets, "_TiTv.pdf", sep=""), width=6, height=5)

    # Column 2 holds the plotted Ti/Tv values; column 1 supplies the x-axis labels.
    plot(novels[,2], col="red", ylim=c(0, 3.5), main="Ti/Tv for Novel and Known SNP calls", ylab="Ti/Tv", xlab="", xaxt="n")
    points(knowns[,2], col="blue")

    axis(side=1, at=(1:length(novels[,2])), labels=novels[,1], cex.axis=1, las=2)

    legend("bottomright", legend=c("Known Variants", "Novel Variants"), col=c("blue", "red"), pch=1, xjust=0.5)
    mtext("Lower Ti/Tv ratios indicated more false positive SNP calls.", side=1)
    dev.off()
}else{
    print("Transition/transversion ratio file paths not provided")
}
|
||||
|
||||
#Make DOC graph
|
||||
if(is.na(DOC)==FALSE){
    # Parse the table once: column 1 labels each row, columns 2..502 hold the counts.
    doctable<-read.delim(DOC, header=TRUE)
    doccounts<-as.matrix(doctable[,2:502])

    # The output prefix is sample_sets (set from the third command-line argument).
    pdf(paste(sample_sets, "_DOC.pdf", sep=""), width=6, height=5)

    # Scale to percentages of the first row's total and flip the table so each
    # input row becomes one column (one matplot line). The shape follows the
    # input rather than being fixed at 501 x 29.
    DOCdata<-matrix(doccounts*100/sum(doccounts[1,]), nrow=ncol(doccounts), ncol=nrow(doccounts), byrow=TRUE)
    colnames(DOCdata)<-doctable[,1]

    # Columns that peak above 10% are highlighted.
    oddies<-which(apply(DOCdata, 2, max)>10) #can be assigned any particular heuristic
    ncolors<-rainbow(ncol(DOCdata), s=0.5, v=0.5)
    ncolors[oddies]<-rainbow(length(oddies))
    nweights<-rep(1, ncol(DOCdata))
    nweights[oddies]<-2
    matplot(DOCdata, type="l", main="Depth of Coverage by Sample", ylab="Percent bases covered to a given depth", xlab="log(Depth)", log="x", col=ncolors, lty="solid", lwd=nweights)

    if(length(oddies)>0){
        legend("topright", title="Unusual Cases", legend=colnames(DOCdata)[oddies], lty="solid", lwd=2, col=ncolors[oddies], xjust=0.5)
    }else{
        legend("topright", legend="No unusual cases.", bty="n")
    }

    dev.off()

}else{
    print("Depth of Coverage filepath not provided")
}
|
||||
|
||||
|
||||
138
R/titvFPEst.R
138
R/titvFPEst.R
|
|
@ -1,138 +0,0 @@
|
|||
# Estimate a false-positive fraction from how far the observed Ti/Tv ratio falls
# short of the expected one, measured relative to a baseline of 0.5.
# The result is clamped to the range [0.001, 1]. Scalar inputs only: min/max
# collapse vectors (see titvFPEstV for the element-wise form).
titvFPEst <- function(titvExpected, titvObserved) {
    shortfall <- 1 - (titvObserved - 0.5) / (titvExpected - 0.5)
    capped <- min(shortfall, 1)
    max(capped, 0.001)
}
|
||||
|
||||
# Element-wise titvFPEst: one estimate per observed ratio in titvs, all against
# the same expected ratio.
titvFPEstV <- function(titvExpected, titvs) {
    sapply(titvs, titvFPEst, titvExpected = titvExpected)
}
|
||||
|
||||
# Scale the estimated number of true-positive calls to a per-callable-base rate.
# All known calls count as true positives; novel calls are discounted by the
# false-positive fraction titvFPEst derives from the known and novel Ti/Tv ratios.
calcHet <- function(nknown, knownTiTv, nnovel, novelTiTv, callable) {
    novelKeptFraction <- 1 - titvFPEst(knownTiTv, novelTiTv)
    truePositives <- nknown + novelKeptFraction * nnovel
    2 * truePositives / 3 / callable
}
|
||||
|
||||
# Ti/Tv ratio of the calls left after removing set y (ny calls at ratio titvy)
# from set x (nx calls at ratio titvx).
marginalTiTv <- function( nx, titvx, ny, titvy ) {
    # n calls at ratio r split into n / (r + 1) transversions and the rest transitions
    xTransversions <- nx / (titvx + 1)
    xTransitions <- nx - xTransversions
    yTransversions <- ny / (titvy + 1)
    yTransitions <- ny - yTransversions

    (xTransitions - yTransitions) / (xTransversions - yTransversions)
}
|
||||
# Known-site rate, as a percentage, of the calls left after removing set y
# (ny calls, dby percent known) from set x (nx calls, dbx percent known).
marginaldbSNPRate <- function( nx, dbx, ny, dby ) {
    xKnown <- nx * dbx / 100
    xNovel <- nx - xKnown
    yKnown <- ny * dby / 100
    yNovel <- ny - yKnown

    remainingKnown <- xKnown - yKnown
    remainingNovel <- xNovel - yNovel
    remainingKnown / ( remainingKnown + remainingNovel ) * 100
}
|
||||
|
||||
# Expected call counts: L * theta * calledFractionOfRegion times the sum of
# 1/i for i = 1 .. 2 * nIndividuals, split into known and novel by dbSNPRate
# (a fraction, not a percentage).
# Returns list(nCalls, nKnown, nNovel).
numExpectedCalls <- function(L, theta, calledFractionOfRegion, nIndividuals, dbSNPRate) {
    harmonicSum <- sum(1 / seq(1, 2 * nIndividuals))
    total <- L * theta * calledFractionOfRegion * harmonicSum
    list(nCalls = total, nKnown = dbSNPRate * total, nNovel = (1-dbSNPRate) * total)
}
|
||||
|
||||
# Rescale x so that its elements sum to 1.
normalize <- function(x) {
    total <- sum(x)
    x / total
}
|
||||
|
||||
# Running total of x after normalize() has rescaled it.
normcumsum <- function(x) {
    fractions <- normalize(x)
    cumsum(fractions)
}
|
||||
|
||||
# Plot the values of d in increasing order as an orange points-and-lines trace.
# Extra arguments are passed straight through to plot().
cumhist <- function(d, ...) {
    ascending <- order(d)
    plot(d[ascending], type="b", col="orange", lwd=2, ...)
}
|
||||
|
||||
# Cumulative sum accumulated from the last element backwards: element i of the
# result is the sum of x[i], x[i+1], ..., x[length(x)].
revcumsum <- function(x) {
    tailFirst <- rev(x)
    rev(cumsum(tailFirst))
}
|
||||
|
||||
# Convert a probability to the -10 * log10 scale. The probability is floored at
# 10^-9.9 first, so the result never exceeds 99. max() collapses its arguments,
# so this is meant for a single value.
phred <- function(x) {
    floored <- max(x, 10^(-9.9))
    -10 * log10(floored)
}
|
||||
|
||||
# Probability of observing base b when the true base is B, given quality Q on
# the -10 * log10 error scale: 1 - 10^(-Q/10) when they agree, 10^(-Q/10)'s
# complement-of-complement otherwise.
pOfB <- function(b, B, Q) {
    pCorrect = 1 - 10^(-Q/10)
    if ( b == B ) pCorrect else 1 - pCorrect
}
|
||||
|
||||
# log10 likelihood of the observed bases bs (with qualities qs) under the
# two-allele genotype G. Each base is treated as drawn from either allele with
# probability 1/2, and the per-base likelihoods are multiplied (summed in log10).
# Returns 0 when bs is empty.
pOfG <- function(bs, qs, G) {
    a1 = G[1]
    a2 = G[2]

    log10p = 0
    # seq_along, not 1:length(bs): the latter counts 1, 0 for an empty pile-up
    for ( i in seq_along(bs) ) {
        b = bs[i]
        q = qs[i]
        p1 = pOfB(b, a1, q) / 2 + pOfB(b, a2, q) / 2
        log10p = log10p + log10(p1)
    }

    return(log10p)
}
|
||||
|
||||
# Genotype likelihoods for a pile-up of nAs "a" bases and nBs "t" bases, all at
# quality Q. Returns the log10 likelihoods of a/a (p1), a/t (p2) and t/t (p3),
# plus Qsample: the phred-scaled probability that the genotype is not a/t.
pOfGs <- function(nAs, nBs, Q) {
    bases = c(rep("a", nAs), rep("t", nBs))
    quals = rep(Q, nAs + nBs)

    log10p1 = pOfG(bases, quals, c("a", "a"))
    log10p2 = pOfG(bases, quals, c("a", "t"))
    log10p3 = pOfG(bases, quals, c("t", "t"))

    hetShare = 10^log10p2 / sum(10^(c(log10p1, log10p2, log10p3)))
    Qsample = phred(1 - hetShare)

    list(p1=log10p1, p2=log10p2, p3=log10p3, Qsample=Qsample)
}
|
||||
|
||||
# Average of pOfGs(...)$Qsample over depths d = 1 .. 3*depth (weighted by
# dpois(d, depth)) and over every split of d reads into the two alleles
# (weighted by dbinom(nBs, d, 0.5)). A one-row data frame with the running
# state is printed for every (d, nBs) combination.
QsampleExpected <- function(depth, Q) {
    runningTotal = 0
    for ( d in 1:(depth*3) ) {
        depthWeight = dpois(d, depth)
        for ( nBs in 0:d ) {
            splitWeight = dbinom(nBs, d, 0.5)
            Qsample = pOfGs(d - nBs, nBs, Q)$Qsample
            runningTotal = runningTotal + Qsample * depthWeight * splitWeight
            print(as.data.frame(list(d=d, nBs = nBs, pOfD=depthWeight, pOfnB = splitWeight, Qsample=Qsample, weightedAvg = runningTotal)))
        }
    }

    return(runningTotal)
}
|
||||
|
||||
# Draw one QsampleExpected-versus-depth curve per quality in Qs, on a y axis
# running from 0 to Qmax, with a legend keyed by quality.
plotQsamples <- function(depths, Qs, Qmax) {
    palette = rainbow(length(Qs))

    # type="n" sets up the axes without drawing anything
    plot(depths, rep(Qmax, length(depths)), type="n", ylim=c(0,Qmax), xlab="Average sequencing coverage", ylab="Qsample", main = "Expected Qsample values, including depth and allele sampling")

    for ( i in 1:length(Qs) ) {
        curve = as.numeric(lapply(depths, QsampleExpected, Q = Qs[i]))
        points(depths, curve, col=palette[i], type="b")
    }

    legend("topleft", paste("Q", Qs), fill=palette)
}
|
||||
|
||||
# For each actual depth d in 0 .. 2*depth, the probability of seeing that depth
# (dpois(d, depth)) times the probability that a Binomial(d, 0.5) count exceeds
# nallelesToCall. Returns one value per depth.
pCallHetGivenDepth <- function(depth, nallelesToCall) {
    depths = 0:(2*depth)
    # chance that the count is nallelesToCall or fewer at each depth
    pTooFew = sapply(depths, function(d) sum(dbinom(0:nallelesToCall, d, 0.5)))
    dpois(depths, depth) * (1 - pTooFew)
}
|
||||
|
||||
# Total of pCallHetGivenDepth over all the depths it considers.
pCallHets <- function(depth, nallelesToCall) {
    perDepth <- pCallHetGivenDepth(depth, nallelesToCall)
    sum(perDepth)
}
|
||||
|
||||
# Probability that at least one of nsamples samples yields a call, treating each
# sample as an independent trial that succeeds with probability pCallHets.
pCallHetMultiSample <- function(depth, nallelesToCall, nsamples) {
    pMissOne <- 1 - pCallHets(depth, nallelesToCall)
    1 - pMissOne^nsamples
}
|
||||
|
|
@ -1,120 +0,0 @@
|
|||
# Count how many elements of `list` are equal to zero.
count_zeros = function(list) {
    tally = 0
    for (value in list) {
        if (value == 0.0) tally = tally + 1
    }
    tally
}
|
||||
|
||||
|
||||
# Assemble the per-bait table used for bait selection.
# Reads every file in the working directory matching "304NA.*" (at most max_rows
# rows each; -1 means all), plus the WGS coverage table and the bait sequence
# table, and returns (invisibly) one data frame with location, GC, freestanding
# flag, per-file normalized coverage, per-file hit_twice, row-wise summary
# statistics, zero counts and the sequence columns.
# Several variable names below (f, t, norm_cov) end up in the generated column
# names, so they are deliberately left alone.
load = function(max_rows) {
    files <- list.files(path=".", pattern="304NA.*")

    #FREESTANDING as a filter
    #HIT_TWICE for ZEROS...

    print ("Parsing file 1")
    t <- read.table(files[1], header=TRUE, nrows=max_rows)
    f <- data.frame(loc=t$location, gc=t$gc, freestanding=t$freestanding)
    ht <- data.frame(1:nrow(f))

    # One normalized-coverage column and one hit_twice column per input file
    for (file in files) {
        print (file)
        t <- read.table(file, header=TRUE, nrows=max_rows)
        norm_cov <- t$normalized_coverage
        f <- cbind(f, norm_cov)
        ht <- cbind(ht, t$hit_twice)
    }

    wgs <- read.table("/seq/dirseq/analysis/agilent/rt-pcr/perfdata//OV-0751-WGS.baits.coverage.txt", header=TRUE, nrows=max_rows)
    f <- cbind(f, wgs_norm_cov = wgs$normalized_coverage)

    f <- cbind(f, ht)

    # Compute normalized variance
    # NOTE(review): columns 4:10 and 12:18 are fixed windows; which columns they
    # cover depends on how many files matched above - confirm against the data.
    print("Calculating variance")
    row_var <- apply(f[4:10], 1, var)
    print("Calculating std. dev.")
    row_sd <- apply(f[4:10], 1, sd)
    print("Calculating mean")
    row_mean <- apply(f[4:10], 1, mean)
    print("Binding normalized variance")
    f <- cbind(f, normvar=row_var/row_mean/row_mean)
    print("Binding normalized std. dev.")
    f <- cbind(f, normsd=row_sd/row_mean)
    print("Binding mean")
    f <- cbind(f, mean=row_mean)
    print("Binding std. dev.")
    f <- cbind(f, sd=row_sd)
    print("Binding variance")
    f <- cbind(f, var=row_var)

    print("Calculating and binding number of zeros")
    zero_counts <- apply(f[4:10], 1, count_zeros)
    not_hit_twice_counts <- apply(f[12:18], 1, count_zeros)
    f <- cbind(f, count_zeros=zero_counts, num_not_hit_twice=not_hit_twice_counts)

    print ("Parsing sequences file")
    seqs <- read.table("whole_exome_agilent_designed_120.design.1line.sorted2", header=TRUE, nrows=max_rows)
    f <- cbind(f, seqs)

    #of = f[order(f$normvar),]
    invisible(f)
}
|
||||
|
||||
# Build the full bait selection: one write_split() batch per GC / coverage class,
# stacked in a fixed order into a single data frame.
# Only freestanding baits are considered. "Covered" classes draw from baits with
# no zero-coverage samples; "No_Coverage" classes draw from baits where
# count_zeros is 7.
write_splits = function(f) {
    set.seed(0987123409)

    covered = f[f$count_zeros < 1 & f$freestanding==1,]    # no zeros
    uncovered = f[f$count_zeros == 7 & f$freestanding==1,] # all zeros

    # Low variance
    picks = write_split(covered, "Low_GC_Norm_Coverage", 0.0, 0.35, 0.8, 1.2, 0.0, 0.3, 0.0)
    picks = rbind(picks, write_split(covered, "Mid_GC_Norm_Coverage", 0.45, 0.55, 0.8, 1.2, 0.0, 0.1, 0.0))
    picks = rbind(picks, write_split(covered, "High_GC_Norm_Coverage", 0.63, 1.0, 0.8, 1.2, 0.0, 0.3, 0.0))
    picks = rbind(picks, write_split(covered, "Low_GC_Undercovered", 0.0, 0.35, 0.2, 0.3, 0.0, 0.3, 0.0))
    picks = rbind(picks, write_split(covered, "Mid_GC_Undercovered", 0.45, 0.55, 0.2, 0.3, 0.0, 0.3, 0.0))
    picks = rbind(picks, write_split(covered, "High_GC_Undercovored", 0.63, 1.0, 0.2, 0.3, 0.0, 0.3, 0.0))
    picks = rbind(picks, write_split(uncovered, "Low_GC_No_Coverage", 0.0, 0.35, 0.0, 0.1, -1.0, -1.0, 0.1))
    picks = rbind(picks, write_split(uncovered, "Mid_GC_No_Coverage", 0.45, 0.55, 0.0, 0.1, -1.0, -1.0, 0.1))
    picks = rbind(picks, write_split(uncovered, "High_GC_No_Coverage", 0.63, 1.0, 0.0, 0.1, -1.0, -1.0, 0.01))

    # High variance
    # wgs_cov_low is omitted here; write_split never reads it when normsd_high >= 0.
    picks = rbind(picks, write_split(covered, "Mid_GC_Norm_Coverage_High_Variation", 0.45, 0.55, 0.8, 1.2, 0.355, 1000.0))
    picks
}
|
||||
|
||||
# Select the first 50 baits of one class and tag them with `label`.
# Rows must fall inside the GC window [gc_low, gc_high] and the mean-coverage
# window [cov_low, cov_high].
#  - normsd_high < 0 (no-coverage classes): also require wgs_norm_cov >= wgs_cov_low
#    and rank by wgs_norm_cov, highest first.
#  - otherwise: also require normsd within [normsd_low, normsd_high] and rank by
#    normsd, lowest first.
# Prints the number of qualifying rows, then returns rows 1:50 of the ranking
# with a leading `class` column (rows past the end come back as NA rows).
write_split = function(data, label, gc_low, gc_high, cov_low, cov_high, normsd_low, normsd_high, wgs_cov_low) {
    gc_ok = data$gc >= gc_low & data$gc <= gc_high
    cov_ok = data$mean >= cov_low & data$mean <= cov_high

    if (normsd_high < 0.0) {
        # We have no coverage samples
        matched = data[gc_ok & cov_ok & data$wgs_norm_cov >= wgs_cov_low,]
        ranked = matched[order(matched$wgs_norm_cov, decreasing = TRUE),]
    } else {
        # We have low or normal coverage samples, so take those with tightest norm SDs
        sd_ok = data$normsd >= normsd_low & data$normsd <= normsd_high
        matched = data[gc_ok & cov_ok & sd_ok,]
        ranked = matched[order(matched$normsd),]
    }

    print(nrow(ranked))
    top = ranked[1:50, ]
    cbind(class=rep(label,50), top)
}
|
||||
|
||||
#f=load()
|
||||
#nz=f[f$count_zeros < 1,]
|
||||
#print(summary(nz))
|
||||
|
||||
# End-to-end driver: load every row of the bait table, pick the bait classes and
# write them to "500_exome_baits_for_nanostring.csv". The argument is ignored;
# the table is always rebuilt with load(-1).
create_500 = function(f) {
    baits = load(-1)
    selection = write_splits(baits)
    write.csv(selection, "500_exome_baits_for_nanostring.csv")
}
|
||||
|
|
@ -1,27 +0,0 @@
|
|||
# Plot measured scaling against ideal scaling for one dataset.
# Keeps the rows of d whose dataset equals `name`, adds a "theoretic" series that
# divides a reference run's times by each parallelism level, prints a summary and
# a lattice plot, and returns the combined data frame.
plot1 <- function(d, name) {
    chosen = subset(d, dataset == name)
    subd = data.frame(parallel.type=chosen$parallel.type, nWaysParallel=chosen$nWaysParallel, end.to.end.time=chosen$end.to.end.time, per.1M.sites = chosen$per.1M.sites, job.run.time = chosen$job.run.time)

    nways = unique(subd$nWaysParallel)

    # Reference run: the slowest end-to-end time at the lowest parallelism level
    slowest = max(subset(subd, nWaysParallel == min(nways))$end.to.end.time)
    reference = subset(subd, end.to.end.time == slowest)
    referenceWays = reference$nWaysParallel[1]

    # Extrapolate the reference run back to a single job
    timeAt1 = slowest * referenceWays
    my.runtime = reference$job.run.time[1] * referenceWays
    my.pms = reference$per.1M.sites[1]

    theo = data.frame(parallel.type="theoretic", end.to.end.time=timeAt1/nways, nWaysParallel=nways, per.1M.sites = my.pms, job.run.time = my.runtime / nways)

    subd = rbind(subd, theo)

    print(summary(subd))

    byWays = subd[order(subd$nWaysParallel),]
    print(xyplot(log10(end.to.end.time) + per.1M.sites + log10(job.run.time) ~ log2(nWaysParallel), data=byWays, group=parallel.type, type="b", outer=TRUE, scale=list(relation="free"), auto.key=TRUE, lwd=c(2,2,1), main=name))

    return(subd)
}
|
||||
|
||||
# Draw one scaling plot per dataset found in results.new.dat.
# lattice provides xyplot(), which plot1() calls.
require("lattice")
myData <- read.table("results.new.dat", header=TRUE)

# Iterate over the table that was just read (the loop used to read an undefined `d`).
for (name in unique(myData$dataset)) {
    plot1(myData, name)
}
|
||||
|
|
@ -1,121 +0,0 @@
|
|||
import sys
|
||||
from optparse import OptionParser
|
||||
from itertools import *
|
||||
import random
|
||||
import re
|
||||
import datetime
|
||||
|
||||
# a simple script that does:
|
||||
# 1 -- generates a master set of variants following the neutral expectation from a single big population
|
||||
# 2 -- randomly generates M individuals with variants and genotypes sampled as expected from the big population of variants
|
||||
# 3 -- writes out the genotypes of these individuals, and their allele frequency
|
||||
def main():
    """Summarize run logs as a space-separated table on stdout.

    Each positional argument is a log file. Every line of a log is offered to
    the capture* helpers, which pick out the start time, end time, per-1M-sites
    figure and runtime; a line matching the ptype/nways pattern supplies the
    dataset name, parallel type and degree of parallelism. One header row is
    printed, then one row per log file.
    """
    global OPTIONS
    usage = "usage: %prog [options] outputFile"
    parser = OptionParser(usage=usage)

    (OPTIONS, args) = parser.parse_args()
    if len(args) == 0:
        parser.error("Requires at least one argument")

    print('file dataset parallel.type nWaysParallel start.time end.time end.to.end.time per.1M.sites job.run.time')
    typere = r'.*/(.*).ptype_(\w+).nways_(\d+).*'
    for file in args:
        startTime, endTime, perMSites, runtime = None, None, None, None
        # Reset per file so a log without a matching line cannot inherit the
        # previous file's values (or hit an unbound name on the first file).
        dataset, parallelType, nWays = None, None, None
        # The with-block closes the log even if a helper raises.
        with open(file) as log:
            for line in log:
                match = re.match(typere, line)
                if match != None: dataset, parallelType, nWays = match.groups()
                startTime = captureStartTime(line, startTime)
                perMSites = capturePerMSites(line, perMSites)
                endTime = captureEndTime(line, endTime)
                runtime = captureRuntime(line, runtime)
        row = [file, dataset, parallelType, nWays, formatTime(startTime), formatTime(endTime), endToEnd(endTime, startTime), perMSites, runtime]
        # Joining str() of each field gives the same text as a comma-separated print statement.
        print(' '.join(str(field) for field in row))
|
||||
|
||||
def endToEnd(endTime, startTime):
    """Return the minutes elapsed between startTime and endTime.

    An end that sorts before the start is pushed forward by one day before the
    difference is taken.
    """
    crossedMidnight = endTime < startTime
    if crossedMidnight:
        endTime += datetime.timedelta(1)
    elapsed = endTime - startTime
    return total_minutes(elapsed)
|
||||
|
||||
def formatTime(t):
    """Render the datetime t as text using the module-level formatString."""
    rendered = datetime.datetime.strftime(t, formatString)
    return rendered
|
||||
|
||||
def total_minutes(td):
    """Return the length of the timedelta td in minutes, ignoring microseconds."""
    fromDays = td.days * 24 * 60
    fromSeconds = td.seconds / 60.0
    return fromDays + fromSeconds
|
||||
|
||||
def captureLine(line, regex, func, prevValue):
    """Extract a value from line if it matches regex, else return None.

    On a match the result is func(line) when func is given, otherwise the
    regex's first capture group. prevValue is accepted but not used.
    """
    found = regex.match(line)
    if found is None:
        return None
    if func is not None:
        return func(line)
    return found.group(1)
|
||||
|
||||
formatString = "%H:%M:%S"
|
||||
|
||||
def captureStartTime(line, prev):
    """Return the earlier of prev and the timestamp on a toolkit banner line.

    Lines that are not banner lines leave prev unchanged. Example banner:
    INFO 11:03:50,202 HelpFormatter - The Genome Analysis Toolkit (GATK) v<unknown>, Compiled <unknown>
    """
    banner = re.compile("INFO\W*(\d+:\d+:\d+).*The Genome Analysis Toolkit.*")
    stamp = captureLine(line, banner, None, prev)
    return selectTime(stamp, prev, earlier = True)
|
||||
|
||||
def selectTime(newTimeString, oldTime, earlier = False):
    """Choose between a previously selected time and a newly captured one.

    newTimeString is parsed with the module-level formatString. With no new
    string the old time is kept; with no old time the new one is taken.
    Otherwise the earlier of the two wins when earlier is true and the later
    one wins when it is false; ties keep the old time.
    """
    if newTimeString is None:
        return oldTime

    candidate = datetime.datetime.strptime(newTimeString, formatString)
    if oldTime is None:
        return candidate

    if earlier:
        return candidate if candidate < oldTime else oldTime
    return candidate if candidate > oldTime else oldTime
|
||||
|
||||
|
||||
def captureEndTime(line, prev):
    """Return the later of prev and the timestamp on a run-report line.

    Lines that do not mention the GATKRunReport aggregation step leave prev
    unchanged.
    """
    runReport = re.compile("INFO\W*(\d+:\d+:\d+).*GATKRunReport - Aggregating data for run report.*")
    stamp = captureLine(line, runReport, None, prev)
    return selectTime(stamp, prev, earlier=False)
|
||||
|
||||
# Number of minutes in one of each time unit, keyed by the unit's one-letter
# suffix. Used to convert "<value> <unit>" pairs to minutes.
unitsToMinutes = {
    'm' : 1.0,
    'h' : 60,
    's' : 1.0/60,
    'd' : 24 * 60   # a day is 24 hours of 60 minutes, not 60 * 60
}
|
||||
|
||||
def capturePerMSites(line, prev):
    """Return the value/unit pair at whitespace fields 8-9 of a done line, in minutes.

    Lines that are not done lines leave prev unchanged.
    """
    firstField, stopField = 8, 10
    return captureDoneLine(line, prev, firstField, stopField)
|
||||
|
||||
def captureRuntime(line, prev):
    """Return the value/unit pair at whitespace fields 6-7 of a done line, in minutes.

    Lines that are not done lines leave prev unchanged.
    """
    firstField, stopField = 6, 8
    return captureDoneLine(line, prev, firstField, stopField)
|
||||
|
||||
def captureDoneLine(line, prev, s, e):
    """Convert one "<value> <unit>" pair from a TraversalEngine done line to minutes.

    The line is split on whitespace and fields s..e-1 are taken as the value
    and its unit suffix (a key of unitsToMinutes). Lines that do not match
    leave prev unchanged. Example of the progress-line layout:
    INFO 11:04:11,541 TraversalEngine - chr1:3769010 1.32e+05 20.0 s 2.5 m 1.5% 21.9 m 21.5 m
    """
    # NOTE(review): "done*" matches "don" followed by any number of "e"s; "done.*"
    # was probably intended - confirm before tightening.
    doneLine = re.compile("INFO .*TraversalEngine -.*done*")
    pair = captureLine(line, doneLine, lambda text: text.split()[s:e], None)
    if pair is None:
        return prev
    amount, unit = pair
    return float(amount) * unitsToMinutes[unit]
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -1,200 +0,0 @@
|
|||
import org.broadinstitute.sting.queue.extensions.gatk._
|
||||
import org.broadinstitute.sting.queue.extensions.samtools.SamtoolsIndexFunction
|
||||
import org.broadinstitute.sting.queue.QScript
|
||||
import org.apache.commons.io.FilenameUtils;
|
||||
|
||||
class DistributedGATKPerformance extends QScript {
|
||||
qscript =>
|
||||
|
||||
@Argument(shortName="gatk", doc="gatk jar file", required=true)
|
||||
var gatkJarFile: File = _
|
||||
|
||||
@Argument(shortName="outputDir", doc="output directory", required=false)
|
||||
var outputDir: String = ""
|
||||
|
||||
@Argument(shortName="dataset", doc="selects the datasets to run. If not provided, all datasets will be used", required=false)
|
||||
var datasets: List[String] = Nil
|
||||
|
||||
@Argument(shortName="waysParallel", doc="selects the datasets to run. If not provided, all datasets will be used", required=false)
|
||||
var waysParallelArg: List[Int] = Nil
|
||||
|
||||
@Argument(shortName="long", doc="runs long calculations", required=false)
|
||||
var long: Boolean = false
|
||||
|
||||
@Argument(shortName="test", doc="runs long calculations", required=false)
|
||||
var test: Boolean = false
|
||||
|
||||
@Argument(shortName="limitTo30Min", doc="runs long calculations", required=false)
|
||||
var limitTo30Min: Boolean = false
|
||||
|
||||
@Argument(shortName="huge", doc="runs long calculations", required=false)
|
||||
var huge: Int = -1
|
||||
|
||||
@Argument(shortName="justDist", doc="runs long calculations", required=false)
|
||||
var justDist: Boolean = false
|
||||
|
||||
@Argument(shortName="justSG", doc="runs long calculations", required=false)
|
||||
var justSG: Boolean = false
|
||||
|
||||
@Argument(shortName="trackerDir", doc="root directory for distributed tracker files", required=false)
|
||||
var trackerDir: String = "" // "/humgen/gsa-scr1/depristo/tmp/"
|
||||
|
||||
trait UNIVERSAL_GATK_ARGS extends CommandLineGATK { logging_level = "DEBUG"; jarFile = gatkJarFile; memoryLimit = 2; }
|
||||
|
||||
class Target(
|
||||
val baseName: String,
|
||||
val reference: File,
|
||||
val dbsnpFile: String,
|
||||
val hapmapFile: String,
|
||||
val maskFile: String,
|
||||
val bamList: File,
|
||||
val goldStandard_VCF: File,
|
||||
val intervals: String,
|
||||
val titvTarget: Double,
|
||||
val isLowpass: Boolean,
|
||||
val useBAQ: Boolean) {
|
||||
val name = qscript.outputDir + baseName
|
||||
val clusterFile = new File(name + ".clusters")
|
||||
def rawVCF(part: String) = new File(name + "." + part + ".raw.vcf")
|
||||
val filteredVCF = new File(name + ".filtered.vcf")
|
||||
val titvRecalibratedVCF = new File(name + ".titv.recalibrated.vcf")
|
||||
val tsRecalibratedVCF = new File(name + ".ts.recalibrated.vcf")
|
||||
val goldStandardName = qscript.outputDir + "goldStandard/" + baseName
|
||||
val goldStandardClusterFile = new File(goldStandardName + ".clusters")
|
||||
}
|
||||
|
||||
val hg18 = new File("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")
|
||||
val b36 = new File("/humgen/1kg/reference/human_b36_both.fasta")
|
||||
val b37 = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
|
||||
val dbSNP_hg18 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_130_hg18.rod"
|
||||
val dbSNP_b36 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_130_b36.rod"
|
||||
val dbSNP_b37 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_132_b37.leftAligned.vcf"
|
||||
val hapmap_hg18 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.hg18_fwd.vcf"
|
||||
val hapmap_b36 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b36_fwd.vcf"
|
||||
val hapmap_b37 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf"
|
||||
val indelMask_b36 = "/humgen/1kg/processing/pipeline_test_bams/pilot1.dindel.mask.b36.bed"
|
||||
val indelMask_b37 = "/humgen/1kg/processing/pipeline_test_bams/pilot1.dindel.mask.b37.bed"
|
||||
|
||||
// ToDos:
|
||||
// reduce the scope of the datasets so the script is more nimble
|
||||
// figure out how to give names to all the Queue-LSF logs (other than Q-1931@node1434-24.out) so that it is easier to find logs for certain steps
|
||||
// create gold standard BAQ'd bam files, no reason to always do it on the fly
|
||||
|
||||
// Analysis to add at the end of the script:
|
||||
// auto generation of the cluster plots
|
||||
// spike in NA12878 to the exomes and to the lowpass, then analyze how many of her variants are recovered compared to single-sample exome or HiSeq calls
|
||||
// produce Kiran's Venn plots based on comparison between new VCF and gold standard produced VCF
|
||||
|
||||
val lowPass: Boolean = true
|
||||
|
||||
val targetDataSets: Map[String, Target] = Map(
|
||||
"HiSeq" -> new Target("NA12878.HiSeq", hg18, dbSNP_hg18, hapmap_hg18,
|
||||
"/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/HiSeq.WGS.cleaned.indels.10.mask",
|
||||
new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam"),
|
||||
new File("/home/radon01/depristo/work/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/HiSeq.WGS.cleaned.ug.snpfiltered.indelfiltered.vcf"),
|
||||
"/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/distributedGATK/whole_genome_chunked.hg18.intervals", 2.07, !lowPass, true),
|
||||
"FIN" -> new Target("FIN", b37, dbSNP_b37, hapmap_b37, indelMask_b37,
|
||||
new File("/humgen/1kg/processing/pipeline_test_bams/FIN.79sample.Nov2010.chr20.bam"),
|
||||
new File("/humgen/gsa-hpprojects/dev/data/AugChr20Calls_v4_3state/ALL.august.v4.chr20.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED **
|
||||
"/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/distributedGATK/whole_genome_chunked.chr20.hg19.intervals", 2.3, lowPass, true),
|
||||
"WEx" -> new Target("NA12878.WEx", hg18, dbSNP_hg18, hapmap_hg18,
|
||||
"/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/GA2.WEx.cleaned.indels.10.mask",
|
||||
new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.WEx.cleaned.recal.bam"),
|
||||
new File("/home/radon01/depristo/work/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/GA2.WEx.cleaned.ug.snpfiltered.indelfiltered.vcf"),
|
||||
"/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.targets.interval_list", 2.6, !lowPass, true),
|
||||
"TGPWExGdA" -> new Target("1000G.WEx.GdA", b37, dbSNP_b37, hapmap_b37, indelMask_b37,
|
||||
new File("/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/distributedGATK/Barcoded_1000G_WEx_Reduced_Plate_1.20.cleaned.list"), // BUGBUG: reduce from 60 to 20 people
|
||||
new File("/humgen/gsa-scr1/delangel/NewUG/calls/AugustRelease.filtered_Q50_QD5.0_SB0.0.allSamples.SNPs_hg19.WEx_UG_newUG_MQC.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED **
|
||||
"/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 2.6, !lowPass, true),
|
||||
"LowPassN60" -> new Target("lowpass.N60", b36, dbSNP_b36, hapmap_b36, indelMask_b36,
|
||||
new File("/humgen/1kg/analysis/bamsForDataProcessingPapers/lowpass_b36/lowpass.chr20.cleaned.matefixed.bam"), // the bam list to call from
|
||||
new File("/home/radon01/depristo/work/oneOffProjects/VQSRCutByNRS/lowpass.N60.chr20.filtered.vcf"), // the gold standard VCF file to run through the VQSR
|
||||
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.b36.intervals", 2.3, lowPass,true), // chunked interval list to use with Queue's scatter/gather functionality
|
||||
"LowPassAugust" -> new Target("ALL.august.v4", b37, dbSNP_b37, hapmap_b37, indelMask_b37, // BUGBUG: kill this, it is too large
|
||||
new File("/humgen/1kg/processing/allPopulations_chr20_august_release.cleaned.merged.bams/ALL.cleaned.merged.list"),
|
||||
new File("/humgen/gsa-hpprojects/dev/data/AugChr20Calls_v4_3state/ALL.august.v4.chr20.filtered.vcf"),
|
||||
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals", 2.3, lowPass, true),
|
||||
"LowPassEUR363Nov" -> new Target("EUR.nov2010", b37, dbSNP_b37, hapmap_b37, indelMask_b37,
|
||||
new File("/humgen/1kg/processing/pipeline_test_bams/EUR.363sample.Nov2010.chr20.bam"),
|
||||
new File("/humgen/gsa-hpprojects/dev/data/AugChr20Calls_v4_3state/ALL.august.v4.chr20.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED **
|
||||
"/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/distributedGATK/whole_genome_chunked.chr20.hg19.intervals", 2.3, lowPass,false),
|
||||
"WExTrio" -> new Target("NA12878Trio.WEx", b37, dbSNP_b37, hapmap_b37, indelMask_b37,
|
||||
new File("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WEx.bwa.cleaned.recal.bams.list"),
|
||||
new File("/humgen/gsa-scr1/delangel/NewUG/calls/AugustRelease.filtered_Q50_QD5.0_SB0.0.allSamples.SNPs_hg19.WEx_UG_newUG_MQC.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED **
|
||||
"/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 2.6, !lowPass, true)
|
||||
)
|
||||
|
||||
def getTargetInterval(target: Target): List[String] = target.name match {
|
||||
case "NA12878.HiSeq" => List("chr1")
|
||||
case "FIN" => List("20")
|
||||
case "ALL.august.v4" => List("20")
|
||||
case "EUR.nov2010" => List("20")
|
||||
case _ => List(target.intervals)
|
||||
}
|
||||
|
||||
def script = {
|
||||
|
||||
// Selects the datasets in the -dataset argument and adds them to targets.
|
||||
var targets: List[Target] = List()
|
||||
if (!datasets.isEmpty)
|
||||
for (ds <- datasets)
|
||||
targets ::= targetDataSets(ds) // Could check if ds was mispelled, but this way an exception will be thrown, maybe it's better this way?
|
||||
else // If -dataset is not specified, all datasets are used.
|
||||
for (targetDS <- targetDataSets.valuesIterator) // for Scala 2.7 or older, use targetDataSets.values
|
||||
targets ::= targetDS
|
||||
|
||||
val nWays = if ( test ) List(32) else { if ( long ) List(1,2,4,8) else if ( huge != -1 ) List(huge) else List(16,32,64,128) }
|
||||
//val nWays = List(2)
|
||||
|
||||
for (target <- targets) {
|
||||
for ( scatterP <- if ( test ) List(false) else if ( justSG ) List(true) else if ( justDist ) List(false) else List(true, false) )
|
||||
for (nWaysParallel <- nWays ) {
|
||||
val aname = "ptype_%s.nways_%d".format(if ( scatterP ) "sg" else "dist", nWaysParallel)
|
||||
|
||||
def addUG(ug: UnifiedGenotyper) = {
|
||||
if ( ! long )
|
||||
ug.jobLimitSeconds = 60 * 60 * 4
|
||||
if ( limitTo30Min )
|
||||
ug.jobLimitSeconds = 60 * 30
|
||||
add(ug);
|
||||
}
|
||||
|
||||
// add scatter/gather or distributed parallelism
|
||||
if ( scatterP ) {
|
||||
var ug: UnifiedGenotyper = new UnifiedGenotyper(target, aname)
|
||||
ug.scatterCount = nWaysParallel
|
||||
ug.intervalsString ++= List(target.intervals)
|
||||
addUG(ug)
|
||||
} else {
|
||||
for ( part <- 1 to nWaysParallel) {
|
||||
var ug: UnifiedGenotyper = new UnifiedGenotyper(target, aname + ".part" + part)
|
||||
ug.intervalsString ++= getTargetInterval(target)
|
||||
ug.processingTracker = new File(trackerDir + target.name + "." + aname + ".distributed.txt")
|
||||
ug.processingTrackerID = part
|
||||
if ( part == 1 )
|
||||
ug.performanceLog = new File("%s.%s.pf.log".format(target.name, aname))
|
||||
ug.processingTrackerStatusFile = new File("%s.%s.%d.ptstatus.log".format(target.name, aname, part))
|
||||
addUG(ug)
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 1.) Call SNPs with UG
|
||||
class UnifiedGenotyper(t: Target, aname: String) extends org.broadinstitute.sting.queue.extensions.gatk.UnifiedGenotyper with UNIVERSAL_GATK_ARGS {
|
||||
this.reference_sequence = t.reference
|
||||
this.dcov = if ( t.isLowpass ) { 50 } else { 250 }
|
||||
this.stand_call_conf = if ( t.isLowpass ) { 4.0 } else { 30.0 }
|
||||
this.stand_emit_conf = if ( t.isLowpass ) { 4.0 } else { 30.0 }
|
||||
this.input_file :+= t.bamList
|
||||
this.out = t.rawVCF(aname)
|
||||
this.baq = if (t.useBAQ) {org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.RECALCULATE} else {org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.OFF}
|
||||
this.analysisName = t.name + "_UG." + aname
|
||||
if (t.dbsnpFile.endsWith(".rod"))
|
||||
this.DBSNP = new File(t.dbsnpFile)
|
||||
else if (t.dbsnpFile.endsWith(".vcf"))
|
||||
this.rodBind :+= RodBind("dbsnp", "VCF", t.dbsnpFile)
|
||||
}
|
||||
}
|
||||
|
|
@ -1,3 +0,0 @@
|
|||
#d <- read.table("../GATK/trunk/timer.dat", header=T)
|
||||
require("lattice")
|
||||
print(xyplot(elapsed.time + delta ~ cycle | name, data=d, scales=list(relation="free"), auto.key=T, type="b", outer=T))
|
||||
|
|
@ -1 +0,0 @@
|
|||
grep -l -e "ptype_sg" -e "part1\." short/Q-*.out long/Q-*.out > toTime.txt
|
||||
|
|
@ -1 +0,0 @@
|
|||
echo "63025520" | awk '{ for(i = 0; i < $1; i += 100000) {print "20:" i+1 "-" (i+100000 < $1 ? i+100000 : $1)}}' > whole_genome_chunked.chr20.hg19.intervals
|
||||
|
|
@ -1,34 +0,0 @@
|
|||
JOB_START_RATE = 0.1 # chance of starting is 0.1
|
||||
WORK_UNITS = 100
|
||||
WORK_RATE = 1
|
||||
N_TICKS = 300
|
||||
|
||||
ticks <- 1:N_TICKS
|
||||
|
||||
# the probability that a job starts at exactly tick i
|
||||
pThreadStartAtTick <- function(i) {
|
||||
dexp(i, JOB_START_RATE)
|
||||
}
|
||||
|
||||
jobDoneByI <- function(i) {
|
||||
return(sapply(i - ticks, function(x) max(x, 0)) * WORK_RATE)
|
||||
#return(pCompleteAtI(i, pStarts, ticks))
|
||||
}
|
||||
|
||||
pThreadDoneByI <- function(i) {
|
||||
pStarts <- pThreadStartAtTick(ticks)
|
||||
workDoneByThreadStartingAtI <- jobDoneByI(i)
|
||||
fracDone <- workDoneByThreadStartingAtI / WORK_UNITS
|
||||
doneAtI <- fracDone >= 1
|
||||
return(sum(pStarts * doneAtI))
|
||||
}
|
||||
|
||||
pThreadsDoneByI <- function(i, nThreads) {
|
||||
pDone <- rep(0, N_TICKS)
|
||||
for ( thread : 1:nThreads )
|
||||
pDone <- pPrevThreadsNotDoneAtI(pDone, i) + pThreadDoneByI(i)
|
||||
}
|
||||
|
||||
#plot(ticks, workDoneByI(100))
|
||||
plot(ticks, sapply(ticks, function(i) pThreadDoneByI(i)))
|
||||
|
||||
|
|
@ -1,11 +0,0 @@
|
|||
#!/bin/tcsh
|
||||
|
||||
setenv CMD "java -Djava.io.tmpdir=/broad/shptmp/depristo/tmp -jar /humgen/gsa-scr1/depristo/dev/GenomeAnalysisTKFromLaptop/trunk/dist/Queue.jar -statusTo depristo -S /humgen/gsa-scr1/depristo/dev/GenomeAnalysisTKFromLaptop/trunk/analysis/depristo/distributedGATK/distributedGATKPerformance.scala -bsub --gatkjarfile /humgen/gsa-scr1/depristo/dev/GenomeAnalysisTKFromLaptop/trunk/dist/GenomeAnalysisTK.jar -dataset HiSeq $argv[2-$#argv]"
|
||||
|
||||
if ( $1 == 1 ) then
|
||||
pushd short; $CMD -jobQueue hour -run &
|
||||
else if ( $1 == 2 ) then
|
||||
pushd long; $CMD -jobQueue gsa -long -run &
|
||||
else
|
||||
$CMD
|
||||
endif
|
||||
|
|
@ -1,40 +0,0 @@
|
|||
require("lattice")
|
||||
require("ggplot2")
|
||||
require("splines")
|
||||
|
||||
ymax = xmax = 30
|
||||
HAVE_RAW_DATA = F
|
||||
if ( HAVE_RAW_DATA ) {
|
||||
inputDataFile = "~/Dropbox/Analysis/genotypeAccuracy/NA12878.hm3.vcf.cgl.table"
|
||||
#inputDataFile = "~/Dropbox/Analysis/genotypeAccuracy/cgl.table.gz"
|
||||
r <- digestTable(inputDataFile)
|
||||
d = r$d
|
||||
eByComp = r$eByComp
|
||||
countsByTech = addEmpiricalPofG(ddply(d, .(ref, alt, technology, pGGivenDType, pGGivenD), genotypeCounts))
|
||||
print(qplot(pGGivenD, EmpiricalPofGQ, data=subset(countsByTech, technology=="HiSeq-paper" & pGGivenDType == "QofABGivenD"), facets = alt ~ ref, color=alt, geom=c("point"), group=alt, xlim=c(0,xmax), ylim=c(0,ymax))
|
||||
+ geom_abline(slope=1, linetype=2))
|
||||
# + geom_smooth(se=T, size=1.5, aes(weight=Sum)))
|
||||
} else {
|
||||
eByComp = read.table("~/Dropbox/GSA members/Analysis/genotypeAccuracy/NA12878.hm3.vcf.cgl.table.eByComp.tsv", header=T)
|
||||
}
|
||||
|
||||
#print(subset(countsByTech, pGGivenD > 18 & pGGivenD < 22 & pGGivenDType == "QofABGivenD"))
|
||||
#print(subset(eByComp, EmpiricalPofGQ < Inf))
|
||||
|
||||
goodEByComp = subset(eByComp, Sum > 10 & EmpiricalPofGQ < Inf)
|
||||
|
||||
print(qplot(pGGivenD, EmpiricalPofGQ, data=goodEByComp, size=log10(Sum), facets = pGGivenDType ~ technology, color=pGGivenDType, geom=c("point", "smooth"), group=pGGivenDType, xlim=c(0,xmax), ylim=c(0,ymax)) + geom_abline(slope=1, linetype=2))
|
||||
|
||||
print(qplot(pGGivenD, EmpiricalPofGQ, data=goodEByComp, facets = pGGivenDType ~ technology, color=rg, geom=c("blank"), group=rg, xlim=c(0,xmax), ylim=c(0,ymax))
|
||||
+ geom_abline(slope=1, linetype=2)
|
||||
+ geom_smooth(se=F, aes(weight=Sum)))
|
||||
|
||||
print(qplot(pGGivenD, pGGivenD - EmpiricalPofGQ, data=goodEByComp, facets = pGGivenDType ~ technology, color=rg, geom=c("blank"), group=rg, xlim=c(0,xmax), ylim=c(-10,10))
|
||||
+ geom_abline(slope=0, linetype=2)
|
||||
+ geom_smooth(se=F, method=lm, formula = y ~ ns(x,1), aes(weight=Sum)))
|
||||
|
||||
# By tech
|
||||
print(qplot(pGGivenD, EmpiricalPofGQ, data=goodEByComp, facets = pGGivenDType ~ ., color=technology, geom=c("blank"), group=technology, xlim=c(0,xmax), ylim=c(0,ymax))
|
||||
+ geom_abline(slope=1, linetype=2)
|
||||
+ geom_smooth(se=T, size=1.5, aes(weight=Sum)))
|
||||
|
||||
|
|
@ -1,62 +0,0 @@
|
|||
#!/bin/env Rscript
|
||||
|
||||
require("ggplot2")
|
||||
|
||||
args <- commandArgs(TRUE)
|
||||
verbose = TRUE
|
||||
|
||||
inputDataFile = args[1]
|
||||
onCmdLine = ! is.na(inputDataFile)
|
||||
|
||||
addEmpiricalPofG <- function(d) {
|
||||
r = c()
|
||||
#
|
||||
# TODO -- this is a really naive estimate of the accuracy, as it assumes the comp
|
||||
# track is perfect. In reality the chip is at best Q30 accurate (replicate samples have
|
||||
# level than this level of concordance). At low incoming confidence, we can effectively
|
||||
# ignore this term but when the incoming Q is near or above Q30 this approximation clearly
|
||||
# breaks down.
|
||||
#
|
||||
for ( i in 1:dim(d)[1] ) {
|
||||
row = d[i,]
|
||||
if ( row$pGGivenDType == "QofAAGivenD" ) v = row$HOM_REF
|
||||
if ( row$pGGivenDType == "QofABGivenD" ) v = row$HET
|
||||
if ( row$pGGivenDType == "QofBBGivenD" ) v = row$HOM_VAR
|
||||
r = c(r, v / row$Sum)
|
||||
}
|
||||
|
||||
#print(length(r))
|
||||
d$EmpiricalPofG = r
|
||||
d$EmpiricalPofGQ = round(-10*log10(1-r))
|
||||
return(d)
|
||||
}
|
||||
|
||||
genotypeCounts <- function(x) {
|
||||
type = unique(x$variable)[1]
|
||||
t = addmargins(table(x$comp))
|
||||
return(t)
|
||||
}
|
||||
|
||||
|
||||
digestTable <- function(inputDataFile) {
|
||||
d = subset(read.table(inputDataFile, header=T), rg != "ALL")
|
||||
d$technology <- factor(1, levels=c("HiSeq-paper", "GA2-1000G", "HiSeq-recent"))
|
||||
d$technology[grepl("ERR.*", d$rg)] <- "GA2-1000G"
|
||||
d$technology[grepl("20.*", d$rg)] <- "HiSeq-paper"
|
||||
d$technology[grepl("B00EG.*", d$rg)] <- "HiSeq-recent"
|
||||
print(summary(d$technology))
|
||||
|
||||
eByComp = addEmpiricalPofG(ddply(d, .(rg, technology, pGGivenDType, pGGivenD), genotypeCounts))
|
||||
return(list(d=d, eByComp = eByComp))
|
||||
#countsByTech = addEmpiricalPofG(ddply(d, .(technology, pGGivenDType, pGGivenD), genotypeCounts))
|
||||
}
|
||||
|
||||
writeMyTable <- function(t, name) {
|
||||
write.table(t,file=paste(inputDataFile, ".", name, ".tsv", sep=""))
|
||||
}
|
||||
|
||||
if ( onCmdLine ) {
|
||||
r <- digestTable(inputDataFile)
|
||||
writeMyTable(r$eByComp, "eByComp")
|
||||
}
|
||||
|
||||
|
|
@ -1,12 +0,0 @@
|
|||
{
|
||||
"Statement": [
|
||||
{
|
||||
"Sid": "Stmt1296439478068",
|
||||
"Action": [
|
||||
"s3:PutObject"
|
||||
],
|
||||
"Effect": "Allow",
|
||||
"Resource": "arn:aws:s3:::GATK_Run_Reports/*"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -1,2 +0,0 @@
|
|||
AKIAJXU7VIHBPDW4TDSQ
|
||||
uQLTduhK6Gy8mbOycpoZIxr8ZoVj1SQaglTWjpYA
|
||||
|
|
@ -1,8 +0,0 @@
|
|||
{
|
||||
"Statement":[{
|
||||
"Effect":"Allow",
|
||||
"Action":"*",
|
||||
"Resource":"*"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -1,12 +0,0 @@
|
|||
{
|
||||
"Statement": [
|
||||
{
|
||||
"Sid": "Stmt1296439478068",
|
||||
"Action": [
|
||||
"s3:PutObject"
|
||||
],
|
||||
"Effect": "Allow",
|
||||
"Resource": "arn:aws:s3:::IGV_crowdsourcing/*"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -1,2 +0,0 @@
|
|||
AKIAIM64MSUYNQ2465HQ
|
||||
D+l3HfPQFWia9HF8rKh/fJ5+yNYsltWUpj0C7L0Z
|
||||
|
|
@ -1,45 +0,0 @@
|
|||
#!/bin/tcsh
|
||||
|
||||
# download CLI tools
|
||||
# http://aws.amazon.com/developertools/AWS-Identity-and-Access-Management/4143
|
||||
|
||||
setenv JAVA_HOME /usr/
|
||||
setenv AWS_IAM_HOME ~/Downloads/IAMCli-1.1.0
|
||||
setenv PATH $AWS_IAM_HOME/bin:$PATH
|
||||
setenv AWS_CREDENTIAL_FILE /Users/depristo/Desktop/broadLocal/GATK/trunk/account-key
|
||||
|
||||
setenv CREATE_GROUPS false
|
||||
setenv CREATE_IGV_USER false
|
||||
setenv UPDATE_USER_KEYS false
|
||||
setenv UPDATE_USER_POLICY true
|
||||
|
||||
# Create the administrators group:
|
||||
# we aren't actually using this, in fact
|
||||
if ( $CREATE_GROUPS == true ) then
|
||||
iam-groupcreate -g Admins
|
||||
iam-grouplistbypath
|
||||
iam-groupuploadpolicy -g Admins -p AdminsGroupPolicy -f GroupPolicy.txt
|
||||
iam-grouplistpolicies -g Admins
|
||||
endif
|
||||
|
||||
# Create the IGV user -- uncomment if the IGV user needs to be created from scratch
|
||||
# update the secret key
|
||||
if $CREATE_IGV_USER == true then
|
||||
iam-usercreate -u IGV -k -v > IGV_cred.txt
|
||||
endif
|
||||
|
||||
# the user access and secret keys are in the IGV source file IGVRunReport.java
|
||||
# and must be updated to be the most current ones
|
||||
if $UPDATE_USER_KEYS == true then
|
||||
iam-userdelkey -u IGV -k $1 # $1 -> current access key
|
||||
iam-useraddkey -u IGV > IGV_cred.txt
|
||||
cat IGV_cred.txt
|
||||
endif
|
||||
|
||||
echo "IGV user policies"
|
||||
if $UPDATE_USER_POLICY == true then
|
||||
echo "Deleting policy"
|
||||
iam-userdelpolicy -u IGV -p IGVRunReportUploading
|
||||
iam-useruploadpolicy -u IGV -p IGVRunReportUploading -f IGVPolicy.txt
|
||||
endif
|
||||
iam-userlistpolicies -u IGV -v
|
||||
|
|
@ -1,45 +0,0 @@
|
|||
#!/bin/tcsh
|
||||
|
||||
# download CLI tools
|
||||
# http://aws.amazon.com/developertools/AWS-Identity-and-Access-Management/4143
|
||||
|
||||
setenv JAVA_HOME /usr/
|
||||
setenv AWS_IAM_HOME ~/Downloads/IAMCli-1.1.0
|
||||
setenv PATH $AWS_IAM_HOME/bin:$PATH
|
||||
setenv AWS_CREDENTIAL_FILE /Users/depristo/Desktop/broadLocal/GATK/trunk/account-key
|
||||
|
||||
setenv CREATE_GROUPS false
|
||||
setenv CREATE_GATK_USER false
|
||||
setenv UPDATE_USER_KEYS false
|
||||
setenv UPDATE_USER_POLICY true
|
||||
|
||||
# Create the administrators group:
|
||||
# we aren't actually using this, in fact
|
||||
if ( $CREATE_GROUPS == true ) then
|
||||
iam-groupcreate -g Admins
|
||||
iam-grouplistbypath
|
||||
iam-groupuploadpolicy -g Admins -p AdminsGroupPolicy -f GroupPolicy.txt
|
||||
iam-grouplistpolicies -g Admins
|
||||
endif
|
||||
|
||||
# Create the GATK user -- uncomment if the GATK user needs to be created from scratch
|
||||
# update the secret key
|
||||
if $CREATE_GATK_USER == true then
|
||||
iam-usercreate -u GATK -k -v > GATK_cred.txt
|
||||
endif
|
||||
|
||||
# the user access and secret keys are in the GATK source file GATKRunReport.java
|
||||
# and must be updated to be the most current ones
|
||||
if $UPDATE_USER_KEYS == true then
|
||||
iam-userdelkey -u GATK -k $1 # $1 -> current access key
|
||||
iam-useraddkey -u GATK > GATK_cred.txt
|
||||
cat GATK_cred.txt
|
||||
endif
|
||||
|
||||
echo "GATK user policies"
|
||||
if $UPDATE_USER_POLICY == true then
|
||||
echo "Deleting policy"
|
||||
iam-userdelpolicy -u GATK -p GATKRunReportUploading
|
||||
iam-useruploadpolicy -u GATK -p GATKRunReportUploading -f GATKPolicy.txt
|
||||
endif
|
||||
iam-userlistpolicies -u GATK -v
|
||||
|
|
@ -1,41 +0,0 @@
|
|||
#!/broad/tools/apps/R-2.6.0/bin/Rscript
|
||||
|
||||
args <- commandArgs(TRUE)
|
||||
|
||||
input = args[1]
|
||||
|
||||
t=read.table(input, header=T)
|
||||
#t=read.csv(input)
|
||||
#par(mfrow=c(2,1), cex=1.2)
|
||||
|
||||
#outfile = paste(input, ".quality_emp_v_stated.png", sep="")
|
||||
#png(outfile, height=7, width=7, units="in", res=72) # height=1000, width=446)
|
||||
outfile = paste(input, ".quality_emp_v_stated.pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
d.good <- t[t$nMismatches >= 1000,]
|
||||
d.100 <- t[t$nMismatches < 100,]
|
||||
d.1000 <- t[t$nMismatches < 1000 & t$nMismatches >= 100,]
|
||||
plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", xlim=c(0,63), ylim=c(0,63), pch=16, xlab="Reported quality score", ylab="Empirical quality score", main="Reported vs. empirical quality scores")
|
||||
points(d.100$Qreported, d.100$Qempirical, type="p", col="lightblue", pch=16)
|
||||
points(d.1000$Qreported, d.1000$Qempirical, type="p", col="cornflowerblue", pch=16)
|
||||
abline(0,1, lty=2)
|
||||
dev.off()
|
||||
|
||||
#outfile = paste(input, ".quality_emp_hist.png", sep="")
|
||||
#png(outfile, height=7, width=7, units="in", res=72) # height=1000, width=446)
|
||||
outfile = paste(input, ".quality_emp_hist.pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
hst=subset(data.frame(t$Qempirical, t$nBases), t.nBases != 0)
|
||||
plot(hst$t.Qempirical, hst$t.nBases, type="h", lwd=3, xlim=c(0,63), main="Empirical quality score histogram", xlab="Empirical quality score", ylab="Count", yaxt="n")
|
||||
axis(2,axTicks(2), format(axTicks(2), scientific=F))
|
||||
dev.off()
|
||||
|
||||
#
|
||||
# Plot Q reported histogram
|
||||
#
|
||||
outfile = paste(input, ".quality_rep_hist.pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
hst=subset(data.frame(t$Qreported, t$nBases), t.nBases != 0)
|
||||
plot(hst$t.Qreported, hst$t.nBases, type="h", lwd=3, xlim=c(0,63), main="Reported quality score histogram", xlab="Qreported quality score", ylab="Count", yaxt="n")
|
||||
axis(2,axTicks(2), format(axTicks(2), scientific=F))
|
||||
dev.off()
|
||||
|
|
@ -1,20 +0,0 @@
|
|||
#!/broad/tools/apps/R-2.6.0/bin/Rscript
|
||||
|
||||
args <- commandArgs(TRUE)
|
||||
verbose = TRUE
|
||||
|
||||
input = args[1]
|
||||
|
||||
#X11(width=7, height=14)
|
||||
#outfile = paste(input, ".qual_diff_v_cycle.png", sep="")
|
||||
#png(outfile, height=7, width=7, units="in", res=72) #height=1000, width=680)
|
||||
outfile = paste(input, ".qual_diff_v_cycle.pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
par(cex=1.1)
|
||||
c <- read.table(input, header=T)
|
||||
d.good <- c[c$nMismatches >= 100,]
|
||||
d.100 <- c[c$nMismatches < 100,]
|
||||
plot(d.good$Cycle, d.good$Qempirical_Qreported, type="l", ylab="Empirical - Reported Quality", xlab="Cycle", col="blue", ylim=c(-10, 10))
|
||||
points(d.100$Cycle, d.100$Qempirical_Qreported, type="p", col="lightblue", pch=3)
|
||||
#points(d.1000$Cycle, d.1000$Qempirical_Qreported, type="p", col="cornflowerblue", pch=16)
|
||||
|
||||
|
|
@ -1,16 +0,0 @@
|
|||
#!/broad/tools/apps/R-2.6.0/bin/Rscript
|
||||
|
||||
args <- commandArgs(TRUE)
|
||||
verbose = TRUE
|
||||
|
||||
input = args[1]
|
||||
|
||||
#outfile = paste(input, ".qual_diff_v_dinuc.png", sep="")
|
||||
#png(outfile, height=7, width=7, units="in", res=72) #height=1000, width=680)
|
||||
outfile = paste(input, ".qual_diff_v_dinuc.pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
par(cex=1.1)
|
||||
#in_dinuc = paste(input, ".quality_difference_v_dinucleotide.csv", sep="")
|
||||
#d <- read.csv(input)
|
||||
d <- read.table(input, header=T)
|
||||
plot(d$Dinuc, d$Qempirical_Qreported, type="l", ylab="Empirical - Reported Quality", xlab="Dinucleotide", ylim=c(-10,10))
|
||||
|
|
@ -1,273 +0,0 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.walkers.varianteval;
|
||||
|
||||
import org.broad.tribble.util.variantcontext.Genotype;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantEvaluator;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.report.tags.Analysis;
|
||||
import org.broadinstitute.sting.utils.report.tags.DataPoint;
|
||||
import org.broadinstitute.sting.utils.report.utils.TableType;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: chartl
|
||||
* Date: Nov 22, 2010
|
||||
* Time: 12:22:08 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
@Analysis(name = "ACTransitionMatrix", description = "Number of additional genotypes from each new sample; random permutations")
|
||||
public class ACTransitionTable extends VariantEvaluator {
|
||||
private final int NUM_PERMUTATIONS = 50;
|
||||
private final double LOW_GQ_PCT = 0.95;
|
||||
private final double LOW_GQ_THRSH = 30.0;
|
||||
private boolean initialized = false;
|
||||
private long skipped = 0l;
|
||||
|
||||
@DataPoint(name="Het transitions",description="AC[s] = AC[s-1]+1 and AC[s] = AC[s-1]+2 transitions")
|
||||
TransitionTable transitions = null;
|
||||
@DataPoint(name="Private permutations",description="Marginal increase in number of sites per sample")
|
||||
PermutationCounts privatePermutations;
|
||||
@DataPoint(name="AC2 Permutations",description="Marginal increase in number of AC=2 sites, per sample")
|
||||
PermutationCounts doubletonPermutations;
|
||||
@DataPoint(name="AC3 Permutations",description="Marginal increase in number of tripleton sites, per sample")
|
||||
PermutationCounts tripletonPermutations;
|
||||
|
||||
String[][] permutations;
|
||||
|
||||
public boolean enabled() {
|
||||
return true;
|
||||
}
|
||||
|
||||
public int getComparisonOrder() {
|
||||
return 2;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return "ACTransitionTable";
|
||||
}
|
||||
|
||||
public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( eval != null && ! initialized ) {
|
||||
//this.veWalker.getLogger().warn("Initializing...");
|
||||
initialize(eval);
|
||||
initialized = true;
|
||||
}
|
||||
|
||||
if ( isGood(eval) ) {
|
||||
if ( comp != null && ! comp.isFiltered() ) {
|
||||
return null;
|
||||
}
|
||||
|
||||
int order_offset = 0;
|
||||
for ( String[] ordering : permutations ) {
|
||||
int sample_offset = 0;
|
||||
int variant_ac = 0;
|
||||
for ( String sample : ordering ) {
|
||||
if ( eval.getGenotype(sample).isHet() ) {
|
||||
variant_ac++;
|
||||
transitions.hetTransitionCounts[order_offset][variant_ac-1][sample_offset]++;
|
||||
} else if ( eval.getGenotype(sample).isHomVar() ) {
|
||||
variant_ac += 2;
|
||||
transitions.homTransitionCounts[order_offset][variant_ac-1][sample_offset]++;
|
||||
} else {
|
||||
// todo -- note, unclear how to treat no calls. Is the hom in het,ref,ref,nocall,hom sample 4 or 5?
|
||||
// todo -- do we want to tabulate P[sample i is not variant | some variant]? This is just combinatorics so i left it out
|
||||
if ( variant_ac > 0 ) {
|
||||
transitions.stationaryCounts[order_offset][variant_ac-1][sample_offset]++;
|
||||
}
|
||||
}
|
||||
sample_offset ++;
|
||||
}
|
||||
order_offset++;
|
||||
}
|
||||
} else {
|
||||
skipped++;
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
private boolean isGood(VariantContext vc) {
|
||||
if ( vc == null || vc.isFiltered() || (vc.getHetCount() + vc.getHomVarCount() == 0) ) { // todo -- should be is variant, but need to ensure no alt alleles at ref sites
|
||||
return false;
|
||||
} else {
|
||||
Collection<Genotype> gtypes = vc.getGenotypes().values();
|
||||
int ngood = 0;
|
||||
for ( Genotype g : gtypes) {
|
||||
if ( g.isCalled() && g.getPhredScaledQual() >= LOW_GQ_THRSH ) {
|
||||
ngood ++;
|
||||
}
|
||||
}
|
||||
|
||||
return ( (0.0+ngood)/(0.0+gtypes.size()) >= LOW_GQ_PCT );
|
||||
}
|
||||
}
|
||||
|
||||
public ACTransitionTable(VariantEvalWalker parent) {
|
||||
//super(parent);
|
||||
}
|
||||
|
||||
public void initialize(VariantContext vc) {
|
||||
Set<String> permuteSamples = vc.getSampleNames();
|
||||
permutations = new String[NUM_PERMUTATIONS][permuteSamples.size()];
|
||||
//veWalker.getLogger().warn(String.format("Num samples: %d",permuteSamples.size()));
|
||||
int offset = 0;
|
||||
for ( String s : permuteSamples ) {
|
||||
permutations[0][offset] = s;
|
||||
offset ++;
|
||||
}
|
||||
|
||||
for ( int p = 1; p < NUM_PERMUTATIONS ; p++ ) {
|
||||
permutations[p] = permutations[0].clone();
|
||||
for ( int o = 0; o < permutations[p].length; o ++ ) {
|
||||
int r = (int) Math.floor(Math.random()*(o+1));
|
||||
String swap = permutations[p][r];
|
||||
permutations[p][r] = permutations[p][o];
|
||||
permutations[p][o] = swap;
|
||||
}
|
||||
}
|
||||
|
||||
transitions = new TransitionTable();
|
||||
transitions.hetTransitionCounts = new int[NUM_PERMUTATIONS][permuteSamples.size()*2][permuteSamples.size()];
|
||||
transitions.homTransitionCounts = new int[NUM_PERMUTATIONS][permuteSamples.size()*2][permuteSamples.size()];
|
||||
transitions.stationaryCounts = new int[NUM_PERMUTATIONS][permuteSamples.size()*2][permuteSamples.size()];
|
||||
privatePermutations = new PermutationCounts(1,transitions);
|
||||
doubletonPermutations = new PermutationCounts(2,transitions);
|
||||
tripletonPermutations = new PermutationCounts(3,transitions);
|
||||
}
|
||||
|
||||
public void finalizeEvaluation() { // note: data points are null when this is called (wtf?)
|
||||
//veWalker.getLogger().info(String.format("Skipped: %d",skipped));
|
||||
}
|
||||
|
||||
class TransitionTable implements TableType {
|
||||
int[][][] hetTransitionCounts;
|
||||
int[][][] homTransitionCounts;
|
||||
int[][][] stationaryCounts;
|
||||
String[][] countAverages;
|
||||
String[] rowKeys = null;
|
||||
String[] colKeys = null;
|
||||
|
||||
public Object[] getRowKeys() {
|
||||
if ( rowKeys == null ) {
|
||||
rowKeys = new String[3*hetTransitionCounts[0].length];
|
||||
for ( int i = 0; i < hetTransitionCounts[0].length; i ++ ) {
|
||||
rowKeys[i] = String.format("%s%d%s","AC_",i,"_(het)");
|
||||
}
|
||||
for ( int i = 0; i < hetTransitionCounts[0].length; i ++ ) {
|
||||
rowKeys[hetTransitionCounts[0].length+i] = String.format("%s%d%s","AC_",i,"_(hom)");
|
||||
}
|
||||
for ( int i = 0; i < hetTransitionCounts[0].length; i ++ ) {
|
||||
rowKeys[2*hetTransitionCounts[0].length+i] = String.format("%s%d%s","AC_",i,"_(ref)");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return rowKeys;
|
||||
}
|
||||
|
||||
public String getCell(int x, int y) {
|
||||
if ( countAverages == null ) {
|
||||
countAverages = new String[hetTransitionCounts[0].length*3][hetTransitionCounts[0][0].length];
|
||||
for ( int sam = 0; sam < hetTransitionCounts[0][0].length; sam ++) {
|
||||
for ( int idx = 0 ; idx < hetTransitionCounts[0].length; idx ++ ) {
|
||||
int totalTimesAtACSample = 0;
|
||||
int totalStationary = 0;
|
||||
int totalAC1Shift = 0;
|
||||
int totalAC2Shift = 0;
|
||||
for ( int p = 0; p < hetTransitionCounts.length; p++ ) {
|
||||
totalStationary += stationaryCounts[p][idx][sam];
|
||||
totalAC2Shift += (idx+2 >= hetTransitionCounts[0][0].length) ? 0 : homTransitionCounts[p][idx+2][sam];
|
||||
totalAC1Shift += (idx+1 >= hetTransitionCounts[0][0].length) ? 0 : hetTransitionCounts[p][idx+1][sam];
|
||||
}
|
||||
totalTimesAtACSample = totalStationary+totalAC1Shift+totalAC2Shift;
|
||||
countAverages[idx][sam] = formatProp(totalAC1Shift,totalTimesAtACSample);
|
||||
countAverages[hetTransitionCounts[0].length+idx][sam] = formatProp(totalAC2Shift,totalTimesAtACSample);
|
||||
countAverages[hetTransitionCounts[0].length*2+idx][sam] = formatProp(totalStationary,totalTimesAtACSample);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return countAverages[x][y] == null ? "0.00" : countAverages[x][y];
|
||||
}
|
||||
|
||||
private String formatProp(int num, int denom) {
|
||||
return (denom != 0) ? String.format("%.4f", ((double) num)/denom) : "0.0";
|
||||
}
|
||||
|
||||
/** @return the fixed display name of this table */
public String getName() {
    final String tableName = "AC Transition Tables";
    return tableName;
}
|
||||
|
||||
public Object[] getColumnKeys() {
|
||||
if ( colKeys == null ) {
|
||||
colKeys = new String[hetTransitionCounts[0][0].length];
|
||||
for ( int ac = 0; ac < hetTransitionCounts[0][0].length; ac ++ ) {
|
||||
colKeys[ac] = String.format("Sample_%d",ac);
|
||||
}
|
||||
}
|
||||
|
||||
return colKeys;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
class PermutationCounts implements TableType {
|
||||
int acToExtract;
|
||||
TransitionTable table;
|
||||
String[] rowNames;
|
||||
String[] colNames;
|
||||
|
||||
public PermutationCounts(int ac, TransitionTable tTable) {
|
||||
acToExtract = ac;
|
||||
table = tTable;
|
||||
}
|
||||
|
||||
public String[] getRowKeys() {
|
||||
//System.out.printf("%s%n",table);
|
||||
if ( rowNames == null ) {
|
||||
rowNames = new String[table.stationaryCounts.length];
|
||||
for ( int p = 0 ; p < rowNames.length; p ++ ) {
|
||||
rowNames[p] = String.format("Perm%d",p+1);
|
||||
}
|
||||
}
|
||||
|
||||
return rowNames;
|
||||
}
|
||||
|
||||
public String[] getColumnKeys() {
|
||||
if ( colNames == null ) {
|
||||
colNames = new String[table.stationaryCounts[0][0].length];
|
||||
for ( int s = 0 ; s < colNames.length; s ++ ) {
|
||||
colNames[s] = String.format("Sample%d",s+1);
|
||||
}
|
||||
}
|
||||
|
||||
return colNames;
|
||||
}
|
||||
|
||||
public Integer getCell(int x, int y) {
|
||||
return table.hetTransitionCounts[x][acToExtract-1][y] +
|
||||
( (acToExtract > table.homTransitionCounts[0][0].length) ? 0 : table.homTransitionCounts[x][acToExtract-1][y]);
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return String.format("PermutationCountsAC%d",acToExtract);
|
||||
}
|
||||
|
||||
public void init() {
|
||||
getRowKeys();
|
||||
getColumnKeys();
|
||||
getCell(1,1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -1,85 +0,0 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.diagnostics;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.RodGenotypeChipAsGFF;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
|
||||
/**
|
||||
* Takes a BAM file and a Hapmap-chip file (via the -hc argument) and creates a table of reference allele
|
||||
* percentage and alternate allele percentage for het, homvar, and other genotypes.
|
||||
*/
|
||||
public class AlleleBalanceInspector extends LocusWalker<Integer, Integer> {
|
||||
private int item = 1;
|
||||
public void initialize() {
|
||||
out.printf("item\tlocus\tref\tgenotype\tstate\tdepth\trefdepth\taltdepth\trefpct\taltpct%n");
|
||||
}
|
||||
|
||||
public boolean filter(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
RodGenotypeChipAsGFF hc = tracker.lookup("child",RodGenotypeChipAsGFF.class);
|
||||
|
||||
return hc != null && hc.getCalledGenotype().isVariant(ref.getBase());
|
||||
}
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
RodGenotypeChipAsGFF hc = tracker.lookup("child",RodGenotypeChipAsGFF.class);
|
||||
|
||||
String state;
|
||||
if (hc.getCalledGenotype().isHet()) {
|
||||
state = "het";
|
||||
} else if (hc.getCalledGenotype().isHom()) {
|
||||
state = "homvar";
|
||||
} else {
|
||||
state = "other";
|
||||
}
|
||||
|
||||
int refIndex = ref.getBaseIndex();
|
||||
int altIndex = -1;
|
||||
for (char base : hc.getCalledGenotype().getBases().toCharArray()) {
|
||||
int baseIndex = BaseUtils.simpleBaseToBaseIndex(base);
|
||||
|
||||
if (baseIndex != refIndex) {
|
||||
altIndex = baseIndex;
|
||||
}
|
||||
}
|
||||
|
||||
int[] baseCounts = context.getPileup().getBaseCounts();
|
||||
double sum = (double) (baseCounts[refIndex] + baseCounts[altIndex]);
|
||||
double refPct = ((double) baseCounts[refIndex])/sum;
|
||||
double altPct = ((double) baseCounts[altIndex])/sum;
|
||||
|
||||
out.printf("%d\t%s\t%c\t%s\t%s\t%d\t%d\t%d\t%f\t%f%n",
|
||||
item++,
|
||||
context.getLocation(),
|
||||
ref.getBase(),
|
||||
hc.getCalledGenotype().getBases(),
|
||||
state,
|
||||
context.getPileup().getReads().size(),
|
||||
baseCounts[refIndex],
|
||||
baseCounts[altIndex], refPct, altPct);
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Provide an initial value for reduce computations.
|
||||
*
|
||||
* @return Initial value of reduce.
|
||||
*/
|
||||
public Integer reduceInit() {
|
||||
return null; //To change body of implemented methods use File | Settings | File Templates.
|
||||
}
|
||||
|
||||
/**
|
||||
* Reduces a single map with the accumulator provided as the ReduceType.
|
||||
*
|
||||
* @param value result of the map.
|
||||
* @param sum accumulator for the reduce.
|
||||
* @return accumulator with result of the map taken into account.
|
||||
*/
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
return null; //To change body of implemented methods use File | Settings | File Templates.
|
||||
}
|
||||
}
|
||||
|
|
@ -1,212 +0,0 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.walkers.varianteval;
|
||||
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broad.tribble.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantEvaluator;
|
||||
import org.broadinstitute.sting.gatk.walkers.varianteval.tags.Analysis;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.report.tags.DataPoint;
|
||||
import org.broadinstitute.sting.utils.report.utils.TableType;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
|
||||
@Analysis(name = "Allele Frequency Comparison", description = "Compare allele frequency and counts between eval and comp")
|
||||
public class AlleleFrequencyComparison extends VariantEvaluator {
|
||||
private static int MAX_AC_COUNT = 100; // todo -- command line argument?
|
||||
|
||||
@DataPoint(description="Counts of eval frequency versus comp frequency")
|
||||
AFTable afTable = new AFTable();
|
||||
|
||||
@DataPoint(description="Counts of eval AC versus comp AC")
|
||||
ACTable acTable = new ACTable(MAX_AC_COUNT);
|
||||
|
||||
public boolean enabled() { return true; }
|
||||
|
||||
public int getComparisonOrder() { return 2; }
|
||||
|
||||
public String getName() { return "Allele Frequency Comparison"; }
|
||||
|
||||
public AlleleFrequencyComparison(VariantEvalWalker parent) {
|
||||
//super(parent);
|
||||
}
|
||||
|
||||
//public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context, VariantEvalWalker.EvaluationContext group) {
|
||||
public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( ! (isValidVC(eval) && isValidVC(comp)) ) {
|
||||
return null;
|
||||
} else {
|
||||
// todo -- this is a godawful hack. The "right way" isn't working, so do it the unsafe way for now. Note that
|
||||
// todo -- this precludes getting the AC/AF values from the info field because some may not be there...
|
||||
/*if ( missingField(eval) ) {
|
||||
recalculateCounts(eval);
|
||||
}
|
||||
if ( missingField(comp) ) {
|
||||
recalculateCounts(comp);
|
||||
}*/
|
||||
HashMap<String,Object> evalCounts = new HashMap<String,Object>(2);
|
||||
HashMap<String,Object> compCounts = new HashMap<String,Object>(2);
|
||||
|
||||
VariantContextUtils.calculateChromosomeCounts(eval,evalCounts,false);
|
||||
VariantContextUtils.calculateChromosomeCounts(comp,compCounts,false);
|
||||
afTable.update(((List<Double>)evalCounts.get("AF")).get(0),((List<Double>)compCounts.get("AF")).get(0));
|
||||
acTable.update(((List<Integer>)evalCounts.get("AC")).get(0),((List<Integer>)compCounts.get("AC")).get(0));
|
||||
}
|
||||
|
||||
return null; // there is nothing interesting
|
||||
}
|
||||
|
||||
private static boolean missingField(final VariantContext vc) {
|
||||
return ! ( vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) && vc.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) );
|
||||
}
|
||||
|
||||
private void recalculateCounts(VariantContext vc) {
|
||||
Map<String,Object> attributes = new HashMap<String,Object>();
|
||||
VariantContextUtils.calculateChromosomeCounts(vc,attributes,false);
|
||||
vc = VariantContext.modifyAttributes(vc,attributes);
|
||||
//getLogger().debug(String.format("%s %s | %s %s",attributes.get("AC"),attributes.get("AF"),vc.getAttribute("AC"),vc.getAttribute("AF")));
|
||||
if ( attributes.size() == 2 && missingField(vc) ) {
|
||||
throw new org.broadinstitute.sting.utils.exceptions.StingException("VariantContext should have had attributes modified but did not");
|
||||
}
|
||||
}
|
||||
|
||||
private static boolean isValidVC(final VariantContext vc) {
|
||||
return (vc != null && !vc.isFiltered() && vc.getAlternateAlleles().size() == 1);
|
||||
}
|
||||
|
||||
private static double getAF(VariantContext vc) {
|
||||
Object af = vc.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY);
|
||||
if ( af == null ) {
|
||||
//throw new UserException("Variant context "+vc.getName()+" does not have allele frequency entry which is required for this walker");
|
||||
// still none after being re-computed; this is 0.00
|
||||
return 0.00;
|
||||
} else if ( List.class.isAssignableFrom(af.getClass())) {
|
||||
return ( (List<Double>) af ).get(0);
|
||||
} else if ( String.class.isAssignableFrom(af.getClass())) {
|
||||
// two possibilities
|
||||
String s = (String) af;
|
||||
try {
|
||||
if ( s.startsWith("[") ) {
|
||||
return Double.parseDouble(s.replace("\\[","").replace("\\]",""));
|
||||
} else {
|
||||
return Double.parseDouble(s);
|
||||
}
|
||||
} catch (NumberFormatException e) {
|
||||
throw new UserException("Allele frequency field may be improperly formatted, found AF="+s,e);
|
||||
}
|
||||
} else if ( Double.class.isAssignableFrom(vc.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY).getClass())) {
|
||||
return (Double) af;
|
||||
} else {
|
||||
throw new UserException(String.format("Class of Allele Frequency does not appear to be formated, had AF=%s, of class %s",af.toString(),af.getClass()));
|
||||
}
|
||||
}
|
||||
|
||||
private static int getAC(VariantContext vc) {
|
||||
Object ac = vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY);
|
||||
if ( ac == null ) {
|
||||
// still none after being re computed; this is 0
|
||||
return 0;
|
||||
} else if ( List.class.isAssignableFrom(ac.getClass())) {
|
||||
return ( (List<Integer>) ac ).get(0);
|
||||
} else if ( String.class.isAssignableFrom(ac.getClass())) {
|
||||
// two possibilities
|
||||
String s = (String) ac;
|
||||
try {
|
||||
if ( s.startsWith("[") ) {
|
||||
return Integer.parseInt(s.replace("\\[","").replace("\\]",""));
|
||||
} else {
|
||||
return Integer.parseInt(s);
|
||||
}
|
||||
} catch (NumberFormatException e) {
|
||||
throw new UserException(String.format("Allele count field may be improperly formatted, found AC=%s for record %s:%d",ac,vc.getChr(),vc.getStart()),e);
|
||||
}
|
||||
} else if ( Integer.class.isAssignableFrom(ac.getClass())) {
|
||||
return (Integer) ac;
|
||||
} else {
|
||||
throw new UserException(String.format("Class of Allele Frequency does not appear to be formated, had AF=%s, of class %s",ac.toString(),ac.getClass()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
class AFTable implements TableType {
|
||||
|
||||
protected int[][] afCounts = new int[101][101];
|
||||
|
||||
public Object[] getRowKeys() {
|
||||
String[] afKeys = new String[101];
|
||||
for ( int f = 0; f < 101; f ++ ) {
|
||||
afKeys[f] = String.format("%.2f",(f+0.0)/100.0);
|
||||
}
|
||||
|
||||
return afKeys;
|
||||
}
|
||||
|
||||
public Object[] getColumnKeys() {
|
||||
return getRowKeys(); // nice thing about symmetric tables
|
||||
}
|
||||
|
||||
public Object getCell(int i, int j) {
|
||||
return afCounts[i][j];
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return "Allele Frequency Concordance";
|
||||
}
|
||||
|
||||
public void update(double eval, double comp) {
|
||||
afCounts[af2index(eval)][af2index(comp)]++;
|
||||
}
|
||||
|
||||
private int af2index(double d) {
|
||||
return (int) Math.round(100*d);
|
||||
}
|
||||
}
|
||||
|
||||
class ACTable implements TableType {
|
||||
protected int[][] acCounts;
|
||||
protected int maxAC;
|
||||
|
||||
public ACTable(int acMaximum) {
|
||||
maxAC = acMaximum;
|
||||
acCounts = new int[acMaximum+1][acMaximum+1];
|
||||
}
|
||||
|
||||
public Object[] getRowKeys() {
|
||||
String[] acKeys = new String[maxAC+1];
|
||||
for ( int i = 0 ; i <= maxAC ; i ++ ) {
|
||||
acKeys[i] = String.format("%d",i);
|
||||
}
|
||||
|
||||
return acKeys;
|
||||
}
|
||||
|
||||
public Object[] getColumnKeys() {
|
||||
return getRowKeys();
|
||||
}
|
||||
|
||||
public Object getCell(int i, int j) {
|
||||
return acCounts[i][j];
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return "Allele Counts Concordance";
|
||||
}
|
||||
|
||||
public void update(int eval, int comp) {
|
||||
eval = eval > maxAC ? maxAC : eval;
|
||||
comp = comp > maxAC ? maxAC : comp;
|
||||
|
||||
acCounts[eval][comp]++;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -1,219 +0,0 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.walkers.varianteval;
|
||||
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantEvaluator;
|
||||
import org.broadinstitute.sting.gatk.walkers.varianteval.tags.Analysis;
|
||||
import org.broadinstitute.sting.gatk.walkers.varianteval.tags.DataPoint;
|
||||
import org.broadinstitute.sting.utils.report.utils.TableType;
|
||||
import org.broadinstitute.sting.utils.analysis.AminoAcid;
|
||||
import org.broadinstitute.sting.utils.analysis.AminoAcidTable;
|
||||
import org.broadinstitute.sting.utils.analysis.AminoAcidUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @author chartl
|
||||
* @since June 28, 2010
|
||||
*/
|
||||
|
||||
@Analysis(name = "Amino Acid Transition", description = "Calculates the Transition Matrix for coding variants; entries are Total, Num. Ti, Num. Tv, Ratio")
|
||||
public class AminoAcidTransition extends VariantEvaluator {
|
||||
|
||||
////////////////////////////////////////////////////////////
|
||||
//// INTERNAL DATA POINT CLASSES
|
||||
////////////////////////////////////////////////////////////
|
||||
|
||||
// a mapping from amino acid transition score histogram bin to Ti/Tv ratio
|
||||
@DataPoint(description = "TiTv counts by amino acid change")
|
||||
AminoAcidTiTvTable acidTable = null;
|
||||
|
||||
class TiTvCount {
|
||||
public int ti;
|
||||
public int tv;
|
||||
|
||||
public TiTvCount() {
|
||||
ti = 0;
|
||||
tv = 0;
|
||||
}
|
||||
|
||||
public int getTotal() {
|
||||
return ti + tv;
|
||||
}
|
||||
|
||||
public double getRatio() {
|
||||
return ( (double) ti )/(1.0+tv);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return String.format("%d:%d:%d:%.2f",getTotal(),ti,tv,getRatio());
|
||||
}
|
||||
}
|
||||
|
||||
class AminoAcidTiTvTable implements TableType {
|
||||
|
||||
private TiTvCount[][] countsByAAChange;
|
||||
|
||||
public AminoAcidTiTvTable() {
|
||||
countsByAAChange = new TiTvCount[AminoAcid.values().length][AminoAcid.values().length];
|
||||
for ( int i = 0; i < AminoAcid.values().length; i ++ ) {
|
||||
for ( int j = 0; j < AminoAcid.values().length; j++ ) {
|
||||
countsByAAChange[i][j] = new TiTvCount();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public Object[] getRowKeys() {
|
||||
return AminoAcidUtils.getAminoAcidCodes();
|
||||
|
||||
}
|
||||
|
||||
public Object[] getColumnKeys() {
|
||||
return AminoAcidUtils.getAminoAcidCodes();
|
||||
}
|
||||
|
||||
public TiTvCount getCell(int x, int y) {
|
||||
return countsByAAChange[x][y];
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return "AminoAcidTransitionTable";
|
||||
}
|
||||
|
||||
public void update(AminoAcid reference, AminoAcid alternate, boolean isTransition) {
|
||||
TiTvCount counter = countsByAAChange[reference.ordinal()][alternate.ordinal()];
|
||||
if ( isTransition ) {
|
||||
counter.ti++;
|
||||
} else {
|
||||
counter.tv++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////
|
||||
//// CORE VARIANT EVALUATOR DATA AND METHODS
|
||||
////////////////////////////////////////////////////////////
|
||||
|
||||
private String infoKey;
|
||||
private String infoValueSplit;
|
||||
private boolean useCodons;
|
||||
private boolean enabled;
|
||||
private AminoAcidTable lookup;
|
||||
|
||||
public AminoAcidTransition(VariantEvalWalker parent) {
|
||||
//super(parent);
|
||||
//enabled = parent.aminoAcidTransitionKey != null;
|
||||
enabled = true;
|
||||
if ( enabled ) {
|
||||
getParsingInformation(parent);
|
||||
lookup = new AminoAcidTable();
|
||||
acidTable = new AminoAcidTiTvTable();
|
||||
}
|
||||
}
|
||||
|
||||
private void getParsingInformation(VariantEvalWalker parent) {
|
||||
if ( enabled() ) {
|
||||
// infoKey = parent.aminoAcidTransitionKey;
|
||||
// infoValueSplit = parent.aminoAcidTransitionSplit;
|
||||
// useCodons = parent.aatUseCodons;
|
||||
|
||||
infoKey = null;
|
||||
infoValueSplit = null;
|
||||
useCodons = false;
|
||||
|
||||
if ( infoKey == null ) {
|
||||
throw new UserException.CommandLineException("No info-field key provided for amino acid tabulation. Please provide the appropriate key with -aatk.");
|
||||
}
|
||||
|
||||
if ( infoValueSplit == null ) {
|
||||
throw new UserException.CommandLineException("No split string provided for amino acid tabulation. Please provide the split string with -aats");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return "AminoAcidTransitionTable";
|
||||
}
|
||||
|
||||
public int getComparisonOrder() {
|
||||
return 1; // we only need to see each eval track
|
||||
}
|
||||
|
||||
public boolean enabled() {
|
||||
return enabled;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return getName();
|
||||
}
|
||||
|
||||
public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
String interesting = null;
|
||||
//if ( eval != null && eval.hasAttribute(infoKey) ) {
|
||||
if ( enabled && eval != null && eval.hasAttribute(infoKey) ) {
|
||||
String[] parsedNames = ( (String) eval.getAttribute(infoKey)).split(infoValueSplit);
|
||||
String first = "none";
|
||||
String second = "none";
|
||||
try {
|
||||
first = parsedNames [0];
|
||||
second = parsedNames [1];
|
||||
} catch (ArrayIndexOutOfBoundsException e) {
|
||||
//getLogger().warn("Error parsing variant context with value "+eval.getAttribute(infoKey));
|
||||
}
|
||||
AminoAcid reference;
|
||||
AminoAcid alternate;
|
||||
if ( useCodons ) {
|
||||
reference = lookup.getEukaryoticAA(first);
|
||||
alternate = lookup.getEukaryoticAA(second);
|
||||
} else {
|
||||
reference = lookup.getAminoAcidByCode(first);
|
||||
alternate = lookup.getAminoAcidByCode(second);
|
||||
}
|
||||
|
||||
//veWalker.getLogger().info(String.format("%s\t%s\t%s\t%s",first,second,reference,alternate));
|
||||
|
||||
if ( reference == null ) {
|
||||
interesting = "Unknown Reference Codon";
|
||||
} else if ( alternate == null ) {
|
||||
interesting = "Unknown Alternate Codon";
|
||||
} else {
|
||||
acidTable.update(reference,alternate, VariantContextUtils.isTransition(eval));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return interesting; // This module doesn't capture any interesting sites, so return null
|
||||
}
|
||||
|
||||
//public void finalizeEvaluation() {
|
||||
//
|
||||
//}
|
||||
}
|
||||
|
|
@ -1,518 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.oneoffprojects.walkers;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.*;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.*;
|
||||
import java.io.PrintStream;
|
||||
import java.io.FileNotFoundException;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: chartl
|
||||
* Date: Oct 12, 2009
|
||||
* Time: 2:43:06 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
@By(DataSource.REFERENCE)
|
||||
@Reference(window=@Window(start=-3,stop=3))
|
||||
public class BaseTransitionTableCalculatorJavaWalker extends LocusWalker<Set<BaseTransitionTable>,Set<BaseTransitionTable>> implements TreeReducible<Set<BaseTransitionTable>> {
|
||||
@Output
|
||||
PrintStream out;
|
||||
|
||||
@Argument(fullName="usePreviousBases", doc="Use previous bases of the reference as part of the calculation, uses the specified number, defaults to 0", required=false)
|
||||
int nPreviousBases = 0;
|
||||
@Argument(fullName="useSecondaryBase",doc="Use the secondary base of a read as part of the calculation", required=false)
|
||||
boolean useSecondaryBase = false;
|
||||
@Argument(fullName="confidentRefThreshold",doc="Set the lod score that defines confidence in ref, defaults to 4", required=false)
|
||||
int confidentRefThreshold = 5;
|
||||
@Argument(fullName="maxNumMismatches",doc="Set the maximum number of mismatches at a locus before choosing not to use it in calculation. Defaults to 1.", required=false)
|
||||
int maxNumMismatches = 1;
|
||||
@Argument(fullName="minMappingQuality", doc ="Set the alignment quality below which to ignore reads; defaults to 30", required = false)
|
||||
int minMappingQuality = 30;
|
||||
@Argument(fullName="minQualityScore", doc = "Set the base quality score below which to ignore bases in the pileup, defaults to 20", required = false)
|
||||
int minQualityScore = 20;
|
||||
@Argument(fullName="usePileupMismatches", doc = "Use the number of mismatches in the pileup as a condition for the table", required=false)
|
||||
boolean usePileupMismatches = false;
|
||||
@Argument(fullName="usePreviousReadBases", doc="Use previous bases of the read as part of the calculation. Will ignore reads if there aren't this many previous bases. Uses the specified number. Defaults to 0", required=false)
|
||||
int nPreviousReadBases = 0;
|
||||
@Argument(fullName="useReadGroup", doc="Use the group number of the read as a condition of the table.", required = false)
|
||||
boolean useReadGroup = false;
|
||||
@Argument(fullName="outputFile", shortName="of", doc="Output to this file rather than standard out. Must be used with -nt.", required = false)
|
||||
String outFilePath = null;
|
||||
@Argument(fullName="forcePreviousReadBasesToMatchRef", doc="Forces previous read bases to match the reference", required = false)
|
||||
boolean readBasesMustMatchRef = false;
|
||||
|
||||
private UnifiedGenotyperEngine ug;
|
||||
// private ReferenceContextWindow refWindow;
|
||||
// private Set<BaseTransitionTable> conditionalTables;
|
||||
private List<Boolean> usePreviousBases;
|
||||
private List<GenomeLoc> previousBaseLoci;
|
||||
|
||||
public void initialize() {
|
||||
if ( nPreviousBases > 3 || ( nPreviousReadBases > 3 && readBasesMustMatchRef ) ) {
|
||||
throw new UserException.CommandLineException("You have opted to use a number of previous bases in excess of 3. In order to do this you must change the reference window size in the walker itself.");
|
||||
}
|
||||
UnifiedArgumentCollection uac = new UnifiedArgumentCollection();
|
||||
uac.baseModel = BaseMismatchModel.THREE_STATE;
|
||||
uac.ALL_BASES_MODE = true;
|
||||
ug = new UnifiedGenotyperEngine(getToolkit(), uac);
|
||||
// refWindow = new ReferenceContextWindow(nPreviousBases);
|
||||
usePreviousBases = new ArrayList<Boolean>();
|
||||
previousBaseLoci = new ArrayList<GenomeLoc>();
|
||||
|
||||
}
|
||||
|
||||
public Set<BaseTransitionTable> reduceInit() {
|
||||
return new TreeSet<BaseTransitionTable>();
|
||||
}
|
||||
|
||||
public Set<BaseTransitionTable> map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
||||
ReadBackedPileup pileup = context.getBasePileup();
|
||||
Set<BaseTransitionTable> newCounts = null;
|
||||
//System.out.println(pileup.getBases());
|
||||
if ( baseIsUsable(tracker, ref, pileup, context) ) {
|
||||
//System.out.println("Pileup will be used");
|
||||
if ( previousLociCanBeUsed(usePreviousBases,previousBaseLoci,context.getLocation()) ) {
|
||||
for ( int r = 0; r < pileup.getReads().size(); r ++ ) {
|
||||
if ( useRead ( pileup.getReads().get(r), pileup.getOffsets().get(r), ref ) ) {
|
||||
newCounts = updateTables( newCounts, pileup.getReads().get(r), pileup.getOffsets().get(r), ref, pileup );
|
||||
}
|
||||
}
|
||||
} else {
|
||||
updatePreviousBases(usePreviousBases,true,previousBaseLoci,context.getLocation() );
|
||||
}
|
||||
} else {
|
||||
updatePreviousBases( usePreviousBases,false,previousBaseLoci,context.getLocation() );
|
||||
}
|
||||
|
||||
return newCounts;
|
||||
}
|
||||
|
||||
public Set<BaseTransitionTable> reduce ( Set<BaseTransitionTable> map, Set<BaseTransitionTable> reduce ) {
|
||||
if ( map != null && ! map.isEmpty() ) {
|
||||
for ( BaseTransitionTable t : map ) {
|
||||
boolean add = true;
|
||||
for ( BaseTransitionTable r : reduce ) {
|
||||
if ( r.conditionsMatch(t) ) {
|
||||
r.incorporateTable(t);
|
||||
add = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if ( add ) {
|
||||
reduce.add(t);
|
||||
}
|
||||
}
|
||||
}
|
||||
// System.out.println("Reduce: size of TransitionTable set is " + reduce.size() + " -- size of Map: " + (map != null ? map.size() : "null"));
|
||||
return reduce;
|
||||
}
|
||||
|
||||
public Set<BaseTransitionTable> treeReduce( Set<BaseTransitionTable> reduce1, Set<BaseTransitionTable> reduce2 ) {
|
||||
// check to see if this is a truly tree-reducable calculation
|
||||
if ( nPreviousBases >= 1 ) {
|
||||
String errMsg = "Parallelization cannot be used with UsePreviousBases due to the fact that internal walker data specifies whether a previous reference base is usable or not.";
|
||||
String errMsg2 = " This can cause cause concurrency issues and unpredictable behavior when used with parallelization. Either do not specify -nt, or try a the conjunction of ";
|
||||
String errMsg3 = "--usePreviousReadBases and --forcePreviousReadBasesToMatchRef.";
|
||||
throw new UserException.CommandLineException(errMsg+errMsg2+errMsg3);
|
||||
}
|
||||
return reduce(reduce1,reduce2);
|
||||
}
|
||||
|
||||
public void onTraversalDone( Set<BaseTransitionTable> conditionalTables ) {
|
||||
PrintStream output;
|
||||
if ( outFilePath == null ) {
|
||||
output = out;
|
||||
} else {
|
||||
try {
|
||||
output = new PrintStream(outFilePath);
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new UserException.CouldNotCreateOutputFile(new File(outFilePath), e);
|
||||
}
|
||||
}
|
||||
output.print(createHeaderFromConditions());
|
||||
for ( BaseTransitionTable t : conditionalTables )
|
||||
t.print(output);
|
||||
}
|
||||
|
||||
public void updatePreviousBases(List<Boolean> usage, boolean canUse, List<GenomeLoc> loci, GenomeLoc locus) {
|
||||
// early return
|
||||
if ( nPreviousBases < 1 ) {
|
||||
return;
|
||||
}
|
||||
|
||||
if ( usage.size() <= nPreviousBases ) {
|
||||
usage.add(canUse);
|
||||
loci.add(locus);
|
||||
} else {
|
||||
usage.remove(0);
|
||||
usage.add(canUse);
|
||||
loci.remove(0);
|
||||
loci.add(locus);
|
||||
}
|
||||
}
|
||||
|
||||
public boolean previousLociCanBeUsed( List<Boolean> canUse, List<GenomeLoc> loci, GenomeLoc locus ) {
|
||||
if ( nPreviousBases < 1 ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
boolean use = true;
|
||||
for ( boolean b : canUse ) {
|
||||
use = use && b;
|
||||
}
|
||||
|
||||
if ( use ) {
|
||||
use = use && ( loci.get(0).distance(locus) == 1 ); // truly is PREVIOUS base
|
||||
}
|
||||
|
||||
return use;
|
||||
}
|
||||
|
||||
public Set<BaseTransitionTable> updateTables ( Set<BaseTransitionTable> tables, SAMRecord read, int offset, ReferenceContext ref, ReadBackedPileup pileup ) {
|
||||
List<Comparable> readConditions = buildConditions(read,offset,ref, pileup);
|
||||
// System.out.println("Updating table with pileup: "+pileup.getBases()+ ( read.getReadNegativeStrandFlag() ? "-" : "+" ) + " Quality: "+read.getBaseQualities()[offset] + " MapQ: "+read.getMappingQuality());
|
||||
|
||||
if ( tables == null ) {
|
||||
tables = new TreeSet<BaseTransitionTable>();
|
||||
}
|
||||
|
||||
boolean createNewTable = true;
|
||||
|
||||
for ( BaseTransitionTable t : tables ) {
|
||||
if ( t.conditionsMatch(readConditions) ) {
|
||||
updateTable(t,read,offset,ref);
|
||||
createNewTable = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if ( createNewTable ) {
|
||||
BaseTransitionTable t = new BaseTransitionTable(readConditions);
|
||||
updateTable(t,read,offset,ref);
|
||||
tables.add(t);
|
||||
}
|
||||
|
||||
return tables;
|
||||
}
|
||||
|
||||
public void updateTable(BaseTransitionTable t, SAMRecord r, int o, ReferenceContext ref) {
|
||||
// System.out.println("Update Table");
|
||||
if ( r.getReadNegativeStrandFlag() ) {
|
||||
t.update((byte)BaseUtils.simpleComplement((char) r.getReadBases()[o]), (byte)BaseUtils.simpleComplement(ref.getBaseAsChar()));
|
||||
} else {
|
||||
t.update(r.getReadBases()[o], ref.getBase());
|
||||
}
|
||||
}
|
||||
|
||||
public boolean useRead( SAMRecord read, int offset, ReferenceContext ref ) {
|
||||
|
||||
if ( Character.toUpperCase(read.getReadBases()[offset]) == Character.toUpperCase(ref.getBase()) ) {
|
||||
return false;
|
||||
} else if ( read.getMappingQuality() <= minMappingQuality ) {
|
||||
return false;
|
||||
} else if ( ! BaseUtils.isRegularBase( (char) read.getReadBases()[offset]) ) {
|
||||
return false;
|
||||
} else if ( read.getBaseQualities()[offset] <= minQualityScore ) {
|
||||
return false;
|
||||
} else if ( useSecondaryBase && read.getAttribute("SQ") == null ) {
|
||||
return false;
|
||||
} else if ( nPreviousBases >= 1 && previousReadBasesMismatchRef(read, offset, ref) ) {
|
||||
return false;
|
||||
} else if ( nPreviousReadBases >= 1 && readLacksPreviousBases(read,offset,nPreviousReadBases) ) {
|
||||
return false;
|
||||
} else if ( nPreviousReadBases >= 1 && readBasesMustMatchRef && previousReadBasesMismatchRef(read, offset, ref) ) {
|
||||
return false;
|
||||
} else {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
public boolean previousReadBasesMismatchRef( SAMRecord read, int offset, ReferenceContext ref ) {
|
||||
int c = read.getReadNegativeStrandFlag() ? 1 : -1;
|
||||
if ( offset + nPreviousBases*c < 0 ) {
|
||||
return true;
|
||||
} else if ( offset + nPreviousBases*c > read.getReadLength() ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
for ( int prevBase = 1; prevBase <= nPreviousBases; prevBase ++ ) {
|
||||
if ( Character.toUpperCase(read.getReadBases()[offset + prevBase*c]) != Character.toUpperCase(ref.getBases()[nPreviousBases+1+prevBase*c]) || ! BaseUtils.isRegularBase(ref.getBases()[nPreviousBases+1+prevBase*c])) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
public boolean readLacksPreviousBases( SAMRecord read, int offset, int prevBases ) {
|
||||
if ( ! read.getReadNegativeStrandFlag() ) {
|
||||
return offset - prevBases < 0;
|
||||
} else {
|
||||
return offset + prevBases + 1 >= read.getReadLength();
|
||||
}
|
||||
}
|
||||
|
||||
public List<Comparable> buildConditions( SAMRecord read, int offset, ReferenceContext ref, ReadBackedPileup pileup ) {
|
||||
ArrayList<Comparable> conditions = new ArrayList<Comparable>();
|
||||
|
||||
if ( nPreviousBases > 0 ) {
|
||||
conditions.add(buildRefString(ref,nPreviousBases, ! read.getReadNegativeStrandFlag()));
|
||||
|
||||
}
|
||||
|
||||
if ( useSecondaryBase ) {
|
||||
conditions.add(getSecondaryBase(read,offset));
|
||||
}
|
||||
|
||||
if ( nPreviousReadBases > 0 ) {
|
||||
conditions.add(buildReadString(read, offset, nPreviousReadBases));
|
||||
}
|
||||
|
||||
if ( usePileupMismatches ) {
|
||||
conditions.add(countMismatches(ref.getBase(), pileup));
|
||||
}
|
||||
|
||||
if ( useReadGroup ) {
|
||||
conditions.add(read.getReadGroup().getReadGroupId());
|
||||
}
|
||||
|
||||
return conditions;
|
||||
}
|
||||
|
||||
public String buildRefString(ReferenceContext ref, int bases, boolean forwardRead) {
|
||||
if ( forwardRead ) {
|
||||
return ( new String(ref.getBases()) ).substring(0,nPreviousBases-1);
|
||||
} else {
|
||||
return BaseUtils.simpleReverseComplement( ( new String(ref.getBases()) ).substring(nPreviousBases+1) );
|
||||
}
|
||||
}
|
||||
|
||||
public String buildReadString( SAMRecord read, int offset, int nPreviousReadBases ) {
|
||||
if ( ! read.getReadNegativeStrandFlag() ) {
|
||||
return read.getReadString().substring(offset-nPreviousReadBases,offset);
|
||||
} else {
|
||||
return BaseUtils.simpleReverseComplement( read.getReadString().substring(offset+1,offset+nPreviousReadBases+1) );
|
||||
}
|
||||
}
|
||||
|
||||
public String createHeaderFromConditions() {
|
||||
String header = "Observed_base\tTrue_base";
|
||||
|
||||
if ( nPreviousBases > 0) {
|
||||
header = header+"\tPrevious_"+nPreviousBases+"_bases";
|
||||
}
|
||||
|
||||
if ( useSecondaryBase ) {
|
||||
header = header + "\tSecondary_base";
|
||||
}
|
||||
|
||||
if ( nPreviousReadBases > 0 ) {
|
||||
header = header + "\tPrevious_"+nPreviousReadBases+"_read_bases";
|
||||
}
|
||||
|
||||
if ( usePileupMismatches ) {
|
||||
header = header + "\tNumber_of_pileup_mismatches";
|
||||
}
|
||||
|
||||
if ( useReadGroup ) {
|
||||
header = header + "\tRead_group";
|
||||
}
|
||||
|
||||
return String.format("%s\t%s%n",header,"Counts");
|
||||
}
|
||||
|
||||
public int countMismatches(byte ref, ReadBackedPileup p) {
|
||||
int refM = p.getBaseCounts()[BaseUtils.simpleBaseToBaseIndex(ref)];
|
||||
return p.size()-refM;
|
||||
}
|
||||
|
||||
public char getSecondaryBase ( SAMRecord read, int offset ) {
|
||||
return BaseUtils.baseIndexToSimpleBaseAsChar(QualityUtils.compressedQualityToBaseIndex( ( (byte[]) read.getAttribute("SQ") )[offset] ) );
|
||||
}
|
||||
|
||||
public boolean baseIsUsable ( RefMetaDataTracker tracker, ReferenceContext ref, ReadBackedPileup pileup, AlignmentContext context ) {
|
||||
return pileupContainsNoNs(pileup) && baseIsConfidentRef(tracker,ref,context) && pileupBelowMismatchThreshold(ref,pileup);
|
||||
}
|
||||
|
||||
public boolean pileupBelowMismatchThreshold( ReferenceContext ref, ReadBackedPileup pileup ) {
|
||||
return countMismatches(ref.getBase(), pileup) <= maxNumMismatches;
|
||||
}
|
||||
|
||||
public boolean pileupContainsNoNs(ReadBackedPileup pileup) {
|
||||
for ( byte c : pileup.getBases() ) {
|
||||
if ( c == 'N' ) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
public boolean baseIsConfidentRef( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
||||
if ( !BaseUtils.isRegularBase(ref.getBase()) )
|
||||
return false;
|
||||
VariantCallContext calls = ug.calculateLikelihoodsAndGenotypes(tracker,ref,context);
|
||||
if ( calls == null || calls.vc == null)
|
||||
return false;
|
||||
return ( calls.vc.getNSamples() > 0 && calls.vc.getGenotype(0).isHomRef() && calls.vc.getGenotype(0).getNegLog10PError() > confidentRefThreshold );
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
class BaseTransitionTable implements Comparable {
|
||||
|
||||
/*
|
||||
* no direct manipulation of these objects ever
|
||||
*/
|
||||
private int[][] table;
|
||||
private List<Comparable> conditions;
|
||||
|
||||
public BaseTransitionTable(List<Comparable> conditions) {
|
||||
table = new int[BaseUtils.BASES.length][BaseUtils.BASES.length];
|
||||
for ( int i = 0; i < BaseUtils.BASES.length; i ++ ) {
|
||||
for ( int j = 0; j < BaseUtils.BASES.length; j ++ ) {
|
||||
table[i][j]=0;
|
||||
}
|
||||
}
|
||||
|
||||
this.conditions = conditions;
|
||||
}
|
||||
|
||||
public boolean conditionsMatch(Object obj) {
|
||||
if ( obj == null ) {
|
||||
return false;
|
||||
} else if ( obj instanceof BaseTransitionTable ) {
|
||||
return ((BaseTransitionTable) obj).conditionsMatch(conditions);
|
||||
} else if ( ! (obj instanceof List) ) {
|
||||
|
||||
return false;
|
||||
} else if ( this.numConditions() != ((List)obj).size() ){
|
||||
return false;
|
||||
} else {
|
||||
boolean eq = true;
|
||||
ListIterator thisIter = this.getConditionIterator();
|
||||
ListIterator thatIter = ((List)obj).listIterator();
|
||||
|
||||
while ( thisIter.hasNext() ) {
|
||||
eq = eq && thisIter.next().equals(thatIter.next());
|
||||
}
|
||||
|
||||
return eq;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public int compareTo(Object obj) {
|
||||
if ( ! ( obj instanceof BaseTransitionTable ) ) {
|
||||
return -1;
|
||||
} else {
|
||||
BaseTransitionTable t = (BaseTransitionTable) obj;
|
||||
if ( this.conditionsMatch(t.conditions) ) {
|
||||
return 0;
|
||||
} else {
|
||||
if ( this.numConditions() == t.numConditions() ) {
|
||||
ListIterator<Comparable> thisIter = this.conditions.listIterator();
|
||||
ListIterator<Comparable> thatIter = t.conditions.listIterator();
|
||||
int g = 0;
|
||||
do {
|
||||
g = thisIter.next().compareTo(thatIter.next());
|
||||
} while ( g == 0 );
|
||||
|
||||
return g;
|
||||
|
||||
} else {
|
||||
return (this.numConditions() > t.numConditions() ) ? 1 : -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public void print( PrintStream out ) {
|
||||
StringBuilder s = new StringBuilder();
|
||||
for ( byte observedBase : BaseUtils.BASES ) {
|
||||
for ( byte refBase : BaseUtils.BASES ) {
|
||||
s.append(String.format("%s\t%s",(char)observedBase,(char)refBase));
|
||||
for ( Comparable c : conditions ) {
|
||||
s.append(String.format("\t%s",c.toString()));
|
||||
}
|
||||
s.append(String.format("\t%d%n", table[BaseUtils.simpleBaseToBaseIndex(observedBase)][BaseUtils.simpleBaseToBaseIndex(refBase)]));
|
||||
}
|
||||
}
|
||||
|
||||
out.print(s.toString());
|
||||
}
|
||||
|
||||
public void update(byte observedBase, byte refBase ) {
|
||||
//if ( observedBase == refBase ) {
|
||||
// throw new StingException("BaseTransitionTable received equal observed and reference bases, which should not happen.");
|
||||
//}
|
||||
// System.out.println("Table updating: Observed Base: "+observedBase+" Ref base: "+refBase);
|
||||
table[BaseUtils.simpleBaseToBaseIndex(observedBase)][BaseUtils.simpleBaseToBaseIndex(refBase)]++;
|
||||
}
|
||||
|
||||
public int numConditions() {
|
||||
return conditions.size();
|
||||
}
|
||||
|
||||
private Comparable getCondition(int offset) {
|
||||
return conditions.get(offset);
|
||||
}
|
||||
|
||||
private ListIterator getConditionIterator() {
|
||||
return conditions.listIterator();
|
||||
}
|
||||
|
||||
public void incorporateTable(BaseTransitionTable t) {
|
||||
for ( int i = 0; i < BaseUtils.BASES.length; i ++ ) {
|
||||
for ( int j = 0; j < BaseUtils.BASES.length; j ++ ) {
|
||||
table[i][j] += t.observationsOf(i,j);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public int observationsOf( int observedBaseIndex, int referenceBaseIndex ) {
|
||||
return table[observedBaseIndex][referenceBaseIndex];
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -1,345 +0,0 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.walkers;
|
||||
|
||||
|
||||
import org.broadinstitute.sting.utils.genotype.Genotype;
|
||||
import org.broadinstitute.sting.utils.genotype.Variation;
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
* <p/>
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
public class ConcordanceTruthTable {
|
||||
public static final int TRUE_POSITIVE = 0;
|
||||
public static final int TRUE_NEGATIVE = 1;
|
||||
public static final int FALSE_POSITIVE = 2;
|
||||
public static final int FALSE_NEGATIVE = 3;
|
||||
public static final int VARIANT = 1;
|
||||
private static final String[] POOL_HEADERS = {"TP","TN","FP","FN"};
|
||||
|
||||
public static final int REF = 0;
|
||||
public static final int VAR_HET = 1;
|
||||
public static final int VAR_HOM = 2;
|
||||
public static final int UNKNOWN = 3;
|
||||
public static final int NO_CALL = 3; // synonym
|
||||
private static final String[] TRUTH_NAMES = {"IS_REF", "IS_VAR_HET", "IS_VAR_HOM", "UNKNOWN"};
|
||||
private static final String[] CALL_NAMES = {"CALLED_REF", "CALLED_VAR_HET", "CALLED_VAR_HOM", "NO_CALL"};
|
||||
|
||||
private String name = null;
|
||||
private boolean singleSampleMode;
|
||||
|
||||
private int[][] table;
|
||||
private int[] truth_totals;
|
||||
private int[] calls_totals;
|
||||
|
||||
|
||||
public ConcordanceTruthTable(String name) {
|
||||
// there's a specific sample associated with this truth table
|
||||
this.name = name;
|
||||
singleSampleMode = true;
|
||||
|
||||
table = new int[4][4];
|
||||
truth_totals = new int[4];
|
||||
calls_totals = new int[4];
|
||||
for (int i = 0; i < 4; i++) {
|
||||
truth_totals[i] = 0;
|
||||
calls_totals[i] = 0;
|
||||
for (int j = 0; j < 4; j++)
|
||||
table[i][j] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
public ConcordanceTruthTable(int nSamples) {
|
||||
// there's no specific sample associated with this truth table
|
||||
singleSampleMode = false;
|
||||
name = "pooled_concordance";
|
||||
truth_totals = new int[4];
|
||||
calls_totals = new int[4];
|
||||
for (int i = 0; i < 4; i++) {
|
||||
truth_totals[i] = 0;
|
||||
calls_totals[i] = 0;
|
||||
}
|
||||
|
||||
initializeFrequencyTable(nSamples);
|
||||
}
|
||||
|
||||
private void initializeFrequencyTable( int numChips ) {
|
||||
// System.out.println("Frequency Table for Pooled Concordance initialized with number of chips = "+numChips);
|
||||
table = new int[numChips*2][4];
|
||||
for (int i = 0; i < 4; i++) {
|
||||
for ( int freq = 0; freq < 2*numChips; freq ++ ) {
|
||||
table[freq][i] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// System.out.println("Table Size: "+table.length+" by "+table[1].length);
|
||||
}
|
||||
|
||||
public String addEntry(List<Pair<Genotype, Genotype>> chipEvals, Variation eval, char ref) {
|
||||
String violation = null;
|
||||
|
||||
// if the table represents a single sample, then we can calculate genotype stats
|
||||
if ( singleSampleMode ) {
|
||||
for ( Pair<Genotype, Genotype> chipEval : chipEvals ) {
|
||||
|
||||
Genotype chipG = chipEval.first;
|
||||
Genotype evalG = chipEval.second;
|
||||
|
||||
if (chipG == null && evalG == null)
|
||||
continue;
|
||||
|
||||
int truthType = getGenotype(chipG, ref);
|
||||
int callType = getGenotype(evalG, ref);
|
||||
|
||||
//System.out.printf("TEST: %d/%d %s vs. %s%n", truthIndex, callIndex, chip, eval);
|
||||
if ( truthType == VARIANT && callType != VARIANT ) {
|
||||
violation = String.format("False negative: ref=%c chip=%s call=%s", ref, chipG, evalG);
|
||||
} else if ( truthType == REF && callType == VARIANT ) {
|
||||
violation = String.format("False positive: chip=%s call=%s", chipG, evalG);
|
||||
}
|
||||
|
||||
addGenotypeEntry(truthType, callType);
|
||||
}
|
||||
} else { // if we cannot associate tables with individuals, then we are working in a pooled context
|
||||
// first we need to expand our tables to include frequency information
|
||||
Pair<Integer, Pair<Integer,Integer> > poolVariant = getPooledAlleleFrequency(chipEvals, ref);
|
||||
|
||||
int truthType = poolVariant.getFirst(); // convenience method; now to interpret
|
||||
int callType = getCallIndex(eval,ref);
|
||||
|
||||
int numTrueSupportingAlleles = poolVariant.getSecond().getFirst();
|
||||
if ( numTrueSupportingAlleles > 0 && truthType == VARIANT && callType != VARIANT ) {
|
||||
violation = String.format("False negative: %s with %d alt alleles", chipEvals.get(0).getFirst(), numTrueSupportingAlleles);
|
||||
} else if ( truthType == REF && callType == VARIANT ) {
|
||||
violation = String.format("False positive: %s at hom-ref site", eval);
|
||||
}
|
||||
|
||||
addFrequencyEntry( truthType, callType, poolVariant.getSecond().getFirst() );
|
||||
|
||||
}
|
||||
|
||||
// TODO -- implement me for pooled mode with frequency stats
|
||||
// TODO -- You'll want to use eval and the chips from chipEvals (these are the first members of the pair)
|
||||
// TODO -- You'll also need to declare (and initialize) the relevant data arrays for the data
|
||||
// TODO -- Indexes like TRUE_POSITIVE are defined above for you
|
||||
return violation;
|
||||
}
|
||||
|
||||
public Pair<Integer, Pair<Integer,Integer>> getPooledAlleleFrequency( List<Pair<Genotype,Genotype>> chips, char ref) {
|
||||
// this is actually just a note that I wanted to appear in blue. This method explicitly uses
|
||||
// the assumption that tri-allelic sites do not really exist, and that if they do the
|
||||
// site will be marked as such by an 'N' in the reference, so we will not get to this point.
|
||||
|
||||
int frequency = 0;
|
||||
int nChips = 0;
|
||||
if ( chips != null ) {
|
||||
for ( Pair<Genotype,Genotype> chip : chips ) {
|
||||
Genotype c = chip.getFirst();
|
||||
if ( c != null ) {
|
||||
nChips++;
|
||||
if ( c.isVariant(ref) ) {
|
||||
if ( c.isHet() ) {
|
||||
frequency++;
|
||||
} else { // c is hom
|
||||
frequency += 2;
|
||||
}
|
||||
}
|
||||
//System.out.printf(" Genotype %s at %c => %d%n", c, ref, frequency);
|
||||
}
|
||||
}
|
||||
//System.out.printf("*** %d%n", frequency);
|
||||
}
|
||||
|
||||
int truthType = nChips > 0 ? ( frequency > 0 ? VARIANT : REF ) : NO_CALL;
|
||||
return new Pair<Integer, Pair<Integer,Integer> >(truthType, new Pair<Integer,Integer>(frequency,nChips));
|
||||
}
|
||||
|
||||
private void addFrequencyEntry( int truthIndex, int callIndex, int numTrueSupportingAlleles ) {
|
||||
//System.out.printf(" %s %s %d%n", CALL_NAMES[truthIndex], CALL_NAMES[callIndex], numTrueSupportingAlleles);
|
||||
calls_totals[callIndex]++;
|
||||
truth_totals[truthIndex]++;
|
||||
|
||||
if ( truthIndex == REF && ( callIndex == REF || callIndex == NO_CALL ) ) {
|
||||
// true negative
|
||||
table[numTrueSupportingAlleles][TRUE_NEGATIVE]++;
|
||||
// sanity check - there should never be an entry in
|
||||
// [*][TRUE_NEGATIVE] for * > 0
|
||||
} else if ( truthIndex == REF && callIndex == VARIANT ) {
|
||||
// false positive
|
||||
table[numTrueSupportingAlleles][FALSE_POSITIVE]++;
|
||||
} else if ( truthIndex == VARIANT && (callIndex == NO_CALL || callIndex == REF) ) {
|
||||
// false negative
|
||||
table[numTrueSupportingAlleles][FALSE_NEGATIVE]++;
|
||||
} else if ( truthIndex == VARIANT && callIndex == VARIANT ) {
|
||||
// true positive
|
||||
table[numTrueSupportingAlleles][TRUE_POSITIVE]++;
|
||||
} else {
|
||||
// something else is going on; wonky site or something. Don't do anything to the table.
|
||||
}
|
||||
}
|
||||
|
||||
private static int getCallIndex(Variation eval, char ref) {
|
||||
int index;
|
||||
|
||||
if ( eval == null ) {
|
||||
index = NO_CALL;
|
||||
} else if ( ! eval.isSNP() ) {
|
||||
index = REF;
|
||||
} else {
|
||||
index = VARIANT;
|
||||
}
|
||||
|
||||
return index;
|
||||
}
|
||||
|
||||
private static int getGenotype(Genotype g, char ref) {
|
||||
int type;
|
||||
|
||||
if ( g == null )
|
||||
type = NO_CALL;
|
||||
else if ( !g.isVariant(ref) )
|
||||
type = REF;
|
||||
else if ( g.isHet() )
|
||||
type = VAR_HET;
|
||||
else
|
||||
type = VAR_HOM;
|
||||
|
||||
return type;
|
||||
}
|
||||
|
||||
private void addGenotypeEntry(int truthIndex, int callIndex) {
|
||||
table[truthIndex][callIndex]++;
|
||||
truth_totals[truthIndex]++;
|
||||
calls_totals[callIndex]++;
|
||||
}
|
||||
|
||||
public void addAllStats(List<String> s) {
|
||||
if ( singleSampleMode )
|
||||
addGenotypeStats(s);
|
||||
else
|
||||
addFrequencyStats(s);
|
||||
}
|
||||
|
||||
// private void addFrequencyStats(List<String> s) {
|
||||
//
|
||||
// // TODO -- implement me for pooled mode with frequency stats
|
||||
// s.add(String.format("name %s",name));
|
||||
// s.add(String.format("TRUTH_ALLELE_FREQUENCY\tERROR_OR_TRUTH_TYPE\tTOTAL\tAS_PRCT_OF_TOTAL_CALLS\tAS_PRCT_OF_CALLS_AT_FREQUENCY"));
|
||||
//
|
||||
// for ( int af = 0; af < table.length; af ++ ) {
|
||||
// for ( int errorIndex = 0; errorIndex < 4; errorIndex ++ ) {
|
||||
// StringBuffer sb = new StringBuffer();
|
||||
// sb.append(String.format("%f ", ((double) af)/ table.length));
|
||||
// sb.append(String.format("%s ",POOL_HEADERS[errorIndex]));
|
||||
// sb.append(String.format("%d ", table[af][errorIndex]));
|
||||
// sb.append(String.format("%s ", percentOfTotal(table,af,errorIndex)));
|
||||
// sb.append(String.format("%s ", marginalPercent(table[af],errorIndex)));
|
||||
// s.add(sb.toString());
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// }
|
||||
|
||||
private void addFrequencyStats(List<String> s) {
|
||||
s.add(String.format("name %s",name));
|
||||
s.add("TRUTH_ALLELE_COUNT\tTRUTH_ALLELE_FREQ\tTOTAL\t" + Utils.join(" ", POOL_HEADERS));
|
||||
|
||||
for ( int af = 0; af < table.length; af ++ ) {
|
||||
int sum = 0;
|
||||
String counts = "";
|
||||
for ( int errorIndex = 0; errorIndex < 4; errorIndex ++ ) {
|
||||
int count = table[af][errorIndex];
|
||||
sum += count;
|
||||
counts += String.format(" %6d", count);
|
||||
}
|
||||
s.add(String.format("%6d %.3f %6d%s", af, ((double)af)/ table.length, sum, counts));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private void addGenotypeStats(List<String> s) {
|
||||
s.add(String.format("name %s", name));
|
||||
s.add(String.format("TRUTH_STATE\tCALLED_REF\tCALLED_VAR_HET\tCALLED_VAR_HOM\tNO_CALL\t\tTOTALS\tTRUE_GENOTYPE_CONCORDANCE\tGENOTYPE_SENSITIVITY"));
|
||||
for (int i = 0; i < 4; i++) {
|
||||
StringBuffer sb = new StringBuffer();
|
||||
sb.append(String.format("%15s ", TRUTH_NAMES[i]));
|
||||
for (int j = 0; j < 4; j++)
|
||||
sb.append(String.format("%9d ", table[i][j]));
|
||||
sb.append(String.format("%9d ", truth_totals[i]));
|
||||
if (i == VAR_HET || i == VAR_HOM) {
|
||||
sb.append(String.format("\t%s\t\t", cellPercent(table[i][i], table[i][REF] + table[i][VAR_HET] + table[i][VAR_HOM])));
|
||||
sb.append(String.format("%s", cellPercent(truth_totals[i] - table[i][NO_CALL], truth_totals[i])));
|
||||
} else {
|
||||
sb.append("\tN/A\t\t\tN/A");
|
||||
}
|
||||
s.add(sb.toString());
|
||||
}
|
||||
|
||||
addCalledGenotypeConcordance(s);
|
||||
addOverallStats(s);
|
||||
|
||||
for (int i = 0; i < 4; i++) {
|
||||
for (int j = 0; j < 4; j++) {
|
||||
s.add(String.format("%s_%s_%s %d", TRUTH_NAMES[i], CALL_NAMES[j], "NO_SITES", table[i][j]));
|
||||
s.add(String.format("%s_%s_%s %s", TRUTH_NAMES[i], CALL_NAMES[j], "PERCENT_OF_TRUTH", cellPercent(table[i][j], truth_totals[i])));
|
||||
s.add(String.format("%s_%s_%s %s", TRUTH_NAMES[i], CALL_NAMES[j], "PERCENT_OF_CALLS", cellPercent(table[i][j], calls_totals[j])));
|
||||
}
|
||||
if (i == VAR_HET || i == VAR_HOM) {
|
||||
s.add(String.format("%s_%s %s", TRUTH_NAMES[i], "TRUE_GENOTYPE_CONCORDANCE", cellPercent(table[i][i], table[i][REF] + table[i][VAR_HET] + table[i][VAR_HOM])));
|
||||
s.add(String.format("%s_%s %s", TRUTH_NAMES[i], "GENOTYPE_SENSITIVITY", cellPercent(truth_totals[i] - table[i][NO_CALL], truth_totals[i])));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void addCalledGenotypeConcordance(List<String> s) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append("CALLED_GENOTYPE_CONCORDANCE\t");
|
||||
for (int i = 0; i < 4; i++) {
|
||||
int nConcordantCallsI = table[i][i];
|
||||
String value = "N/A";
|
||||
if (i != UNKNOWN)
|
||||
value = String.format("%s\t", cellPercent(nConcordantCallsI, calls_totals[i] - table[UNKNOWN][i]));
|
||||
sb.append(value);
|
||||
}
|
||||
s.add(sb.toString());
|
||||
}
|
||||
|
||||
// How many overall calls where made that aren't NO_CALLS or UNKNOWNS?
|
||||
private int getNCalled() {
|
||||
int n = 0;
|
||||
for (int i = 0; i < 4; i++)
|
||||
for (int j = 0; j < 4; j++)
|
||||
if (i != NO_CALL && j != NO_CALL) n += table[i][j];
|
||||
return n;
|
||||
}
|
||||
|
||||
private void addOverallStats(List<String> s) {
|
||||
int nConcordantRefCalls = table[REF][REF];
|
||||
int nConcordantHetCalls = table[VAR_HET][VAR_HET];
|
||||
int nConcordantVarHomCalls = table[VAR_HOM][VAR_HOM];
|
||||
int nVarCalls = table[VAR_HOM][VAR_HET] + table[VAR_HOM][VAR_HOM] + table[VAR_HET][VAR_HET] + table[VAR_HET][VAR_HOM];
|
||||
int nConcordantVarCalls = nConcordantHetCalls + nConcordantVarHomCalls;
|
||||
int nConcordantCalls = nConcordantRefCalls + nConcordantVarCalls;
|
||||
int nTrueVar = truth_totals[VAR_HET] + truth_totals[VAR_HOM];
|
||||
int nCalled = getNCalled();
|
||||
s.add(String.format("VARIANT_SENSITIVITY %s", cellPercent(nVarCalls, nTrueVar)));
|
||||
s.add(String.format("VARIANT_CONCORDANCE %s", cellPercent(nConcordantVarCalls, nVarCalls)));
|
||||
s.add(String.format("OVERALL_CONCORDANCE %s", cellPercent(nConcordantCalls, nCalled)));
|
||||
}
|
||||
|
||||
private static String cellPercent(int count, int total) {
|
||||
StringBuffer sb = new StringBuffer();
|
||||
total = Math.max(total, 0);
|
||||
sb.append(String.format("%.2f", (100.0 * count) / total));
|
||||
sb.append("%");
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
||||
|
|
@ -1,167 +0,0 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.walkers;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: asivache
|
||||
* Date: Dec 3, 2009
|
||||
* Time: 11:54:35 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
@Requires({DataSource.READS, DataSource.REFERENCE})
|
||||
|
||||
public class DSBWalker extends LocusWalker<Integer,Integer> {
|
||||
@Argument(fullName="coverage",shortName="C",doc="Regions with coverage above specified threshold will be reported",required=true)
|
||||
int COV_CUTOFF = 0;
|
||||
@Argument(fullName="minLength",shortName="ml",doc="Only regions longer than the specified value will be reported",required=false)
|
||||
int MINLENGTH_CUTOFF = 0;
|
||||
|
||||
private int MERGE_DIST = 300; // merge intervals that are closer than this distance from one another
|
||||
|
||||
private long maxcov = 0;
|
||||
private long maxz = 0;
|
||||
private long mergedmaxcov = 0;
|
||||
private long mergedmaxz = 0;
|
||||
GenomeLoc mergedInterval = null;
|
||||
GenomeLoc currentInterval = null;
|
||||
|
||||
private long nIntervals = 0;
|
||||
|
||||
private void emit(GenomeLoc l) {
|
||||
if ( mergedInterval == null ) {
|
||||
mergedInterval = l.clone();
|
||||
mergedmaxcov = maxcov;
|
||||
mergedmaxz = maxz;
|
||||
return;
|
||||
}
|
||||
|
||||
if ( mergedInterval.getContigIndex() != l.getContigIndex() ) {
|
||||
long length = mergedInterval.getStop()-mergedInterval.getStart()+1;
|
||||
if ( length >= MINLENGTH_CUTOFF ) {
|
||||
out.println(mergedInterval+"\t"+length+"\t"+mergedmaxcov+"\t"+mergedmaxz); // eject old interval
|
||||
nIntervals++;
|
||||
}
|
||||
mergedInterval = l.clone();
|
||||
mergedmaxcov = maxcov;
|
||||
mergedmaxz = maxz;
|
||||
return;
|
||||
}
|
||||
|
||||
// merged interval exists and new interval is on the same contig. Check if the new interval
|
||||
// is close enough so we got to merge and keep waiting:
|
||||
|
||||
if ( l.getStart() - mergedInterval.getStop() < MERGE_DIST ) {
|
||||
mergedInterval = GenomeLocParser.setStop(mergedInterval,l.getStop());
|
||||
if ( maxcov > mergedmaxcov) mergedmaxcov = maxcov;
|
||||
if ( maxz > mergedmaxz ) mergedmaxz = maxz;
|
||||
return;
|
||||
}
|
||||
|
||||
// nope, new interval is far enough. Print old one and keep current one.
|
||||
|
||||
long length = mergedInterval.getStop()-mergedInterval.getStart()+1;
|
||||
if ( length >= MINLENGTH_CUTOFF ) {
|
||||
out.println(mergedInterval+"\t"+length+"\t"+mergedmaxcov+"\t"+mergedmaxz); // eject old interval
|
||||
nIntervals++;
|
||||
}
|
||||
mergedInterval = l.clone();
|
||||
mergedmaxcov = maxcov;
|
||||
mergedmaxz = maxz;
|
||||
|
||||
}
|
||||
|
||||
public void onTraversalDone() {
|
||||
if ( mergedInterval != null ) {
|
||||
long length = mergedInterval.getStop()-mergedInterval.getStart()+1;
|
||||
if ( length >= MINLENGTH_CUTOFF ) {
|
||||
out.println(mergedInterval+"\t"+length+"\t"+mergedmaxcov+"\t"+mergedmaxz); // eject old interval
|
||||
nIntervals++;
|
||||
}
|
||||
}
|
||||
System.out.println(nIntervals+" intervals detected.");
|
||||
}
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
|
||||
ReadBackedPileup pileup = context.getPileup();
|
||||
List<SAMRecord> reads = pileup.getReads();
|
||||
|
||||
int nZero = pileup.getNumberOfMappingQualityZeroReads();
|
||||
|
||||
int nonZCoverage = reads.size() - nZero;
|
||||
|
||||
if ( nonZCoverage >= COV_CUTOFF ) {
|
||||
|
||||
// if we were not inside an interval, start one:
|
||||
if ( currentInterval == null ) {
|
||||
maxcov = nonZCoverage;
|
||||
maxz = nZero;
|
||||
currentInterval = context.getLocation().clone();
|
||||
// System.out.println("Setting current to "+currentInterval);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// if we were inside an interval and we just jumped onto a new contig, get rid of the old interval
|
||||
if ( currentInterval.compareContigs(context.getLocation()) != 0 ) {
|
||||
// we just moved to a new contig
|
||||
System.out.println("On contig "+context.getLocation().getContig());
|
||||
emit(currentInterval);
|
||||
maxcov = nonZCoverage;
|
||||
maxz = nZero;
|
||||
currentInterval = context.getLocation().clone();
|
||||
return 0;
|
||||
}
|
||||
|
||||
// we are on the same contig, we are within the interval, so we need to extend the current interval:
|
||||
currentInterval = GenomeLocParser.setStop(currentInterval,context.getLocation().getStop()); // still within the interval, adjust stop
|
||||
//System.out.println("Extending current to "+currentInterval +" ("+context.getLocation()+", "+context.getLocation().getStop()+")");
|
||||
if ( nonZCoverage > maxcov ) maxcov = nonZCoverage; // adjust maxcov
|
||||
if ( nZero > maxz ) maxz = nZero; // adjust maxz
|
||||
} else {
|
||||
// low coverage, if we were inside an interval, it stops now:
|
||||
if ( currentInterval != null ) {
|
||||
// System.out.println("Emitting current as "+currentInterval);
|
||||
emit(currentInterval);
|
||||
currentInterval = null;
|
||||
maxcov = 0;
|
||||
maxz = 0;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Provide an initial value for reduce computations.
|
||||
*
|
||||
* @return Initial value of reduce.
|
||||
*/
|
||||
public Integer reduceInit() {
|
||||
return 0; //To change body of implemented methods use File | Settings | File Templates.
|
||||
}
|
||||
|
||||
/**
|
||||
* Reduces a single map with the accumulator provided as the ReduceType.
|
||||
*
|
||||
* @param value result of the map.
|
||||
* @param sum accumulator for the reduce.
|
||||
* @return accumulator with result of the map taken into account.
|
||||
*/
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
return sum+value; //To change body of implemented methods use File | Settings | File Templates.
|
||||
}
|
||||
}
|
||||
|
|
@ -1,360 +0,0 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.walkers;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.CircularArray;
|
||||
import org.broadinstitute.sting.utils.PrimitivePair;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: asivache
|
||||
* Date: Dec 12, 2009
|
||||
* Time: 2:25:44 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
@Requires({DataSource.READS, DataSource.REFERENCE})
|
||||
|
||||
public class DSBWalkerV2 extends LocusWalker<Integer,Integer> {
|
||||
// @Argument(fullName="coverage",shortName="C",doc="Regions with coverage above specified threshold will be reported",required=true)
|
||||
// int COV_CUTOFF = 0;
|
||||
// @Argument(fullName="minLength",shortName="ml",doc="Only regions longer than the specified value will be reported",required=false)
|
||||
// int MINLENGTH_CUTOFF = 0;
|
||||
@Argument(fullName="windowSize",shortName="W",doc="Size of the sliding window",required=true)
|
||||
int WINDOW_SIZE = 100;
|
||||
@Argument(fullName="enrichmentCutoff",shortName="E",doc="Report windows with enrichment (signal/control) above this cutoff",required=true)
|
||||
double ENRICHMENT_CUTOFF = 5.0;
|
||||
@Argument(fullName="minSignal",shortName="ms",doc="Do not report windows with signal lower than this value "+
|
||||
"(this cutoff is secondary to enrichmentCutoff and guards against windows where control signal is 0 or too low,"+
|
||||
"so that control*enrichmentCutoff is too low to be convincing)",required=true)
|
||||
int MIN_SIGNAL = 10;
|
||||
|
||||
private CircularArray<PrimitivePair.Int> signalWindow = null;
|
||||
private CircularArray<PrimitivePair.Int> controlWindow = null;
|
||||
private CircularArray<PrimitivePair.Int> signalStrandsWindow = null;
|
||||
private CircularArray<PrimitivePair.Int> controlStrandsWindow = null;
|
||||
|
||||
private PrimitivePair.Long totalSignalCoverage = new PrimitivePair.Long();
|
||||
private PrimitivePair.Long totalControlCoverage = new PrimitivePair.Long();
|
||||
private PrimitivePair.Long totalSignalFwdStrands = new PrimitivePair.Long();
|
||||
private PrimitivePair.Long totalControlFwdStrands = new PrimitivePair.Long();
|
||||
|
||||
private Set<String> signalReadGroups; // we are going to remember which read groups are stimulated tagged and which are unstimulated untagged in order to be able
|
||||
private Set<String> controlReadGroups ; // to properly assign the reads coming from a merged stream
|
||||
|
||||
private long windowStart = -1;
|
||||
private long windowStop = -1;
|
||||
private int curContig = -1;
|
||||
private String curContigName = "";
|
||||
|
||||
// the following variables are for buffering and merging windows :
|
||||
private long regionStart = -1;
|
||||
private long lastWindowStart = -1;
|
||||
private PrimitivePair.Int maxSignalReads = new PrimitivePair.Int();
|
||||
private PrimitivePair.Int minSignalReads = new PrimitivePair.Int();
|
||||
private PrimitivePair.Int maxControlReads = new PrimitivePair.Int();
|
||||
private PrimitivePair.Int minControlReads = new PrimitivePair.Int();
|
||||
private double minEnrichmentUnique;
|
||||
private double maxEnrichmentUnique;
|
||||
private double minEnrichmentNonUnique;
|
||||
private double maxEnrichmentNonUnique;
|
||||
private double minEnrichmentTotal;
|
||||
private double maxEnrichmentTotal;
|
||||
private double minUniqueSignalStrandBalance = 0.0;
|
||||
private double maxUniqueSignalStrandBalance = 0.0;
|
||||
private double minNonUniqueSignalStrandBalance = 0.0;
|
||||
private double maxNonUniqueSignalStrandBalance = 0.0;
|
||||
private double minUniqueControlStrandBalance = 0.0;
|
||||
private double maxUniqueControlStrandBalance = 0.0;
|
||||
private double minNonUniqueControlStrandBalance = 0.0;
|
||||
private double maxNonUniqueControlStrandBalance = 0.0;
|
||||
|
||||
@Override
|
||||
public void initialize() {
|
||||
int nSams = getToolkit().getArguments().samFiles.size();
|
||||
|
||||
if ( nSams != 2 ) {
|
||||
out.println("ERROR: two input bam files (signal and backround control) must be specified");
|
||||
System.exit(1);
|
||||
}
|
||||
List<Set<String>> readGroupSets = getToolkit().getMergedReadGroupsByReaders();
|
||||
signalReadGroups = readGroupSets.get(0);
|
||||
// System.out.println(signalReadGroups.size()+" read groups in signal");
|
||||
controlReadGroups = readGroupSets.get(1);
|
||||
// System.out.println(controlReadGroups.size()+" read groups in control");
|
||||
signalWindow = new CircularArray<PrimitivePair.Int>(WINDOW_SIZE);
|
||||
controlWindow = new CircularArray<PrimitivePair.Int>(WINDOW_SIZE);
|
||||
signalStrandsWindow = new CircularArray<PrimitivePair.Int>(WINDOW_SIZE);
|
||||
controlStrandsWindow = new CircularArray<PrimitivePair.Int>(WINDOW_SIZE);
|
||||
}
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
|
||||
ReadBackedPileup pileup = context.getPileup();
|
||||
List<SAMRecord> reads = pileup.getReads();
|
||||
|
||||
// compute coverages at the current site:
|
||||
PrimitivePair.Int signalCov = new PrimitivePair.Int();
|
||||
PrimitivePair.Int controlCov = new PrimitivePair.Int();
|
||||
PrimitivePair.Int signalFwdStrands = new PrimitivePair.Int();
|
||||
PrimitivePair.Int controlFwdStrands = new PrimitivePair.Int();
|
||||
|
||||
for ( SAMRecord r : reads ) {
|
||||
if ( signalReadGroups.contains( r.getReadGroup().getReadGroupId() ) ) {
|
||||
if ( r.getMappingQuality() == 0 ) {
|
||||
signalCov.second++;
|
||||
if ( ! r.getReadNegativeStrandFlag() ) signalFwdStrands.second++;
|
||||
}
|
||||
else {
|
||||
signalCov.first++;
|
||||
if ( ! r.getReadNegativeStrandFlag() ) signalFwdStrands.first++;
|
||||
}
|
||||
} else {
|
||||
if ( controlReadGroups.contains( r.getReadGroup().getReadGroupId() ) ) {
|
||||
if ( r.getMappingQuality() == 0 ) {
|
||||
controlCov.second++;
|
||||
if ( ! r.getReadNegativeStrandFlag() ) controlFwdStrands.second++;
|
||||
}
|
||||
else {
|
||||
controlCov.first++;
|
||||
if ( ! r.getReadNegativeStrandFlag() ) controlFwdStrands.first++;
|
||||
}
|
||||
} else {
|
||||
throw new StingException("Read "+r+" belongs to unknown read group ("+r.getReadGroup()+")");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
GenomeLoc loc = context.getLocation();
|
||||
|
||||
// if ( curContig != 0 ) System.out.println(loc+" "+signalCov.first+" "+signalCov.second+" "+controlCov.first+" "+controlCov.second);
|
||||
|
||||
if ( loc.getContigIndex() != curContig || loc.getStart() >= windowStop+WINDOW_SIZE ) {
|
||||
// we jumped to the next contig, or we are on the same contig but the current position is
|
||||
// more than WINDOW_SIZE away from the current window's end (i.e. there's nothing to shift)
|
||||
checkCurrentWindow(true);
|
||||
|
||||
if ( loc.getContigIndex() != curContig ) {
|
||||
System.out.println("on contig "+loc.getContig());
|
||||
}
|
||||
curContig = loc.getContigIndex();
|
||||
curContigName = loc.getContig();
|
||||
// prevPos = loc.getStart();
|
||||
windowStart = loc.getStart();
|
||||
windowStop = windowStart + WINDOW_SIZE - 1;
|
||||
signalWindow.clear();
|
||||
controlWindow.clear();
|
||||
totalSignalCoverage.assignFrom( signalCov );
|
||||
totalControlCoverage.assignFrom( controlCov );
|
||||
totalSignalFwdStrands.assignFrom( signalFwdStrands );
|
||||
totalControlFwdStrands.assignFrom( controlFwdStrands );
|
||||
signalWindow.set(0,signalCov);
|
||||
controlWindow.set(0,controlCov);
|
||||
signalStrandsWindow.set(0,signalFwdStrands);
|
||||
controlStrandsWindow.set(0,controlFwdStrands);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// offset of the current position w.r.t. the start of the window:
|
||||
int offset = (int)(loc.getStart() - windowStart);
|
||||
|
||||
if ( offset >= WINDOW_SIZE ) {
|
||||
// if we are here, the current position is outside of the current window, but not
|
||||
// far enough so that we'd need to reinitialize the window from scratch (that was already checked above).
|
||||
// Now we need to shift.
|
||||
|
||||
// We are receiving covered positions in order, so we are guaranteed that everything prior to
|
||||
// the current position was already counted; if some elements of the windows are still nulls, it means
|
||||
// there was no coverage there
|
||||
|
||||
int shift = offset - WINDOW_SIZE + 1;
|
||||
|
||||
// scroll the window(s) base by base until the current position is inside the window. At each step
|
||||
// we will check if the window meets the requirements and should be printed out.
|
||||
for ( int i = 0 ; i < shift ; i++ ) {
|
||||
|
||||
// we are going to shift; check if the window as it is now is worth printing
|
||||
checkCurrentWindow(false);
|
||||
|
||||
// discard coverage from the first element of the window (this element is about to be shifted out of scope)
|
||||
if ( signalWindow.get(0) != null ) totalSignalCoverage.subtract(signalWindow.get(0));
|
||||
if ( signalStrandsWindow.get(0) != null ) totalSignalFwdStrands.subtract(signalStrandsWindow.get(0));
|
||||
|
||||
if ( controlWindow.get(0) != null ) totalControlCoverage.subtract(controlWindow.get(0));
|
||||
if ( controlStrandsWindow.get(0) != null ) totalControlFwdStrands.subtract(controlStrandsWindow.get(0));
|
||||
|
||||
// advnace window coordinates on the ref
|
||||
windowStart++;
|
||||
windowStop++;
|
||||
|
||||
// shift the data in the window(s):
|
||||
signalWindow.shiftData(1);
|
||||
controlWindow.shiftData(1);
|
||||
signalStrandsWindow.shiftData(1);
|
||||
controlStrandsWindow.shiftData(1);
|
||||
|
||||
offset--; // this is the new offset w.r.t. to the shifted window
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// at this point, either the current position was inside the current window, or it was outside,
|
||||
// but the window was already shifted
|
||||
totalSignalCoverage.add(signalCov);
|
||||
totalControlCoverage.add(controlCov);
|
||||
totalSignalFwdStrands.add(signalFwdStrands);
|
||||
totalControlFwdStrands.add(controlFwdStrands);
|
||||
signalWindow.set(offset,signalCov);
|
||||
controlWindow.set(offset,controlCov);
|
||||
signalStrandsWindow.set(offset,signalFwdStrands);
|
||||
controlStrandsWindow.set(offset,controlFwdStrands);
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Provide an initial value for reduce computations.
|
||||
*
|
||||
* @return Initial value of reduce.
|
||||
*/
|
||||
public Integer reduceInit() {
|
||||
return 0; //To change body of implemented methods use File | Settings | File Templates.
|
||||
}
|
||||
|
||||
/**
|
||||
* Reduces a single map with the accumulator provided as the ReduceType.
|
||||
*
|
||||
* @param value result of the map.
|
||||
* @param sum accumulator for the reduce.
|
||||
* @return accumulator with result of the map taken into account.
|
||||
*/
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
return sum+value; //To change body of implemented methods use File | Settings | File Templates.
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onTraversalDone(Integer result) {
|
||||
printRegion();
|
||||
super.onTraversalDone(result);
|
||||
}
|
||||
|
||||
/** Checks if the currently held window satisfies the conditions set up for significance, and invokes buffered printout if so.
|
||||
* If the parameter is set to true, printout of previously held region is forced, and the buffer is reinitialized with
|
||||
* the new window if it passes the cutoffs, or left empty.
|
||||
*
|
||||
*/
|
||||
private void checkCurrentWindow(boolean force) {
|
||||
if ( force ) printRegion();
|
||||
if ( signalWindow.get(0) == null && controlWindow.get(0) == null ) return; // do not emit windows that start from empty cell; we will get them later
|
||||
if ( totalControlCoverage.first * ENRICHMENT_CUTOFF / 36.0 < MIN_SIGNAL ) { // control coverage zero or too low
|
||||
if ( totalSignalCoverage.first /28.0 > MIN_SIGNAL ) emitWindow(false); // require at least MIN_SIGNAL coverage for signal
|
||||
return;
|
||||
}
|
||||
|
||||
// if we have decent coverage in control, just check for required enrichment in the signal
|
||||
if ( ((double)totalSignalCoverage.first/28.0) / (totalControlCoverage.first/36.0) > ENRICHMENT_CUTOFF ) emitWindow(false);
|
||||
}
|
||||
|
||||
/** This is actually a delayed print command: it buffers the successive windows set for printout, merges the windows that
|
||||
* are close enough and prints only when a train of close-by windows has ended and next window received is far enough
|
||||
*/
|
||||
private void emitWindow(boolean force) {
|
||||
|
||||
if ( regionStart == -1 ) {
|
||||
resetBuffer();
|
||||
return;
|
||||
}
|
||||
|
||||
if ( force || windowStart > lastWindowStart + WINDOW_SIZE ) {
|
||||
// new window is far enough from the region we were buffering: emit old region
|
||||
|
||||
printRegion();
|
||||
resetBuffer();
|
||||
return;
|
||||
}
|
||||
|
||||
// current window is too close (overlapping) with a previous one: we need to merge
|
||||
|
||||
lastWindowStart = windowStart;
|
||||
maxSignalReads.first = Math.max(maxSignalReads.first, (int)Math.round(totalSignalCoverage.first/28.0));
|
||||
maxSignalReads.second = Math.max(maxSignalReads.second,(int)Math.round(totalSignalCoverage.second/28.0));
|
||||
minSignalReads.first = Math.min(minSignalReads.first, (int)Math.round(totalSignalCoverage.first/28.0));
|
||||
minSignalReads.second = Math.min(minSignalReads.second,(int)Math.round(totalSignalCoverage.second/28.0));
|
||||
maxControlReads.first = Math.max(maxControlReads.first,(int)Math.round(totalControlCoverage.first/36.0));
|
||||
maxControlReads.second = Math.max(maxControlReads.second,(int)Math.round(totalControlCoverage.second/36.0));
|
||||
minControlReads.first = Math.min(minControlReads.first,(int)Math.round(totalControlCoverage.first/36.0));
|
||||
minControlReads.second = Math.min(minControlReads.second,(int)Math.round(totalControlCoverage.second/36.0));
|
||||
maxEnrichmentUnique = Math.max(maxEnrichmentUnique,((double)totalSignalCoverage.first/28.0)/(totalControlCoverage.first/36.0));
|
||||
minEnrichmentUnique = Math.min(minEnrichmentUnique, ((double)totalSignalCoverage.first/28.0)/(totalControlCoverage.first/36.0));
|
||||
maxEnrichmentNonUnique = Math.max(maxEnrichmentNonUnique,((double)totalSignalCoverage.second/28.0)/(totalControlCoverage.second/36.0));
|
||||
minEnrichmentNonUnique = Math.min( minEnrichmentNonUnique, ((double)totalSignalCoverage.second/28.0)/(totalControlCoverage.second/36.0) );
|
||||
maxEnrichmentTotal = Math.max( maxEnrichmentTotal, ((double)(totalSignalCoverage.first+totalSignalCoverage.second)/28.0)/
|
||||
((totalControlCoverage.first+ totalControlCoverage.second)/36.0) );
|
||||
minEnrichmentTotal = Math.min( minEnrichmentTotal, ((double)(totalSignalCoverage.first+totalSignalCoverage.second)/28.0)/
|
||||
((totalControlCoverage.first+ totalControlCoverage.second)/36.0) );
|
||||
|
||||
|
||||
maxUniqueSignalStrandBalance = Math.max(maxUniqueSignalStrandBalance,((double)totalSignalFwdStrands.first)/totalSignalCoverage.first);
|
||||
minUniqueSignalStrandBalance = Math.min(minUniqueSignalStrandBalance,((double)totalSignalFwdStrands.first)/totalSignalCoverage.first);
|
||||
maxNonUniqueSignalStrandBalance = Math.max(maxNonUniqueSignalStrandBalance,((double)totalSignalFwdStrands.second)/totalSignalCoverage.second);
|
||||
minNonUniqueSignalStrandBalance = Math.min(minNonUniqueSignalStrandBalance,((double)totalSignalFwdStrands.second)/totalSignalCoverage.second);
|
||||
maxUniqueControlStrandBalance = Math.max(maxUniqueControlStrandBalance,((double)totalControlFwdStrands.first)/totalControlCoverage.first);
|
||||
minUniqueControlStrandBalance = Math.min(minUniqueControlStrandBalance,((double)totalControlFwdStrands.first)/totalControlCoverage.first);
|
||||
maxNonUniqueControlStrandBalance = Math.max(maxNonUniqueControlStrandBalance,((double)totalControlFwdStrands.second)/totalControlCoverage.second);
|
||||
minNonUniqueControlStrandBalance = Math.min(minNonUniqueControlStrandBalance,((double)totalControlFwdStrands.second)/totalControlCoverage.second);
|
||||
|
||||
|
||||
}
|
||||
|
||||
private void resetBuffer() {
|
||||
regionStart = windowStart;
|
||||
lastWindowStart = windowStart;
|
||||
maxSignalReads.first = (int)Math.round(totalSignalCoverage.first/28.0);
|
||||
maxSignalReads.second = (int)Math.round(totalSignalCoverage.second/28.0);
|
||||
minSignalReads.assignFrom(maxSignalReads);
|
||||
maxControlReads.first = (int)Math.round(totalControlCoverage.first/36.0);
|
||||
maxControlReads.second = (int)Math.round(totalControlCoverage.second/36.0);
|
||||
minControlReads.assignFrom(maxControlReads);
|
||||
minEnrichmentUnique = maxEnrichmentUnique = ((double)totalSignalCoverage.first/28.0)/(totalControlCoverage.first/36.0);
|
||||
minEnrichmentNonUnique = maxEnrichmentNonUnique = ((double)totalSignalCoverage.second/28.0)/(totalControlCoverage.second/36.0);
|
||||
minEnrichmentTotal = maxEnrichmentTotal = ((double)(totalSignalCoverage.first+totalSignalCoverage.second)/28.0)/
|
||||
((totalControlCoverage.first+ totalControlCoverage.second)/36.0);
|
||||
|
||||
minUniqueSignalStrandBalance = maxUniqueSignalStrandBalance = ((double)totalSignalFwdStrands.first)/totalSignalCoverage.first;
|
||||
minNonUniqueSignalStrandBalance = maxNonUniqueSignalStrandBalance = ((double)totalSignalFwdStrands.second)/totalSignalCoverage.second;
|
||||
minUniqueControlStrandBalance = maxUniqueControlStrandBalance = ((double)totalControlFwdStrands.first)/totalControlCoverage.first;
|
||||
minNonUniqueControlStrandBalance = maxNonUniqueControlStrandBalance = ((double)totalControlFwdStrands.second)/totalControlCoverage.second;
|
||||
}
|
||||
|
||||
private void printRegion() {
|
||||
if ( regionStart == -1 ) return;
|
||||
out.print(curContigName+":"+regionStart+"-"+windowStop+"\t"+(windowStop-regionStart+1) +"\t"+
|
||||
minSignalReads.first+"-"+maxSignalReads.first+"\t"+
|
||||
minSignalReads.second+"-"+maxSignalReads.second+"\t"+
|
||||
minControlReads.first+"-"+maxControlReads.first+"\t"+
|
||||
minControlReads.second+"-"+maxControlReads.second+"\t");
|
||||
out.printf("%.2f-%.2f\t",minEnrichmentUnique,maxEnrichmentUnique);
|
||||
out.printf("%.2f-%.2f\t",minEnrichmentNonUnique,maxEnrichmentNonUnique);
|
||||
out.printf("%.2f-%.2f\t",minEnrichmentTotal,maxEnrichmentTotal);
|
||||
out.printf("%.2f-%.2f\t",minUniqueSignalStrandBalance,maxUniqueSignalStrandBalance);
|
||||
out.printf("%.2f-%.2f\t",minNonUniqueSignalStrandBalance,maxNonUniqueSignalStrandBalance);
|
||||
out.printf("%.2f-%.2f\t",minUniqueControlStrandBalance,maxUniqueControlStrandBalance);
|
||||
out.printf("%.2f-%.2f",minNonUniqueControlStrandBalance,maxNonUniqueControlStrandBalance);
|
||||
|
||||
if ( minUniqueSignalStrandBalance > 0.75 || minUniqueSignalStrandBalance < 0.25 ) out.print("\tS_U_STRAND_FILTER");
|
||||
out.println();
|
||||
|
||||
regionStart = -1; // to indicate that there is nothing left to print, the buffer is empty
|
||||
}
|
||||
}
|
||||
|
|
@ -1,244 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.oneoffprojects.walkers;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.walkers.DuplicateWalker;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.duplicates.DupUtils;
|
||||
import org.broadinstitute.sting.utils.duplicates.DuplicateComp;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
class MismatchCounter {
|
||||
long nObs = 0;
|
||||
long nMismatches = 0;
|
||||
|
||||
public void inc(long incNObs, long incNMismatches) {
|
||||
nObs += incNObs;
|
||||
nMismatches += incNMismatches;
|
||||
}
|
||||
|
||||
public void inc(boolean mismatchP) {
|
||||
inc(1, mismatchP ? 1 : 0);
|
||||
}
|
||||
|
||||
|
||||
public double mismatchRate() {
|
||||
return (double)nMismatches / nObs;
|
||||
}
|
||||
|
||||
public byte empiricalQualScore() {
|
||||
return QualityUtils.probToQual(1 - mismatchRate(), 0);
|
||||
}
|
||||
|
||||
public String headerString() {
|
||||
return "mismatchRate\tempiricalQ\tnObs\tnMismatches";
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return String.format("%.10f\t%d\t%d\t%6d", mismatchRate(), empiricalQualScore(), nObs, nMismatches);
|
||||
}
|
||||
}
|
||||
|
||||
class QualityTracker {
|
||||
final private int MAX_QUAL_SCORE = 100;
|
||||
MismatchCounter[][] mismatchesByQ = new MismatchCounter[MAX_QUAL_SCORE][MAX_QUAL_SCORE];
|
||||
|
||||
public QualityTracker() {
|
||||
for ( int i = 0; i < MAX_QUAL_SCORE; i++ ) {
|
||||
for ( int j = 0; j < MAX_QUAL_SCORE; j++ ) {
|
||||
mismatchesByQ[i][j] = new MismatchCounter();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void inc(int b1Qi, int b2Qi, boolean mismatchP, boolean orderDependent) {
|
||||
int b1Q = orderDependent ? b1Qi : Math.max(b1Qi, b2Qi);
|
||||
int b2Q = orderDependent ? b2Qi : Math.min(b1Qi, b2Qi);
|
||||
|
||||
if ( b1Q > MAX_QUAL_SCORE ) throw new RuntimeException("Unexpectedly large base quality " + b1Q);
|
||||
if ( b2Q > MAX_QUAL_SCORE ) throw new RuntimeException("Unexpectedly large base quality " + b2Q);
|
||||
|
||||
mismatchesByQ[b1Q][b2Q].inc(mismatchP);
|
||||
}
|
||||
|
||||
public void inc(DuplicateComp dc, boolean orderDependent) {
|
||||
inc(dc.getQLarger(), dc.getQSmaller(), dc.isMismatchP(), orderDependent);
|
||||
}
|
||||
|
||||
public int probMismatchQ1Q2(int q1, int q2) {
|
||||
double e1 = 1 - QualityUtils.qualToProb(q1);
|
||||
double e2 = 1 - QualityUtils.qualToProb(q2);
|
||||
double eMM = e1 * (1 - e2) + (1 - e1) * e2 - 1/3 * e1 * e2;
|
||||
return QualityUtils.probToQual(1 - eMM, 0.0);
|
||||
}
|
||||
|
||||
public void printToStream(PrintStream out, boolean filterUnobserved) {
|
||||
out.printf("Q1\tQ2\tQmin\t%s%n", mismatchesByQ[0][0].headerString());
|
||||
for ( int i = 0; i < MAX_QUAL_SCORE; i++ ) {
|
||||
for ( int j = 0; j < MAX_QUAL_SCORE; j++ ) {
|
||||
MismatchCounter mc = mismatchesByQ[i][j];
|
||||
//System.out.printf("MC = %s%n", mc);
|
||||
if ( filterUnobserved && mc.nObs == 0 )
|
||||
continue;
|
||||
out.printf("%d\t%d\t%d\t%s\t%n", i, j, probMismatchQ1Q2(i,j), mc.toString());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public class DuplicateQualsWalker extends DuplicateWalker<List<DuplicateComp>, QualityTracker> {
|
||||
@Argument(fullName="filterUnobservedQuals", required=false, doc="Show only quality bins with at least one observation in the data")
|
||||
public boolean FILTER_UNOBSERVED_QUALS = false;
|
||||
|
||||
@Argument(fullName="maxPairwiseCompsPerDupSet", required=false, doc="Maximumize number of pairwise comparisons to perform among duplicate read sets")
|
||||
public int MAX_PAIRSIZE_COMPS_PER_DUPLICATE_SET = 100;
|
||||
|
||||
@Argument(fullName="combinedQuals", required=false, doc="Combine and assess pairwise base qualities")
|
||||
public boolean COMBINE_QUALS = false;
|
||||
|
||||
@Argument(fullName="combineAllDups", required=false, doc="Combine and assess pairwise base qualities")
|
||||
public boolean COMBINE_ALL_DUPS = false;
|
||||
|
||||
@Argument(fullName="orderDependent", required=false, doc="")
|
||||
public boolean orderDependent = false;
|
||||
|
||||
@Argument(fullName="compareToUniqueReads", required=false, doc="If true, then we will compare only to unique (i.e., non-duplicated molecules) at the same duplicate site")
|
||||
public boolean compareToUniqueReads = false;
|
||||
|
||||
@Argument(fullName="comparePairToSingleton", required=false, doc="If true, then we will compare a combined dup to a random other read in the duplicate set, not a combined pair itself")
|
||||
public boolean comparePairToSingleton = false;
|
||||
|
||||
final boolean DEBUG = false;
|
||||
final private boolean ACTUALLY_DO_WORK = true;
|
||||
|
||||
public void onTraversalDone(QualityTracker result) {
|
||||
result.printToStream(out, FILTER_UNOBSERVED_QUALS);
|
||||
}
|
||||
|
||||
public QualityTracker reduceInit() {
|
||||
return new QualityTracker();
|
||||
}
|
||||
|
||||
public QualityTracker reduce(List<DuplicateComp> dupComps, QualityTracker tracker) {
|
||||
for ( DuplicateComp dc : dupComps ) {
|
||||
tracker.inc(dc, orderDependent);
|
||||
}
|
||||
|
||||
return tracker;
|
||||
}
|
||||
|
||||
// Print out data for regression
|
||||
public List<DuplicateComp> map(GenomeLoc loc, AlignmentContext context, Set<List<SAMRecord>> readSets ) {
|
||||
//logger.info(String.format("%s has %d duplicates and %d non-duplicates", loc, duplicateReads.size(), uniqueReads.size()));
|
||||
List<DuplicateComp> pairwiseComps = new ArrayList<DuplicateComp>();
|
||||
|
||||
// todo -- fixme -- the logic here is all wrong given new interface
|
||||
// if ( ! ACTUALLY_DO_WORK )
|
||||
// return pairwiseComps;
|
||||
//
|
||||
// if ( COMBINE_QUALS ) {
|
||||
// Pair<SAMRecord, SAMRecord> combinedReads = DupUtils.combinedReadPair( duplicateReads );
|
||||
// if ( combinedReads != null ) {
|
||||
// SAMRecord combined1 = combinedReads.first;
|
||||
// SAMRecord combined2 = combinedReads.second;
|
||||
//
|
||||
// if ( comparePairToSingleton )
|
||||
// pairwiseComps = addPairwiseMatches( pairwiseComps, combined1, duplicateReads.get(2), uniqueReads );
|
||||
// else
|
||||
// pairwiseComps = addPairwiseMatches( pairwiseComps, combined1, combined2, uniqueReads );
|
||||
// }
|
||||
// } else {
|
||||
// int nComparisons = 0;
|
||||
// for ( SAMRecord read1 : duplicateReads ) {
|
||||
// for ( SAMRecord read2 : duplicateReads ) {
|
||||
// if ( read1.hashCode() < read2.hashCode() && DupUtils.usableDuplicate(read1, read2) ) {
|
||||
// // the hashcode insures we don't do A vs. B and B vs. A
|
||||
// //System.out.printf("Comparing %s against %s%n", read1, read2);
|
||||
// nComparisons++;
|
||||
// pairwiseComps = addPairwiseMatches( pairwiseComps, read1, read2, uniqueReads );
|
||||
// if ( nComparisons > MAX_PAIRSIZE_COMPS_PER_DUPLICATE_SET )
|
||||
// break;
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
return pairwiseComps;
|
||||
}
|
||||
|
||||
private List<DuplicateComp> addPairwiseMatches(List<DuplicateComp> comps,
|
||||
SAMRecord read1, SAMRecord read2,
|
||||
List<SAMRecord> uniqueReads ) {
|
||||
if ( compareToUniqueReads ) {
|
||||
// we want to compare to a read in the unique read set
|
||||
if ( uniqueReads.size() > 0 ) { // there's actually something to compare to
|
||||
SAMRecord uniqueRead = uniqueReads.get(0); // might as well get the first one
|
||||
return pairwiseMatches(comps, read1, uniqueRead);
|
||||
} else {
|
||||
return comps;
|
||||
}
|
||||
} else {
|
||||
// default, just do read1 vs. read2
|
||||
return pairwiseMatches(comps, read1, read2);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates the pairwise mismatches between reads read1 and read2 and adds the result to the comps list.
|
||||
* Doesn't contain any logic deciding what to compare, just does read1 and read2
|
||||
*
|
||||
* @param comps
|
||||
* @param read1
|
||||
* @param read2
|
||||
* @return
|
||||
*/
|
||||
private List<DuplicateComp> pairwiseMatches(List<DuplicateComp> comps, SAMRecord read1, SAMRecord read2 ) {
|
||||
byte[] read1Bases = read1.getReadBases();
|
||||
byte[] read1Quals = read1.getBaseQualities();
|
||||
byte[] read2Bases = read2.getReadBases();
|
||||
byte[] read2Quals = read2.getBaseQualities();
|
||||
|
||||
for ( int i = 0; i < read1Bases.length; i++) {
|
||||
byte qual1 = read1Quals[i];
|
||||
byte qual2 = read2Quals[i];
|
||||
boolean mismatchP = ! BaseUtils.basesAreEqual(read1Bases[i], read2Bases[i]);
|
||||
DuplicateComp dc = new DuplicateComp(qual1, qual2, mismatchP);
|
||||
comps.add(dc);
|
||||
}
|
||||
|
||||
return comps;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,193 +0,0 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.walkers;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.playground.gatk.walkers.poolseq.PowerBelowFrequencyWalker;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.genotype.Genotype;
|
||||
import org.broadinstitute.sting.utils.genotype.VariantBackedByGenotype;
|
||||
import org.broadinstitute.sting.utils.genotype.Variation;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: chartl
|
||||
* Date: Nov 12, 2009
|
||||
* Time: 12:31:58 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class HapmapPoolAllelicInfoWalker extends LocusWalker<String, PrintWriter> {
|
||||
@Argument(fullName="outputFile", shortName="of", doc="File to write to", required=true)
|
||||
public String outputFileString = null;
|
||||
@Argument(fullName="numIndividualsInPool", shortName="ps",doc="Pool size",required = true)
|
||||
public int poolSize = -1;
|
||||
@Argument(fullName="sampleNames", shortName="samples", doc="Sample name bindings", required=true)
|
||||
public String sampleNameFile = null;
|
||||
@Argument(fullName="minCallQuality", shortName="q", doc="Ignore calls with below this quality, defaults to -1")
|
||||
public double minCallQ = -1;
|
||||
|
||||
private PrintWriter output;
|
||||
private static double EPSILON = Math.pow(10,-4);
|
||||
private String[] sampleNames = null;
|
||||
private PowerBelowFrequencyWalker powerWalker = null;
|
||||
private ConcordanceTruthTable ctt = null;
|
||||
|
||||
public void initialize() {
|
||||
sampleNames = generateNameTableFromFile(sampleNameFile);
|
||||
powerWalker = new PowerBelowFrequencyWalker();
|
||||
powerWalker.initialize();
|
||||
powerWalker.setPoolSize(poolSize);
|
||||
ctt = new ConcordanceTruthTable(poolSize);
|
||||
}
|
||||
|
||||
public PrintWriter reduceInit() {
|
||||
try {
|
||||
output = new PrintWriter(outputFileString);
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new StingException("File "+outputFileString+" could not be opened.", e);
|
||||
}
|
||||
output.printf("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%n","Chrom","Pos","Ref","Var","Num_Alleles","Num_Chips","Depth","Power","Support","Called");
|
||||
//System.out.printf("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%n","Chrom","Pos","Ref","Var","Num_Alleles","Depth","Power","Support","Called");
|
||||
return output;
|
||||
}
|
||||
|
||||
public String map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
GenomeLoc loc = context.getLocation();
|
||||
String chrom = loc.getContig();
|
||||
long pos = loc.getStart();
|
||||
char refBase = Character.toUpperCase(ref.getBase());
|
||||
List<Pair<Genotype, Genotype>> chips = getChips(sampleNames, tracker);
|
||||
Pair<Integer,Pair<Integer,Integer>> alleleFreqInfo = ctt.getPooledAlleleFrequency(chips,refBase);
|
||||
char alternate;
|
||||
if ( alleleFreqInfo.first == ConcordanceTruthTable.VARIANT ) {
|
||||
//System.out.println(refBase + " " + alleleFreqInfo.getFirst().getBases());
|
||||
alternate = getAlternateBase(chips,refBase);
|
||||
|
||||
} else {
|
||||
return null; // early return
|
||||
}
|
||||
int numVariantAllele = alleleFreqInfo.getSecond().getFirst();
|
||||
int numChipsObserved = alleleFreqInfo.getSecond().getSecond();
|
||||
int depth = context.size();
|
||||
double power = powerWalker.calculatePowerAtFrequency(context,numVariantAllele);
|
||||
int called;
|
||||
|
||||
Variation call = tracker.lookup("calls",Variation.class);
|
||||
if ( call == null ) {
|
||||
called = 0;
|
||||
} else if ( call.isReference() || call.getNegLog10PError() < minCallQ-EPSILON ) {
|
||||
called = 0;
|
||||
} else {
|
||||
called = 1;
|
||||
}
|
||||
|
||||
ReadBackedPileup p = context.getPileup();
|
||||
int support = p.getBaseCounts()[BaseUtils.simpleBaseToBaseIndex(alternate)];
|
||||
|
||||
// sanity check
|
||||
if ( refBase == alternate ) {
|
||||
if ( alleleFreqInfo.first == ConcordanceTruthTable.VARIANT ) {
|
||||
;//logger.warn("Called as a variant! Ref: "+ refBase +"Chip data: " + alleleFreqInfo.getFirst().getBases());
|
||||
}
|
||||
}
|
||||
|
||||
return String.format("%s\t%d\t%c\t%c\t%d\t%d\t%d\t%f\t%d\t%d",chrom,pos,refBase,alternate,numVariantAllele,numChipsObserved,depth,power,support,called);
|
||||
|
||||
}
|
||||
|
||||
public char getAlternateBase(List<Pair<Genotype, Genotype>> chips, char ref) {
|
||||
for ( Pair<Genotype, Genotype> chip : chips ) {
|
||||
Genotype g = chip.first;
|
||||
char[] bases = g.getBases().toCharArray();
|
||||
if ( Character.toUpperCase(bases[0]) != ref )
|
||||
return bases[0];
|
||||
if ( Character.toUpperCase(bases[1]) != ref )
|
||||
return bases[1];
|
||||
}
|
||||
return ref;
|
||||
}
|
||||
|
||||
public PrintWriter reduce(String s, PrintWriter p) {
|
||||
if ( s == null ) {
|
||||
// do nothing
|
||||
return p;
|
||||
} else {
|
||||
//System.out.printf("%s%n",s);
|
||||
output.printf("%s%n",s);
|
||||
return p;
|
||||
}
|
||||
}
|
||||
|
||||
public void onTraversalDone(PrintWriter p) {
|
||||
output.close();
|
||||
}
|
||||
|
||||
private List<Pair<Genotype,Genotype>> getChips(String[] rodNames, RefMetaDataTracker tracker) {
|
||||
List<Pair<Genotype, Genotype>> chips = new ArrayList <Pair<Genotype,Genotype>>(rodNames.length);
|
||||
for ( String name : rodNames ) {
|
||||
List<Object> rods = tracker.getReferenceMetaData(name);
|
||||
Variation chip = (rods.size() == 0 ? null : (Variation)rods.get(0));
|
||||
if ( chip != null ) {
|
||||
// chips must be Genotypes
|
||||
if ( !(chip instanceof VariantBackedByGenotype) )
|
||||
throw new StingException("Failure: trying to analyze genotypes using non-genotype truth data");
|
||||
chips.add(new Pair<Genotype,Genotype>(((VariantBackedByGenotype)chip).getCalledGenotype(),null));
|
||||
}
|
||||
}
|
||||
|
||||
return chips;
|
||||
}
|
||||
// private methods for reading in names from a file
|
||||
|
||||
private String[] generateNameTableFromFile(String file) {
|
||||
BufferedReader reader;
|
||||
try {
|
||||
reader = new BufferedReader(new FileReader(file));
|
||||
} catch( FileNotFoundException e) {
|
||||
String errMsg = "Hapmap pool file at "+file+" was not found. Please check filepath.";
|
||||
throw new StingException(errMsg, e);
|
||||
}
|
||||
|
||||
LinkedList<String> nameList = new LinkedList<String>();
|
||||
|
||||
while(continueReading(reader)) {
|
||||
String line = readLine(reader);
|
||||
nameList.add(line);
|
||||
}
|
||||
|
||||
return nameList.toArray(new String[nameList.size()]);
|
||||
}
|
||||
|
||||
private boolean continueReading(BufferedReader reader) {
|
||||
boolean continueReading;
|
||||
try {
|
||||
continueReading = reader.ready();
|
||||
} catch(IOException e) {
|
||||
continueReading = false;
|
||||
}
|
||||
return continueReading;
|
||||
}
|
||||
|
||||
private String readLine(BufferedReader reader) {
|
||||
String line;
|
||||
try {
|
||||
line = reader.readLine();
|
||||
} catch( IOException e) {
|
||||
String errMsg = "BufferedReader pointing to "+reader.toString()+" was declared ready but no line could be read from it.";
|
||||
throw new StingException(errMsg,e);
|
||||
}
|
||||
return line;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -1,37 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import org.broad.tribble.vcf.VCFHeaderLineType;
|
||||
import org.broad.tribble.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Arrays;
|
||||
|
||||
public class QualityAdjustedSecondBaseLod implements InfoFieldAnnotation, ExperimentalAnnotation {
|
||||
private final String KEY_NAME = "Qual_Adjusted_2blod";
|
||||
private final double CHI_LOD_MAX = -1000.0;
|
||||
private final SecondBaseSkew skewCalc = new SecondBaseSkew();
|
||||
private final double log10e = Math.log10(Math.E);
|
||||
private final double log10half = Math.log10(1.0/2);
|
||||
|
||||
public List<String> getKeyNames() { return Arrays.asList(KEY_NAME); }
|
||||
|
||||
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(KEY_NAME, 1, VCFHeaderLineType.Float, "Adjusted residual quality based on second-base skew")); }
|
||||
|
||||
public Map<String, Object> annotate( RefMetaDataTracker tracker, ReferenceContext ref, Map<String, StratifiedAlignmentContext> contexts, VariantContext vc) {
|
||||
String chi = skewCalc.getAnnotation(ref, contexts, vc);
|
||||
if ( chi == null )
|
||||
return null;
|
||||
double chi_square = Double.valueOf(chi);
|
||||
double chi_loglik = chi_square <= 0.0 ? 0.0 : Math.max(-(chi_square/2.0)*log10e + log10half,CHI_LOD_MAX); // cap it...
|
||||
Map<String, Object> map = new HashMap<String, Object>();
|
||||
map.put(getKeyNames().get(0), String.format("%f", 10*(vc.getNegLog10PError() + chi_loglik)));
|
||||
return map;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,115 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import org.broad.tribble.vcf.VCFHeaderLineType;
|
||||
import org.broad.tribble.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Arrays;
|
||||
|
||||
|
||||
public class SecondBaseSkew implements InfoFieldAnnotation, ExperimentalAnnotation {
|
||||
private final static double epsilon = Math.pow(10.0,-12.0);
|
||||
private final static String KEY_NAME = "2b_Chi";
|
||||
private final static double[] UNIFORM_ON_OFF_RATIO = {1.0/3.0, 2.0/3.0};
|
||||
private double[] proportionExpectations = UNIFORM_ON_OFF_RATIO;
|
||||
|
||||
public List<String> getKeyNames() { return Arrays.asList(KEY_NAME); }
|
||||
|
||||
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(KEY_NAME, 1, VCFHeaderLineType.Float, "Chi-square Secondary Base Skew")); }
|
||||
|
||||
public Map<String, Object> annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, StratifiedAlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
if ( stratifiedContexts.size() == 0 )
|
||||
return null;
|
||||
|
||||
String annotation = getAnnotation(ref, stratifiedContexts, vc);
|
||||
if ( annotation == null )
|
||||
return null;
|
||||
Map<String, Object> map = new HashMap<String, Object>();
|
||||
map.put(getKeyNames().get(0), annotation);
|
||||
return map;
|
||||
}
|
||||
|
||||
public String getAnnotation(ReferenceContext ref, Map<String, StratifiedAlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
if ( !vc.isBiallelic() || !vc.isSNP() )
|
||||
return null;
|
||||
|
||||
char alternate = vc.getAlternateAllele(0).toString().charAt(0);
|
||||
|
||||
Pair<Integer, Integer> depth = new Pair<Integer, Integer>(0, 0);
|
||||
for ( String sample : stratifiedContexts.keySet() ) {
|
||||
//Pair<Integer,Integer> sampleDepth = getSecondaryPileupNonrefCount(ref.getBase(),stratifiedContexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getPileup(), alternate);
|
||||
Pair<Integer, Integer> sampleDepth = getSecondaryPileupNonrefCount(ref.getBaseAsChar(), stratifiedContexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup(), alternate);
|
||||
depth.first += sampleDepth.first;
|
||||
depth.second += sampleDepth.second;
|
||||
}
|
||||
|
||||
if ( depth.first == 0 )
|
||||
return null;
|
||||
|
||||
double biasedProportion = (1.0 + depth.second) / (1.0 + depth.first);
|
||||
double p_transformed = transform(biasedProportion, depth.first+1);
|
||||
double expected_transformed = transform(proportionExpectations[0], depth.first+1);
|
||||
double chi_square = Math.signum(biasedProportion - proportionExpectations[0])*Math.min(Math.pow(p_transformed - expected_transformed, 2), Double.MAX_VALUE);
|
||||
return String.format("%f", chi_square);
|
||||
}
|
||||
|
||||
private double transform( double proportion, int depth ) {
|
||||
proportion = proportion - epsilon;
|
||||
return proportion / ( Math.sqrt ( proportion*(1-proportion)/depth ) );
|
||||
}
|
||||
|
||||
private Pair<Integer, Integer> getSecondaryPileupNonrefCount(char ref, ReadBackedPileup p, char snp ) {
|
||||
int variantDepth = 0;
|
||||
int variantsWithRefSecondBase = 0;
|
||||
|
||||
for (PileupElement pile : p ) {
|
||||
byte pbase = pile.getBase();
|
||||
byte sbase = pile.getSecondBase();
|
||||
|
||||
if ( BaseUtils.isRegularBase((char)sbase) && BaseUtils.basesAreEqual(pbase, (byte) snp) ) {
|
||||
variantDepth++;
|
||||
if ( BaseUtils.basesAreEqual(sbase, (byte)ref) ) {
|
||||
variantsWithRefSecondBase++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return new Pair<Integer, Integer>(variantDepth, variantsWithRefSecondBase);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,217 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.concordance;
|
||||
|
||||
import org.broad.tribble.vcf.VCFGenotypeRecord;
|
||||
import org.broad.tribble.vcf.VCFHeader;
|
||||
import org.broad.tribble.vcf.VCFHeaderLine;
|
||||
import org.broad.tribble.vcf.VCFRecord;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.classloader.PackageUtils;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.*;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
* Determines the concordance between multiple VCF call sets at each position.
|
||||
* Users can specify which concordance tests should be run.
|
||||
*/
|
||||
@Requires(value={DataSource.REFERENCE})
|
||||
@Reference(window=@Window(start=-20,stop=20))
|
||||
public class CallsetConcordanceWalker extends RodWalker<Integer, Integer> {
|
||||
@Argument(fullName="concordance_output", shortName="CO", doc="VCF file to which output should be written", required=true)
|
||||
private File OUTPUT = null;
|
||||
@Argument(fullName="concordanceType", shortName="CT", doc="Concordance subset types to apply to given callsets. Syntax: 'type[:key1=arg1,key2=arg2,...]'", required=false)
|
||||
private String[] TYPES = null;
|
||||
@Argument(fullName="list", shortName="ls", doc="List the available concordance types and exit", required=false)
|
||||
private Boolean LIST_ONLY = false;
|
||||
|
||||
|
||||
// the concordance tests to run
|
||||
private ArrayList<ConcordanceType> requestedTypes;
|
||||
|
||||
// VCF writer for the output of the concordance tests
|
||||
private VCFWriter vcfWriter;
|
||||
|
||||
// a map of rod name to uniquified sample name
|
||||
private HashMap<Pair<String, String>, String> rodNamesToSampleNames = new HashMap<Pair<String, String>, String>();
|
||||
|
||||
|
||||
/**
|
||||
* Prepare the output file and the list of available features.
|
||||
*/
|
||||
public void initialize() {
|
||||
|
||||
// get the possible concordance types
|
||||
List<Class<? extends ConcordanceType>> classes = PackageUtils.getClassesImplementingInterface(ConcordanceType.class);
|
||||
|
||||
// print and exit if that's what was requested
|
||||
if ( LIST_ONLY ) {
|
||||
out.println("\nAvailable concordance types:");
|
||||
for (int i = 0; i < classes.size(); i++)
|
||||
out.println("\t" + classes.get(i).getSimpleName());
|
||||
out.println();
|
||||
System.exit(0);
|
||||
}
|
||||
|
||||
// get the list of all sample names from the various input rods (they need to be uniquified in case there's overlap)
|
||||
HashSet<String> samples = new HashSet<String>();
|
||||
SampleUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, rodNamesToSampleNames);
|
||||
|
||||
for ( java.util.Map.Entry<Pair<String, String>, String> entry : rodNamesToSampleNames.entrySet() ) {
|
||||
logger.debug("Uniquified sample mapping: " + entry.getKey().first + "/" + entry.getKey().second + " -> " + entry.getValue());
|
||||
}
|
||||
|
||||
// initialize requested concordance types
|
||||
requestedTypes = new ArrayList<ConcordanceType>();
|
||||
if (TYPES != null) {
|
||||
for ( String requestedTypeString : TYPES ) {
|
||||
String[] requestedPieces = requestedTypeString.split(":");
|
||||
String requestedType = requestedPieces[0];
|
||||
|
||||
boolean foundClass = false;
|
||||
for ( Class type : classes ) {
|
||||
|
||||
if (requestedType.equalsIgnoreCase(type.getSimpleName())) {
|
||||
foundClass = true;
|
||||
try {
|
||||
ConcordanceType concordance = (ConcordanceType)type.newInstance();
|
||||
HashMap<String,String> requestedArgs = new HashMap<String,String>();
|
||||
if ( requestedPieces.length == 2 ) {
|
||||
String[] argStrings = requestedPieces[1].split(",");
|
||||
for (int i = 0; i < argStrings.length; i++ ) {
|
||||
String[] arg = argStrings[i].split("=");
|
||||
if ( arg.length == 2 )
|
||||
requestedArgs.put(arg[0], arg[1]);
|
||||
}
|
||||
}
|
||||
|
||||
concordance.initialize(requestedArgs, samples);
|
||||
requestedTypes.add(concordance);
|
||||
break;
|
||||
} catch (InstantiationException e) {
|
||||
throw new StingException(String.format("Cannot instantiate concordance class '%s': must be concrete class", type.getSimpleName()));
|
||||
} catch (IllegalAccessException e) {
|
||||
throw new StingException(String.format("Cannot instantiate concordance class '%s': must have no-arg constructor", type.getSimpleName()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ( !foundClass )
|
||||
throw new StingException("The requested concordance type (" + requestedType + ") isn't a valid concordance option");
|
||||
}
|
||||
}
|
||||
|
||||
// set up the header fields
|
||||
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
|
||||
hInfo.add(new VCFHeaderLine("source", "CallsetConcordance"));
|
||||
hInfo.add(new VCFHeaderLine("note", "\"This file represents a concordance test of various call sets - NOT the output from a multi-sample caller\""));
|
||||
hInfo.addAll(getVCFAnnotationDescriptions(requestedTypes));
|
||||
|
||||
vcfWriter = new VCFWriter(OUTPUT);
|
||||
vcfWriter.writeHeader(new VCFHeader(hInfo, samples));
|
||||
}
|
||||
|
||||
public static Set<VCFHeaderLine> getVCFAnnotationDescriptions(Collection<ConcordanceType> types) {
|
||||
|
||||
TreeSet<VCFHeaderLine> descriptions = new TreeSet<VCFHeaderLine>();
|
||||
for ( ConcordanceType type : types )
|
||||
descriptions.add(type.getInfoDescription());
|
||||
|
||||
return descriptions;
|
||||
}
|
||||
|
||||
public Integer map(RefMetaDataTracker rodData, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( rodData == null ) // RodWalkers can make funky map calls
|
||||
return 0;
|
||||
|
||||
// get all of the vcf rods at this locus
|
||||
Map<VCFRecord,String> vcfRods = new LinkedHashMap<VCFRecord,String>();
|
||||
Iterator<GATKFeature> rods = rodData.getAllRods().iterator();
|
||||
while (rods.hasNext()) {
|
||||
GATKFeature rod = rods.next();
|
||||
if ( rod.getUnderlyingObject() instanceof VCFRecord ) {
|
||||
if (vcfRods.containsKey(rod)) throw new StingException("Duplicate VCF's found");
|
||||
vcfRods.put((VCFRecord)rod.getUnderlyingObject(),rod.getName());
|
||||
}
|
||||
}
|
||||
|
||||
if ( vcfRods.size() == 0 )
|
||||
return 0;
|
||||
|
||||
// pull out all of the individual calls from the rods and insert into a map based on the
|
||||
// mapping from rod/sample to uniquified name
|
||||
HashMap<String, VCFGenotypeRecord> samplesToRecords = new HashMap<String, VCFGenotypeRecord>();
|
||||
for ( VCFRecord rod : vcfRods.keySet() ) {
|
||||
List<VCFGenotypeRecord> records = rod.getVCFGenotypeRecords();
|
||||
for ( VCFGenotypeRecord vcfRec : records ) {
|
||||
String uniquifiedSample = rodNamesToSampleNames.get(new Pair<String, String>(vcfRods.get(rod), vcfRec.getSampleName()));
|
||||
if ( uniquifiedSample == null )
|
||||
throw new StingException("Unexpected sample encountered: " + vcfRec.getSampleName() + " in rod " + vcfRods.get(rod));
|
||||
|
||||
samplesToRecords.put(uniquifiedSample, vcfRec);
|
||||
}
|
||||
}
|
||||
|
||||
// create a merged record from all input VCFs
|
||||
VCFRecord record = VCFUtils.mergeRecords(vcfRods, rodNamesToSampleNames);
|
||||
|
||||
// add in the info fields to the new record based on the results of each of the relevant concordance tests
|
||||
for ( ConcordanceType type : requestedTypes ) {
|
||||
String result = type.computeConcordance(samplesToRecords, ref);
|
||||
if ( result != null ) {
|
||||
record.addInfoField(type.getInfoName(), result);
|
||||
}
|
||||
}
|
||||
|
||||
// emit the new record
|
||||
vcfWriter.addRecord(record);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
public Integer reduceInit() { return 0; }
|
||||
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
return sum + value;
|
||||
}
|
||||
|
||||
public void onTraversalDone(Integer result) {
|
||||
vcfWriter.close();
|
||||
out.printf("Processed %d loci.\n", result);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,16 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.concordance;
|
||||
|
||||
import org.broad.tribble.vcf.VCFGenotypeRecord;
|
||||
import org.broad.tribble.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
public interface ConcordanceType {
|
||||
|
||||
public void initialize(Map<String,String> args, Set<String> samples);
|
||||
public String computeConcordance(Map<String, VCFGenotypeRecord> samplesToRecords, ReferenceContext ref);
|
||||
public String getInfoName();
|
||||
public VCFInfoHeaderLine getInfoDescription();
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue