Reorganized the codebase beneath top-level public and private directories, removing the playground and oneoffprojects directories in the process. Updated build.xml accordingly.
David Roazen 2011-06-28 06:55:19 -04:00
parent b46279d62e
commit 3c9497788e
1543 changed files with 71 additions and 89047 deletions


@@ -1,461 +0,0 @@
#These functions each make a page for the ADPR. They assume a pdf with the following parameters for best formatting:
#pdf(file=paste(sample_sets, ".pdf", sep=""), width=22, height=15, pagecentre=TRUE, pointsize=24)
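#A hypothetical driver, for orientation only ("lanes", "samps" and "titv" stand in for the
#tables returned by datapuller() below; the protocol and sequencer strings are placeholders):
# sample_sets<-"MyProject"   #global read by every page function for titles and file names
# pdf(file=paste(sample_sets, ".pdf", sep=""), width=22, height=15, pagecentre=TRUE, pointsize=24)
# tearsheet(lanes, samps, titv, "Hybrid selection", "Illumina HiSeq")
# fingerprints(lanes, sample_sets)
# snps_called(lanes, sample_sets)
# titvsamp(titv)
# dev.off()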
library(gplots)
library(ReadImages)
##defaults<-par(no.readonly = TRUE)
tearsheet<-function(lanetable, sampletable, variant, Protocol, Sequencer){
#define layout
layout(matrix(c(1,1,2,4,3,5), ncol=2, nrow=3, byrow=TRUE), heights=c(1, 2.5, 2.5), respect=FALSE)
#prep for title bar
title=paste(sample_sets, ": TEAR SHEET", sep="")
drop<-read.jpeg("tearsheetdrop.jpg")
#plot title bar
par(mar=c(0,0,0,0))
plot(drop)
text(100, 40, title, family="serif", adj=c(0,0), cex=3, col=gray(.25))
#calc by lane stuff
sdlane<-rep("NA", 6)
meanlane<-sdlane
attach(lanetable);
callable.target<-HS_TARGET_TERRITORY[1];
singlelanes<-length(which(Lane.Type=="Single"));
pairedlanes<-length(which(Lane.Type=="Paired"));
meanlane[1]<-round(mean(AL_TOTAL_READS, na.rm=TRUE)/10^6, 2);
sdlane[1]<-round(sd(AL_TOTAL_READS, na.rm=TRUE)/10^6, 2);
meanlane[2]<-round(mean(HS_ON_TARGET_BASES, na.rm=TRUE)/10^6, 2);
sdlane[2]<-round(sd(HS_ON_TARGET_BASES, na.rm=TRUE)/10^6, 2);
meanlane[3]<-round(mean(HS_MEAN_TARGET_COVERAGE, na.rm=TRUE));
sdlane[3]<-round(sd(HS_MEAN_TARGET_COVERAGE, na.rm=TRUE));
meanlane[4]<-round(mean(HS_PCT_TARGET_BASES_10X, na.rm=TRUE));
meanlane[5]<-round(mean(HS_PCT_TARGET_BASES_20X, na.rm=TRUE));
meanlane[6]<-round(mean(HS_PCT_TARGET_BASES_30X, na.rm=TRUE));
sdlane[4]<-round(sd(HS_PCT_TARGET_BASES_10X, na.rm=TRUE));
sdlane[5]<-round(sd(HS_PCT_TARGET_BASES_20X, na.rm=TRUE));
sdlane[6]<-round(sd(HS_PCT_TARGET_BASES_30X, na.rm=TRUE))
names<-paste(Flowcell, "-", Lane, sep="")
detach(lanetable)
meansamp<-rep("NA", 6)
sdsamp<-meansamp
#Calc by sample metrics
attach(sampletable);
baits<-Bait.Set[1]
alllanes<-signif(sum(X..Lanes.included.in.aggregation, na.rm = TRUE))
mean.lanes.samp<-signif(mean(X..Lanes.included.in.aggregation, na.rm = TRUE));
sd.lanes.samp<-signif(sd(X..Lanes.included.in.aggregation, na.rm=TRUE));
mean.mrl.samp<-signif(mean(Mean.Read.Length, na.rm=TRUE));
sd.mrl.samp<-signif(sd(Mean.Read.Length, na.rm=TRUE));
meansamp[1]<-round(mean(Total.Reads, na.rm=TRUE)/10^6, 2);
sdsamp[1]<-round(sd(Total.Reads, na.rm=TRUE)/10^6, 2);
meansamp[2]<-round(mean(On.Target.Bases..HS., na.rm=TRUE)/10^6, 2);
sdsamp[2]<-round(sd(On.Target.Bases..HS., na.rm=TRUE)/10^6, 2);
meansamp[3]<-round(mean(Mean.Target.Coverage..HS., na.rm=TRUE));
sdsamp[3]<-round(sd(Mean.Target.Coverage..HS., na.rm=TRUE));
meansamp[4]<-round(mean(PCT.Target.Bases.10x..HS., na.rm=TRUE));
meansamp[5]<-round(mean(PCT.Target.Bases.20x..HS., na.rm=TRUE));
meansamp[6]<-round(mean(PCT.Target.Bases.30x..HS., na.rm=TRUE));
sdsamp[4]<-round(sd(PCT.Target.Bases.10x..HS., na.rm=TRUE));
sdsamp[5]<-round(sd(PCT.Target.Bases.20x..HS., na.rm=TRUE));
sdsamp[6]<-round(sd(PCT.Target.Bases.30x..HS., na.rm=TRUE));
detach(sampletable);
#calc variant stuff
attach(variant)
SNPS<-c(ti_count[which(filter_name=="called")]+tv_count[which(filter_name=="called")])
titvs<-c(ti.tv_ratio[which(filter_name=="called")])
detach(variant)
#prep stuff.
summary<-c(nrow(sampletable), Protocol, baits, paste(callable.target, "bases"))
summary2<-c(Sequencer, alllanes, paste(mean.lanes.samp, "+/-", sd.lanes.samp), paste(singlelanes, "single lanes,", pairedlanes, "paired lanes"), paste(mean.mrl.samp, "+/-", sd.mrl.samp))
samps<-paste(meansamp, c("M", "M", "x", "%", "%", "%"), " +/- ", sdsamp, c("M", "M", "x", "%", "%", "%"), sep="")
lanes<-paste(meanlane, c("M", "M", "x", "%", "%", "%"), " +/- ", sdlane, c("M", "M", "x", "%", "%", "%"), sep="")
#print out 4 tables in R
table1<-cbind(summary)
rownames(table1)<-c("Samples","Sequencing Protocol", "Bait Design","Callable Target")
par(mar=c(4,4,4,4))
textplot(table1, col.rownames="darkblue", show.colnames=FALSE, cex=1.75)
title(main="Project Summary", family="sans", cex.main=2)
table2<-cbind(lanes, samps)
colnames(table2)<-c("per lane", "per sample")
rownames(table2)<-c("Reads", "Used bases", "Average target coverage", "% loci covered to 10x", "% loci covered to 20x","% loci covered to 10x")
par(mar=c(4,4,4,4))
textplot(table2, rmar=1, col.rownames="dark blue", cex=1.25)
title(main="Bases Summary", family="sans", cex.main=1.75)
table3<-cbind(summary2)
rownames(table3)<-c("Sequencer", "Used lanes", "Used lanes per sample", "Lane pariteies", "Read legnths")
par(mar=c(4,4,4,4))
textplot(table3, rmar=1, col.rownames="dark blue", show.colnames=FALSE, cex=1.25)
title(main="Sequencing Summary", family="sans", cex.main=1.75)
table4<-cbind(SNPS, titvs)
rownames(table4)<-c("All SNPs", "Known SNPs", "Novel SNPs")
colnames(table4)<-c("SNPs Found", "Ti/Tv")
textplot(table4, rmar=1, col.rownames="dark blue", cex=1.25)
title(main="Variant Summary", family="sans", cex.main=1.75)
}
fingerprints<-function(lanetable, sample_sets){
attach(lanetable)
#define layout
layout(matrix(c(1,2,3), ncol=1, nrow=3, byrow=TRUE), heights=c(1, 3,2), respect=FALSE)
#prep for title bar
title=paste(sample_sets, ": Fingerprint Status", sep="")
drop<-read.jpeg("adprdrop.jpg")
#plot title bar
par(mar=c(0,0,0,0))
plot(drop)
text(100, 40, title, family="serif", adj=c(0,0), cex=3, col=gray(.25))
#prep for FP plot
badsnps<-union(which(FP_CONFIDENT_MATCHING_SNPS<15), which(FP_CONFIDENT_CALLS<15)) #lanes with too few confident or matching calls at fingerprint sites
colors<-c(rep("Blue", length(FP_CONFIDENT_CALLS)))
colors[badsnps]<-"Red"
ticks<-c(match(unique(Flowcell), Flowcell) )
ys=rep(c(0, max(SNP_TOTAL_SNPS, na.rm=TRUE)*1.04, max(SNP_TOTAL_SNPS, na.rm=TRUE)*1.04, 0, 0), ceiling(length(ticks)/2))
shader<-ticks[c(rep(c(1,1,2,2,1), ceiling(length(ticks)/2))+sort(rep(seq(0, length(ticks),by=2), 5)))]-0.5
if((length(ticks)%%2 > 0)){
shader[(length(shader)-2):(length(shader)-1)]<-length(Flowcell)+0.5
}
shader<-na.omit(shader)
#plot FP plot
par(mar=c(10, 6, 8, 3))
plot(1:length(FP_CONFIDENT_MATCHING_SNPS), FP_CONFIDENT_MATCHING_SNPS, pch=NA, ylim=c(0,24), ylab="Fingerprint calls", xlab="", xaxt="n", col=colors, main="Fingerprint Calling and Matching Sorted by Flowcell", cex.main=2)
axis(side=3, at=c(1:length(Flowcell)), labels=Lane[order(Flowcell)], cex.axis=0.5, padj=1,tick=FALSE)
axis(side=1, at=c(ticks), labels=sort(unique(Flowcell)), tick=FALSE, las=2)
mtext("Lane",side=3, cex=.75, line=1.5)
mtext("Flowcell",side=1, cex=1.25, line=8)
polygon(shader, ys, border="black", lty=0, col="gray")
points(1:length(FP_CONFIDENT_MATCHING_SNPS), FP_CONFIDENT_MATCHING_SNPS, pch=4, col=colors)
points(1:length(FP_CONFIDENT_MATCHING_SNPS), FP_CONFIDENT_CALLS, pch=3, col=colors)
if(length(badsnps)>0){
legend("bottomright", legend=c("Confident calls at fingerprint sites by lane", "Confident matching calls at fingerprint sites by lane", "Confident calls in bad lanes", "Confident matching calls in bad lanes", "All Confident calls match fingerprint sites"), pch=c(4,3,4,3,8), col=c("Blue", "Blue", "Red", "Red", "Black" ), bg="White")
mtext("Some problematic fingerprint sites", side=3)
}else{
legend("bottomright", legend=c("Confident calls at fingerprint sites by lane", "Confident matching calls at fingerprint sites by lane", "All Confident calls match fingerprint sites"), pch=c(4, 3, 8), col="Blue", bg="White")
}
#plot some summary of FP stuff
textplot("Some summary of Fingerprint problems will go here ", valign="top", family="sans")
detach(lanetable)
}
snps_called<-function(lanetable, sample_sets){
attach(lanetable)
#define layout for this page
layout(matrix(c(1,1,2, 3, 4,4), ncol=2, nrow=3, byrow=TRUE), widths = c(3,1), heights=c(1, 3,2), respect=FALSE)
#prep for title bar
title=paste(sample_sets, ": SNPs Called by Lane", sep="")
drop<-read.jpeg("adprdrop.jpg")
#plot title bar
par(mar=c(0,0,0,0))
plot(drop)
text(100, 40, title, family="serif", adj=c(0,0), cex=3, col=gray(.25))
#prep for snp plot
ticks<-c(match(unique(Flowcell), sort(Flowcell)) )
ys=rep(c(min(SNP_TOTAL_SNPS, na.rm=TRUE), max(SNP_TOTAL_SNPS, na.rm=TRUE)*1.04, max(SNP_TOTAL_SNPS, na.rm=TRUE)*1.04, min(SNP_TOTAL_SNPS, na.rm=TRUE), min(SNP_TOTAL_SNPS, na.rm=TRUE)), ceiling(length(ticks)/2))
shader<-ticks[c(rep(c(1,1,2,2,1), ceiling(length(ticks)/2))+sort(rep(seq(0, length(ticks),by=2), 5)))]-0.5
if((length(ticks)%%2 > 0)){
shader[(length(shader)-2):(length(shader)-1)]<-length(Flowcell)+0.5
}
shader<-na.omit(shader)
cols<-rep("blue", length(SNP_TOTAL_SNPS))
cols[which(SNP_TOTAL_SNPS %in% boxplot.stats(SNP_TOTAL_SNPS)$out)]<-"red"
#plot snp plot
par(ylog=TRUE, mar=c(10, 6, 4, 0))
plot(1:length(SNP_TOTAL_SNPS), SNP_TOTAL_SNPS[order(Flowcell)],xlab="",
ylab="SNPs Called",
ylim = c(min(SNP_TOTAL_SNPS, na.rm=TRUE), max(SNP_TOTAL_SNPS, na.rm=TRUE)),
xaxt="n",
pch=NA)
title(main="SNPs Called in Each Lane sorted by Flowcell", line=3, cex=1.5)
axis(side=3, at=c(1:length(Flowcell)), labels=Lane[order(Flowcell)], cex.axis=0.5, padj=1,tick=FALSE)
axis(side=1, at=c(ticks), labels=sort(unique(Flowcell)), tick=FALSE, las=2)
mtext("Lane",side=3, cex=.75, line=1.5)
mtext("Flowcell",side=1, cex=1.25, line=8)
polygon(shader, ys, border="black", lty=0, col="gray")
points(1:length(SNP_TOTAL_SNPS), SNP_TOTAL_SNPS, col=cols, pch=19)
if(length(boxplot.stats(SNP_TOTAL_SNPS)$out)>0){
legend("topright", legend=c("Normal SNP Call Counts", "Outlier SNP Call Counts"), pch=19, col=c("Blue", "red"), bg="White")
}
#plot boxplot
par(ylog=TRUE, mar=c(10, 0, 4, 2))
boxplot(SNP_TOTAL_SNPS, main="SNPs Called in Lane", ylab="", yaxt="n", ylim = c(min(SNP_TOTAL_SNPS, na.rm=TRUE), max(SNP_TOTAL_SNPS, na.rm=TRUE)), ylog=TRUE)
if(length(boxplot.stats(SNP_TOTAL_SNPS)$out)==0){
mtext("No outliers", side=1, line=4)
}else{
mtext(paste("Outlier SNP call counts in ", length(boxplot.stats(SNP_TOTAL_SNPS)$out), "lanes"), side=1, line=4)
}
#Plot variant summary below
textplot("Variant Summary will go here", valign="top", family="sans")
detach(lanetable)
}
titvsamp<-function(metricsbysamp){
attach(metricsbysamp)
#define layout
layout(matrix(c(1,2,3), ncol=1, nrow=3, byrow=TRUE), heights=c(1, 3,2), respect=FALSE)
#prep for title bar
title=paste(sample_sets, ": Ti/Tv Ratio by Sample", sep="")
drop<-read.jpeg("adprdrop.jpg")
#plot title bar
par(mar=c(0,0,0,0))
plot(drop)
text(100, 40, title, family="serif", adj=c(0,0), cex=3, col=gray(.25))
#prep for titv graph
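#the upper whisker of the filtered-call Ti/Tv distribution is used as a threshold ("min");
#samples whose novel Ti/Tv falls below it get shaded as suspect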
boxplot.stats(TiTvRatio[which(filter_name=="filtered")])$stats[5]->min
shade<-which(sort(TiTvRatio[which(novelty_name=="novel" & filter_name=="called")], decreasing=TRUE)<min)-.5
#plot titv graph
par(mar=c(9, 5, 4, 2))
plot(seq(1:length(unique(row))), sort(TiTvRatio[which(novelty_name=="novel" & filter_name=="called")], decreasing=TRUE),
xaxt="n",
main="Ti/Tv for Novel and Known SNP calls",
ylab="Ti/Tv",
xlab="",
col="red",
cex.main=2,
cex.lab=1.25,
cex.axis=1,
pch=1)
polygon(c(min(shade), min(shade), max(shade)+5, max(shade)+5, min(shade)), c(par()$xaxp[1:2], par()$xaxp[2:1], par()$xaxp[1]), col="gray", lty=0)
points(seq(1:length(unique(row))), c(TiTvRatio[which(novelty_name=="known" & filter_name=="called")])[order(TiTvRatio[which(novelty_name=="novel" & filter_name=="called")], decreasing=TRUE)], pch=1, col="blue")
axis(side=1, at=c(1:length(unique(row))), labels=unique(row)[order(TiTvRatio[which(novelty_name=="novel" & filter_name=="called")], decreasing=TRUE)], tick=FALSE, hadj=1, las=2, cex=1.25)
mtext("Samples Sorted by Novel Ti/Tv", side=1, cex=1., line = 6)
abline(a=mean(TiTvRatio[which(novelty_name=="all" & filter_name=="called")]),b=0)
if(length(shade)<1){
legend("topright", legend=c("Known Variants", "Novel Variants", "Mean Ti/Tv for all variants"), col=c("blue", "red", "black"), pch=c(1,1,NA), lty=c(0, 0, 1), xjust=0.5, cex=1.25, adj=c(-20, 0))
}else{
points(shade+.5, sort(TiTvRatio[which(novelty_name=="novel" & filter_name=="called")], decreasing=TRUE)[shade], pch=4, col="red")
legend("top", legend=c("Known Variants", "Novel Variants (normal values)", "Novel Variants (low values)","Mean Ti/Tv for all called variants"), col=c("blue", "red", "red", "black"), pch=c(1,1,4,NA), lty=c(0, 0, 0, 1), xjust=0.5, bty="n", cex=1.25, adj=c(0, 0))
}
#Plot variant summary below
par(mar=c(2, 2, 2, 2))
textplot("Lower TiTv indicates potentially higher false positive rates.\nTi/Tv ratios within the 95% confidence interval of the distribution of Ti/Tv ratios for filtered calls are indicated by gray shading.\nSomething Else will go here too", valign="top", family="sans")
detach(metricsbysamp)
}
#functionalclasses<-function(countfunctclasses){}
errorratepercycle<-function(erpc){
#define layout
layout(matrix(c(1,2,3), ncol=1, nrow=3, byrow=TRUE), heights=c(1, 3,2), respect=FALSE)
#prep for title bar
title=paste(sample_sets, ": Error Rate Per Cycle", sep="")
drop<-read.jpeg("adprdrop.jpg")
#plot title bar
par(mar=c(0,0,0,0))
plot(drop)
text(100, 40, title, family="serif", adj=c(0,0), cex=3, col=gray(.25))
#prep for erprp graph
crazies<-which(errpercycle[nrow(errpercycle),]>0.3) #this can be changed to any kind of filter for particular lanes
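#e.g. a hypothetical alternative filter flagging lanes whose mean error rate exceeds 1%:
#crazies<-which(colMeans(errpercycle, na.rm=TRUE)>0.01)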
colors<-rainbow(ncol(errpercycle), s=0.5, v=0.5)
colors[crazies]<-rainbow(length(crazies))
weights<-rep(1, ncol(errpercycle))
weights[crazies]<-2
#plot erprp graph
par(mar=c(6, 6, 3, 2))
matplot(errpercycle,
type="l",
lty="solid",
col=colors,
lwd=weights,
main="Error Rate per Read Position",
ylab="Error Rate",
xlab="Cycle/Read Position",
log="y",
cex.main=2,
cex.lab=1.5,
cex.axis=1.25,
)
if(length(crazies)>0){
legend("topleft", title="Unusual Lanes", legend=colnames(errpercycle)[crazies], lty="solid", lwd=2, col=colors[crazies], xjust=0.5)
}else{
mtext("No unusual lanes.", 1, line=6, cex=1.25)
}
#Plot variant summary below
textplot("Something related will go here", valign="top", family="sans")
}
depth_target<-function(DOC){
#define layout
layout(matrix(c(1,2), ncol=1, nrow=2, byrow=TRUE), heights=c(1, 5), respect=FALSE)
#prep for title bar
title=paste(sample_sets, ": Depth of Coverage By Target", sep="")
drop<-read.jpeg("adprdrop.jpg")
#plot title bar
par(mar=c(0,0,0,0))
plot(drop)
text(100, 40, title, family="serif", adj=c(0,0), cex=1.75, col=gray(.25))
colnames(DOC)->cols
apply(DOC[,grep("mean", cols)], 1, median)->medianofmeans
apply(DOC[,grep("mean", cols)], 1, quantile, probs=3/4)->q3s
apply(DOC[,grep("mean", cols)], 1, quantile, probs=1/4)->q1s
par(ylog=FALSE, mar=c(5, 5, 4, 2))
plot(1:length(medianofmeans),sort(medianofmeans, decreasing=TRUE), type="l",log="y",ylab="Coverage", xlab="",xaxt="n", main="Coverage Across All Targets", lwd=2, cex.main=2.5, cex.lab=1.5, cex.axis=1.25)
mtext("Targets sorted by median average coverage across samples", side=1, line=1, cex=1.5)
abline(h=10, lty="dashed", lwd=3)
lines(1:length(medianofmeans),q3s[order(medianofmeans, decreasing=TRUE)], col="dark blue")
lines(1:length(medianofmeans),q1s[order(medianofmeans, decreasing=TRUE)], col="dark blue")
legend(c(0, 20), legend="10x coverage", box.lty=0, lwd=3, lty="dashed")
legend("bottomleft", legend=c("Median average target coverage across all samples", "First and third quartiles of average target across all sample"), box.lty=0, lwd=c(1,2), col=c("black", "dark blue"), lty="solid")
#define layout
layout(matrix(c(1,2), ncol=1, nrow=2, byrow=TRUE), heights=c(1,5), respect=FALSE)
#prep for title bar
title=paste(sample_sets, ": Depth of Coverage For Poorly Covered Targets", sep="")
drop<-read.jpeg("adprdrop.jpg")
#plot title bar
par(mar=c(0,0,0,0))
plot(drop)
text(100, 40, title, family="serif", adj=c(0,0), cex=1.25, col=gray(.25))
yuck<-DOC[which(medianofmeans<10),grep("mean", cols)]
yuck<-yuck+0.1
par(mar=c(17, 4, 4, 2))
boxplot(t(yuck[order(medianofmeans[which(medianofmeans<10)], decreasing=TRUE),]),log="y", yaxt="n", xaxt="n", cex.lab=1.15, cex.axis=1.05, ylab="Average coverage across all samples", main="Targets with low coverage across samples")
axis(2, at=axTicks(2)+c(0, rep(0.1, length(axTicks(2))-1)), labels=c(0.0, axTicks(2)[2:length(axTicks(2))]), cex.axis=0.75)
mtext("Target", side=1, line=15, cex=1.5)
axis(1, at=c(1:length(which(medianofmeans<10))), labels=rownames(DOC[which(medianofmeans<10),])[order(medianofmeans[which(medianofmeans<10)])], las=2, cex.axis=1.15)
}
depth_sample<-function(DOC2){
#define layout
layout(matrix(c(1,2), ncol=1, nrow=2, byrow=TRUE), heights=c(1,5), respect=FALSE)
#prep for title bar
title=paste(sample_sets, ": Mean Depth of Coverage per Base by Sample", sep="")
drop<-read.jpeg("adprdrop.jpg")
#plot title bar
par(mar=c(0,0,0,0))
plot(drop)
text(100, 40, title, family="serif", adj=c(0,0), cex=1.25, col=gray(.25))
#prep for bysample
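#samples at or above 250x mean coverage are capped at 250 for display and drawn as red stars
#(pch=8); all others appear as black dots (pch=20)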
means<-c(sort(DOC2[which(DOC2[,2]<250),2]), rep(250, (length(which(DOC2[,2]>=250))-1)))
types<-rep(20, length(means))
cols<-rep("black", length(means))
types[which(means==250)]<-8
cols[which(means==250)]<-"red"
#plot doc by sample
par(mar=c(10, 4, 4, 2))
plot(means, ylim=c(0, 250), xaxt="n", col=cols, pch=types, xlab="", ylab="Depth of Coverage")
axis(1, at=c(1:(nrow(DOC2)-1)), labels=c(rownames(DOC2[which(DOC2[,2]<250),])[order(DOC2[which(DOC2[,2]<250),2])], rownames(DOC2[which(DOC2[,2]>=250),])[order(which(DOC2[,2]>=250))][1:(length(which(DOC2[,2]>=250))-1)]), las=2)
mtext("Samples", side=1, line=7, cex=1.25)
}
datapuller<-function(setname){
#library(yaml)
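#expected inputs, inferred from the reads below: <project>_lanes.txt and <project>_samps.txt
#(tab-delimited) plus <setname>.eval.SimpleMetricsBySample.csv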
strsplit(setname, ".", fixed=TRUE)[[1]][1]->projectname
lanes<-read.delim(paste(projectname, "_lanes.txt", sep=""), header=TRUE)
samps<-read.delim(paste(projectname, "_samps.txt", sep=""), header=TRUE)
#doct<-read.delim(paste(setname, "depth.sample_interval_summary", sep=""), header=TRUE, row.names=1)
#docs<-read.delim(paste(setname, ".depth.sample_summary", sep=""), header=TRUE, row.names=1)
#eval<-read.csv(paste(setname, "eval.CountFunctionalClasses", sep=""), skip=1)
titv<-read.csv(paste(setname, ".eval.SimpleMetricsBySample.csv", sep=""), skip=1)
#erprp<-read.delim(paste(setname, ".erprp", sep=""))
colnames(lanes)<-c('Initiative','Project','GSSR.ID','External.ID','WR.ID','Flowcell','Lane','Lane.Type','Library','AL_TOTAL_READS','AL_PF_READS','AL_PCT_PF_READS','AL_PF_NOISE_READS','AL_PF_READS_ALIGNED','AL_PCT_PF_READS_ALIGNED','AL_PF_HQ_ALIGNED_READS','AL_PF_HQ_ALIGNED_BASES','AL_PF_HQ_ALIGNED_Q20_BASES','AL_PF_HQ_MEDIAN_MISMATCHES','AL_MEAN_READ_LENGTH','AL_READS_ALIGNED_IN_PAIRS','AL_PCT_READS_ALIGNED_IN_PAIRS','AL_BAD_CYCLES','AL_PCT_STRAND_BALANCE','DUP_UNPAIRED_READS_EXAMINED','DUP_READ_PAIRS_EXAMINED','DUP_UNMAPPED_READS','DUP_UNPAIRED_READ_DUPLICATES','DUP_READ_PAIR_DUPLICATES','DUP_PERCENT_DUPLICATION','DUP_ESTIMATED_LIBRARY_SIZE','HS_BAIT_SET','HS_GENOME_SIZE','HS_LIBRARY_SIZE','HS_BAIT_TERRITORY','HS_TARGET_TERRITORY','HS_BAIT_DESIGN_EFFICIENCY','HS_TOTAL_READS','HS_PF_READS','HS_PF_UNIQUE_READS','HS_PCT_PF_READS','HS_PCT_PF_UQ_READS','HS_PCT_PF_UQ_READS_ALIGNED','HS_PF_UQ_READS_ALIGNED','HS_PF_UQ_BASES_ALIGNED','HS_ON_BAIT_BASES','HS_NEAR_BAIT_BASES','HS_OFF_BAIT_BASES','HS_ON_TARGET_BASES','HS_PCT_SELECTED_BASES','HS_PCT_OFF_BAIT','HS_ON_BAIT_VS_SELECTED','HS_MEAN_BAIT_COVERAGE','HS_MEAN_TARGET_COVERAGE','HS_FOLD_ENRICHMENT','HS_ZERO_CVG_TARGETS_PCT','HS_FOLD_80_BASE_PENALTY','HS_PCT_TARGET_BASES_2X','HS_PCT_TARGET_BASES_10X','HS_PCT_TARGET_BASES_20X','HS_PCT_TARGET_BASES_30X','HS_PENALTY_10X','HS_PENALTY_20X','HS_PENALTY_30X','SNP_TOTAL_SNPS','SNP_PCT_DBSNP','SNP_NUM_IN_DBSNP','Lane.IC.Matches','Lane.IC.PCT.Mean.RD1.Err.Rate','Lane.IC.PCT.Mean.RD2.Err.Rate','FP_PANEL_NAME','FP_PANEL_SNPS','FP_CONFIDENT_CALLS','FP_CONFIDENT_MATCHING_SNPS','FP_CONFIDENT_CALLED_PCT','FP_CONFIDENT_MATCHING_SNPS_PCT','LPCNCRD_REFERENCE','LPCNCRD_NON_REFERENCE','LPCNCRD_PCT_CONCORDANCE')
files<-list(lanes=lanes, samps=samps, titv=titv) #doct, docs, eval and erprp are read in commented-out lines above, so only these three tables exist
return(files)
}
runner<-function(basename, desc1, desc2){
datapuller(basename)->tables
attach(tables)
sample_sets<<-basename #the page functions read this global for their titles
pdf(paste(basename, ".pdf", sep=""), width=22, height=15,pointsize=24)
tearsheet(lanes, samps, titv, desc1, desc2)
fingerprints(lanes, sample_sets)
snps_called(lanes, sample_sets)
titvsamp(titv)
#functionalclasses(eval)
#errorratepercycle(erprp)
#depth_target(doct)
#depth_sample(docs)
dev.off()
detach(tables)
}
if(length(commandArgs(TRUE))>0){
do.call(runner, as.list(commandArgs(TRUE)))
}


@@ -1,325 +0,0 @@
#Before executing this file, save the SQUID files as CSV, then as tab-delimited files with only the column values as the header, and change the format of all cells to numbers. Assign the paths to these files to "samples" and "lanes" respectively.
#set up database stuff for firehose and picard interface
#set up so runnable by firehose
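#A hypothetical invocation (argument order follows the assignments below; file names are placeholders):
#Rscript thisscript.R lanes.txt samples.txt MyProject errpercycle.txt titv.csv doc_intervals.txt doc_samples.txt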
stuffmaker<-function(args){
lanes<-args[1]
samples<-args[2]
sample_sets<-args[3]
eval<-args[4]
titveval<-args[5]
DOCi<-args[6]
DOCs<-args[7]
library(gplots)
if(is.na(sample_sets)){
print("Please specify sample set for file naming and press enter.")
scan("stdin", what="character",n=1)->sample_sets
print("Thanks!")
}
pdf(file=paste(sample_sets, ".pdf", sep=""), width=22, height=15, pagecentre=TRUE, pointsize=24) #open the device only after sample_sets is known
if(is.na(lanes) == FALSE && is.na(samples)==FALSE){
#this makes a table & graphs using Picard data
if(typeof(lanes)=="character"){
read.delim(file=lanes, header= TRUE)->bylane;
colnames(bylane)<-c('Initiative','Project','GSSR.ID','External.ID','WR.ID','Flowcell','Lane','Lane.Type','Library','AL_TOTAL_READS','AL_PF_READS','AL_PCT_PF_READS','AL_PF_NOISE_READS','AL_PF_READS_ALIGNED','AL_PCT_PF_READS_ALIGNED','AL_PF_HQ_ALIGNED_READS','AL_PF_HQ_ALIGNED_BASES','AL_PF_HQ_ALIGNED_Q20_BASES','AL_PF_HQ_MEDIAN_MISMATCHES','AL_MEAN_READ_LENGTH','AL_READS_ALIGNED_IN_PAIRS','AL_PCT_READS_ALIGNED_IN_PAIRS','AL_BAD_CYCLES','AL_PCT_STRAND_BALANCE','DUP_UNPAIRED_READS_EXAMINED','DUP_READ_PAIRS_EXAMINED','DUP_UNMAPPED_READS','DUP_UNPAIRED_READ_DUPLICATES','DUP_READ_PAIR_DUPLICATES','DUP_PERCENT_DUPLICATION','DUP_ESTIMATED_LIBRARY_SIZE','HS_BAIT_SET','HS_GENOME_SIZE','HS_LIBRARY_SIZE','HS_BAIT_TERRITORY','HS_TARGET_TERRITORY','HS_BAIT_DESIGN_EFFICIENCY','HS_TOTAL_READS','HS_PF_READS','HS_PF_UNIQUE_READS','HS_PCT_PF_READS','HS_PCT_PF_UQ_READS','HS_PCT_PF_UQ_READS_ALIGNED','HS_PF_UQ_READS_ALIGNED','HS_PF_UQ_BASES_ALIGNED','HS_ON_BAIT_BASES','HS_NEAR_BAIT_BASES','HS_OFF_BAIT_BASES','HS_ON_TARGET_BASES','HS_PCT_SELECTED_BASES','HS_PCT_OFF_BAIT','HS_ON_BAIT_VS_SELECTED','HS_MEAN_BAIT_COVERAGE','HS_MEAN_TARGET_COVERAGE','HS_FOLD_ENRICHMENT','HS_ZERO_CVG_TARGETS_PCT','HS_FOLD_80_BASE_PENALTY','HS_PCT_TARGET_BASES_2X','HS_PCT_TARGET_BASES_10X','HS_PCT_TARGET_BASES_20X','HS_PCT_TARGET_BASES_30X','HS_PENALTY_10X','HS_PENALTY_20X','HS_PENALTY_30X','SNP_TOTAL_SNPS','SNP_PCT_DBSNP','SNP_NUM_IN_DBSNP','Lane.IC.Matches','Lane.IC.PCT.Mean.RD1.Err.Rate','Lane.IC.PCT.Mean.RD2.Err.Rate','FP_PANEL_NAME','FP_PANEL_SNPS','FP_CONFIDENT_CALLS','FP_CONFIDENT_MATCHING_SNPS','FP_CONFIDENT_CALLED_PCT','FP_CONFIDENT_MATCHING_SNPS_PCT','LPCNCRD_REFERENCE','LPCNCRD_NON_REFERENCE','LPCNCRD_PCT_CONCORDANCE')
}else{
lanes->bylane
colnames(bylane)<-c('Initiative','Project','GSSR.ID','External.ID','WR.ID','Flowcell','Lane','Lane.Type','Library','AL_TOTAL_READS','AL_PF_READS','AL_PCT_PF_READS','AL_PF_NOISE_READS','AL_PF_READS_ALIGNED','AL_PCT_PF_READS_ALIGNED','AL_PF_HQ_ALIGNED_READS','AL_PF_HQ_ALIGNED_BASES','AL_PF_HQ_ALIGNED_Q20_BASES','AL_PF_HQ_MEDIAN_MISMATCHES','AL_MEAN_READ_LENGTH','AL_READS_ALIGNED_IN_PAIRS','AL_PCT_READS_ALIGNED_IN_PAIRS','AL_BAD_CYCLES','AL_PCT_STRAND_BALANCE','DUP_UNPAIRED_READS_EXAMINED','DUP_READ_PAIRS_EXAMINED','DUP_UNMAPPED_READS','DUP_UNPAIRED_READ_DUPLICATES','DUP_READ_PAIR_DUPLICATES','DUP_PERCENT_DUPLICATION','DUP_ESTIMATED_LIBRARY_SIZE','HS_BAIT_SET','HS_GENOME_SIZE','HS_LIBRARY_SIZE','HS_BAIT_TERRITORY','HS_TARGET_TERRITORY','HS_BAIT_DESIGN_EFFICIENCY','HS_TOTAL_READS','HS_PF_READS','HS_PF_UNIQUE_READS','HS_PCT_PF_READS','HS_PCT_PF_UQ_READS','HS_PCT_PF_UQ_READS_ALIGNED','HS_PF_UQ_READS_ALIGNED','HS_PF_UQ_BASES_ALIGNED','HS_ON_BAIT_BASES','HS_NEAR_BAIT_BASES','HS_OFF_BAIT_BASES','HS_ON_TARGET_BASES','HS_PCT_SELECTED_BASES','HS_PCT_OFF_BAIT','HS_ON_BAIT_VS_SELECTED','HS_MEAN_BAIT_COVERAGE','HS_MEAN_TARGET_COVERAGE','HS_FOLD_ENRICHMENT','HS_ZERO_CVG_TARGETS_PCT','HS_FOLD_80_BASE_PENALTY','HS_PCT_TARGET_BASES_2X','HS_PCT_TARGET_BASES_10X','HS_PCT_TARGET_BASES_20X','HS_PCT_TARGET_BASES_30X','HS_PENALTY_10X','HS_PENALTY_20X','HS_PENALTY_30X','SNP_TOTAL_SNPS','SNP_PCT_DBSNP','SNP_NUM_IN_DBSNP','Lane.IC.Matches','Lane.IC.PCT.Mean.RD1.Err.Rate','Lane.IC.PCT.Mean.RD2.Err.Rate','FP_PANEL_NAME','FP_PANEL_SNPS','FP_CONFIDENT_CALLS','FP_CONFIDENT_MATCHING_SNPS','FP_CONFIDENT_CALLED_PCT','FP_CONFIDENT_MATCHING_SNPS_PCT','LPCNCRD_REFERENCE','LPCNCRD_NON_REFERENCE','LPCNCRD_PCT_CONCORDANCE')
}
if(typeof(samples)=="character"){
read.delim(file=samples, header= TRUE)->bysample;
}else{
samples->bysample
}
#Calc by lane metrics
sdlane<-rep("NA", 6)
meanlane<-sdlane
attach(bylane);
callable.target<-HS_TARGET_TERRITORY[1];
singlelanes<-length(which(Lane.Type=="Single"));
pairedlanes<-length(which(Lane.Type=="Paired"));
meanlane[1]<-round(mean(AL_TOTAL_READS, na.rm=TRUE)/10^6, 2);
sdlane[1]<-round(sd(AL_TOTAL_READS, na.rm=TRUE)/10^6, 2);
meanlane[2]<-round(mean(HS_ON_TARGET_BASES, na.rm=TRUE)/10^6, 2);
sdlane[2]<-round(sd(HS_ON_TARGET_BASES, na.rm=TRUE)/10^6, 2);
meanlane[3]<-round(mean(HS_MEAN_TARGET_COVERAGE, na.rm=TRUE));
sdlane[3]<-round(sd(HS_MEAN_TARGET_COVERAGE, na.rm=TRUE));
meanlane[4]<-round(mean(HS_PCT_TARGET_BASES_10X, na.rm=TRUE));
meanlane[5]<-round(mean(HS_PCT_TARGET_BASES_20X, na.rm=TRUE));
meanlane[6]<-round(mean(HS_PCT_TARGET_BASES_30X, na.rm=TRUE));
sdlane[4]<-round(sd(HS_PCT_TARGET_BASES_10X, na.rm=TRUE));
sdlane[5]<-round(sd(HS_PCT_TARGET_BASES_20X, na.rm=TRUE));
sdlane[6]<-round(sd(HS_PCT_TARGET_BASES_30X, na.rm=TRUE))
names<-paste(Flowcell, "-", Lane, sep="")
#makes a plot of the number of SNPS called per lane
ticks<-c(match(unique(Flowcell), sort(Flowcell)) )
ys=rep(c(min(SNP_TOTAL_SNPS, na.rm=TRUE)*0.96, max(SNP_TOTAL_SNPS, na.rm=TRUE)*1.04, max(SNP_TOTAL_SNPS, na.rm=TRUE)*1.04, min(SNP_TOTAL_SNPS, na.rm=TRUE)*0.96, min(SNP_TOTAL_SNPS, na.rm=TRUE)*0.96), ceiling(length(ticks)/2))
defaults<-par(no.readonly = TRUE)
layout(matrix(c(1,1 , 2), 1, 3, byrow=FALSE), respect=TRUE)
par(mar=c(10, 6, 4, 8))
plot(1:length(SNP_TOTAL_SNPS), SNP_TOTAL_SNPS[order(Flowcell)],xlab="", ylab="SNPs Called in Lane", ylim = c(min(SNP_TOTAL_SNPS, na.rm=TRUE), max(SNP_TOTAL_SNPS, na.rm=TRUE)), xaxt="n", pch=NA)
title(main=paste(sample_sets, ": SNPs Called in Each Lane sorted by Flowcell", sep=""), line=3, cex=1.25)
axis(side=3, at=c(1:length(Flowcell)), labels=Lane[order(Flowcell)], cex.axis=0.5, padj=1,tick=FALSE)
axis(side=1, at=c(ticks), labels=sort(unique(Flowcell)), tick=FALSE, las=2)
mtext("Lane",side=3, cex=.75, line=1.5)
mtext("Flowcell",cex=.75,side=1, line=8)
shader<-ticks[c(rep(c(1,1,2,2,1), ceiling(length(ticks)/2))+sort(rep(seq(0, length(ticks),by=2), 5)))]-0.5
if((length(ticks)%%2 > 0)){
shader[(length(shader)-2):(length(shader)-1)]<-length(Flowcell)+0.5
}
shader<-na.omit(shader)
polygon(shader, ys, border="black", lty=0, col="gray")
cols<-rep("blue", length(SNP_TOTAL_SNPS))
cols[which(SNP_TOTAL_SNPS %in% boxplot.stats(SNP_TOTAL_SNPS)$out)]<-"red"
points(1:length(SNP_TOTAL_SNPS), SNP_TOTAL_SNPS, col=cols, pch=19)
if(length(boxplot.stats(SNP_TOTAL_SNPS)$out)>0){
legend("topright", legend=c("Normal SNP Call Counts", "Outlier SNP Call Counts"), pch=19, col=c("Blue", "red"), bg="White")
}
boxplot(SNP_TOTAL_SNPS, main="SNPs Called in Lane", ylab="SNPs Called" )
if(length(boxplot.stats(SNP_TOTAL_SNPS)$out)==0){
mtext("No outliers", side=1, line=4)
}else{
mtext(paste("Outlier SNP call counts in ", length(boxplot.stats(SNP_TOTAL_SNPS)$out), "lanes"), side=1, line=4)
}
#makes a plot of fingerprint calls and labels them good or bad
par(defaults)
badsnps<-union(which(FP_CONFIDENT_MATCHING_SNPS<15), which(FP_CONFIDENT_CALLS<15)) #lanes with too few confident or matching calls at fingerprint sites
colors<-c(rep("Blue", length(FP_CONFIDENT_CALLS)))
colors[badsnps]<-"Red"
ticks<-c(match(unique(Flowcell), Flowcell) )
ys=rep(c(0, 24*1.04, 24*1.04, 0, 0), ceiling(length(ticks)/2))
#pdf(file=paste(sample_sets, "_Fingerprints.pdf", sep=""), width=.2*length(FP_CONFIDENT_CALLS), height=.1*length(FP_CONFIDENT_CALLS))
par(mar=c(10, 6, 8, 3))
plot(1:length(FP_CONFIDENT_MATCHING_SNPS), FP_CONFIDENT_MATCHING_SNPS, pch=NA, ylim=c(0,24), ylab="Fingerprint calls", xlab="", xaxt="n", col=colors, main="Fingerprint Calling and Matching Sorted by lane")
axis(side=1, at=(ticks+1), labels=unique(Flowcell), tick=FALSE, hadj=1, las=2)
shader<-ticks[c(rep(c(1,1,2,2,1), ceiling(length(ticks)/2))+sort(rep(seq(0, length(ticks),by=2), 5)))]-0.5
shader<-na.omit(shader)
if((length(ticks)%%2 > 0)){
shader[(length(shader)-2):(length(shader)-1)]<-length(Flowcell)+0.5
}
polygon(shader, ys, border="black", lty=0, col="gray")
points(1:length(FP_CONFIDENT_MATCHING_SNPS), FP_CONFIDENT_MATCHING_SNPS, pch=4, col=colors)
points(1:length(FP_CONFIDENT_MATCHING_SNPS), FP_CONFIDENT_CALLS, pch=3, col=colors)
if(length(badsnps)>0){
legend("bottomright", legend=c("Confident calls at fingerprint sites by lane", "Confident matching calls at fingerprint sites by lane", "Confident calls in bad lanes", "Confident matching calls in bad lanes", "All Confident calls match fingerprint sites"), pch=c(4,3,4,3,8), col=c("Blue", "Blue", "Red", "Red", "Black" ), bg="White")
mtext("Some problematic fingerprint sites", side=3)
}else{
legend("bottomright", legend=c("Confident calls at fingerprint sites by lane", "Confident matching calls at fingerprint sites by lane", "All Confident calls match fingerprint sites"), pch=c(4, 3, 8), col=c("Blue", "Blue", "Black"), bg="White")
}
detach(bylane)
}else{
print("Lane and Sample metrics file paths not provided")
}
meansamp<-rep("NA", 6)
sdsamp<-meansamp
#Calc by sample metrics
attach(bysample);
mean.lanes.samp<-signif(mean(X..Lanes.included.in.aggregation, na.rm = TRUE));
sd.lanes.samp<-signif(sd(X..Lanes.included.in.aggregation, na.rm=TRUE));
mean.mrl.samp<-signif(mean(Mean.Read.Length, na.rm=TRUE));
sd.mrl.samp<-signif(sd(Mean.Read.Length, na.rm=TRUE));
meansamp[1]<-round(mean(Total.Reads, na.rm=TRUE)/10^6, 2);
sdsamp[1]<-round(sd(Total.Reads, na.rm=TRUE)/10^6, 2);
meansamp[2]<-round(mean(On.Target.Bases..HS., na.rm=TRUE)/10^6, 2);
sdsamp[2]<-round(sd(On.Target.Bases..HS., na.rm=TRUE)/10^6, 2);
meansamp[3]<-round(mean(Mean.Target.Coverage..HS., na.rm=TRUE));
sdsamp[3]<-round(sd(Mean.Target.Coverage..HS., na.rm=TRUE));
meansamp[4]<-round(mean(PCT.Target.Bases.10x..HS., na.rm=TRUE));
meansamp[5]<-round(mean(PCT.Target.Bases.20x..HS., na.rm=TRUE));
meansamp[6]<-round(mean(PCT.Target.Bases.30x..HS., na.rm=TRUE));
sdsamp[4]<-round(sd(PCT.Target.Bases.10x..HS., na.rm=TRUE));
sdsamp[5]<-round(sd(PCT.Target.Bases.20x..HS., na.rm=TRUE));
sdsamp[6]<-round(sd(PCT.Target.Bases.30x..HS., na.rm=TRUE));
detach(bysample);
#print all of this stuff out in R.
summary<-c(paste(callable.target, "bases"), paste(mean.lanes.samp, "+/-", sd.lanes.samp), paste(singlelanes, "single lanes,", pairedlanes, "paired lanes"), paste(mean.mrl.samp, "+/-", sd.mrl.samp))
samps<-paste(meansamp, c("M", "M", "x", "%", "%", "%"), " +/- ", sdsamp, c("M", "M", "x", "%", "%", "%"), sep="")
lanes<-paste(meanlane, c("M", "M", "x", "%", "%", "%"), " +/- ", sdlane, c("M", "M", "x", "%", "%", "%"), sep="")
layout(matrix(c(1,2), ncol=1), heights=c(2,3))
table1<-cbind(summary)
rownames(table1)<-c("Callable Target", "Used Lanes per Sample", "Parities", "Read Length")
textplot(table1, col.rownames="blue", show.colnames=FALSE, cex=1.75)
title(main="Sequencing Summary", family="serif", cex.main=2)
table2<-cbind(lanes, samps)
colnames(table2)<-c("per lane", "per sample")
rownames(table2)<-c("Reads", "Used bases", "Average target coverage", "% loci covered to 10x", "% loci covered to 20x","% loci covered to 10x")
textplot(table2, rmar=1, col.rownames="blue", cex=1.25)
title(main="Bases Summary", family="serif", cex.main=1.75)
par(defaults)
#Makes Error Rate percycle graph
if(is.na(eval)==FALSE){
if(typeof(eval)=="character"){
errtable<-read.delim(eval, header=TRUE)
errpercycle<-errtable[2:ncol(errtable)]
}else{
eval->errpercycle
}
#pdf(paste(sample_sets, "_errorrate_per_cycle.pdf", sep=""), width=6, height=5)
crazies<-which(errpercycle[75,]>0.3) #this can be changed to any kind of filter for particular lanes
colors<-rainbow(ncol(errpercycle), s=0.5, v=0.5)
colors[crazies]<-rainbow(length(crazies))
weights<-rep(1, ncol(errpercycle))
weights[crazies]<-2
matplot(errpercycle, type="l", lty="solid", col=colors, lwd=weights, main="Error Rate per Cycle", ylab="Error Rate", xlab="Cycle", ylim=c(0, 0.7))
if(length(crazies)>0){
legend("topleft", title="Unusual Lanes", legend=colnames(errpercycle)[crazies], lty="solid", lwd=2, col=colors[crazies], xjust=0.5)
}else{
legend("topleft", legend="No unusual lanes.", bty="n")
}
}else{
print("Error Rate Per Cycle file paths not provided")
}
#Makes TI/TV known v novel graph
if(is.na(titveval)==FALSE){
##TODO: need to make sure this is nice and prettified.
titv<-read.csv(file=titveval, skip=1)
attach(titv)
#pdf(file=paste(sample_sets, "_TI-TV.pdf", sep=""), width=0.2*length(unique(sample)), height=0.175*length(unique(sample)))
par(mar=c(11, 4, 4, 2))
plot(seq(1:length(unique(sample))), Ti.Tv[which(novelty_name=="novel" & filter_name=="called")], xaxt="n", ylim=c(1, 4), main="Ti/Tv for Novel and Known SNP calls", ylab="Ti/Tv", xlab="", col="red", pch=1)
points(seq(1:length(unique(sample))), Ti.Tv[which(novelty_name=="known" & filter_name=="called")], pch=1, col="blue")
axis(side=1, at=(1:length(unique(sample))), labels=unique(sample), tick=FALSE, hadj=1, las=2)
abline(a=mean(Ti.Tv[which(novelty_name=="all" & filter_name=="called")]),b=0)
legend("bottomright", legend=c("Known Variants", "Novel Variants", "Mean Ti/Tv for all variants"), col=c("blue", "red", "black"), pch=c(1,1,NA_integer_), lty=c(0, 0, 1), xjust=0.5)
mtext(line=9,"Lower Ti/Tv ratios indicate potentially increased false positive SNP rates.", side=1)
}else{
print("TiTV filepath not provided")
}
#Make DOC graph
if(is.na(DOCi)==FALSE){
#pdf(paste(sample_set, "_DOCi.pdf", sep=""), width=6, height=5)
if(typeof(DOCi)=="character"){
as.data.frame(read.delim(DOCi))->DOC
}else{
DOCi->DOCdata
}
colnames(DOC)->cols
apply(DOC[,grep("mean", cols)], 1, median)->medianofmeans
apply(DOC[,grep("mean", cols)], 1, quantile, probs=3/4)->q3s
apply(DOC[,grep("mean", cols)], 1, quantile, probs=1/4)->q1s
par(ylog=FALSE, mar=c(5, 4, 4, 2))
plot(1:length(medianofmeans),sort(medianofmeans, decreasing=TRUE), type="l", lwd=1,log="y",ylab="Coverage", xlab="Targets sorted by median average coverage across samples",xaxt="n", main="Coverage Across All Targets")
abline(h=10, lty="dotted")
lines(1:length(medianofmeans),q3s[order(medianofmeans, decreasing=TRUE)])
lines(1:length(medianofmeans),q1s[order(medianofmeans, decreasing=TRUE)])
legend("bottomleft", "10x coverage", box.lty=0, lty="dotted")
#pdf(paste(sample_set, "_DOCiy.pdf", sep=""), width=6, height=5)
yuck<-DOC[which(medianofmeans<10),grep("mean", cols)]
yuck<-yuck+0.1
par(mar=c(16, 4, 4, 2))
boxplot(t(yuck[order(medianofmeans[which(medianofmeans<10)], decreasing=TRUE),]),log="y", yaxt="n", xaxt="n", ylab="Average coverage across all samples", main="Targets with low coverage across samples")
axis(2, at=axTicks(2)+c(0, rep(0.1, length(axTicks(2))-1)), labels=c(0.0, axTicks(2)[2:length(axTicks(2))]), cex.axis=0.75)
mtext("Target", side=1, line=14)
axis(1, at=c(1:length(which(medianofmeans<10))), labels=DOC[which(medianofmeans<10),1][order(medianofmeans[which(medianofmeans<10)])], las=2, cex.axis=0.75)
}else{
print("Depth of Coverage--intervals filepath not provided")
}
if(is.na(DOCs)==FALSE){
#pdf(paste(sample_set, "_DOCs.pdf", sep=""), width=6, height=5)
if(typeof(DOCs)=="character"){
as.data.frame(read.delim(DOCs))->DOC2
}else{
DOCs->DOCdata
}
par(mar=c(10, 4, 4, 2))
boxplot(t(DOC2[,2:ncol(DOC2)]+0.1), log="y", main="Depth of Coverage by Sample", xaxt="n", yaxt="n", ylab="Coverage")
axis(1, at=c(1:nrow(DOC2)), labels=DOC2[,1], las=2)
axis(2, at=axTicks(2)+c(0, rep(0.1, length(axTicks(2))-1)), labels=floor(c(0.0, axTicks(2)[2:length(axTicks(2))])))
mtext("Samples", side=1, line=9)
}else{
print("Depth of Coverage--samples filepath not provided")
}
dev.off()
}
if(length(commandArgs(TRUE))>0){
stuffmaker(commandArgs(TRUE))
}


@@ -1,366 +0,0 @@
##put titles/rownames left
##make titles blue
##decrease margins below titles
## put row names in black
##put background rows in.
##change layouts so that it looks better
##get sample numbers in correctly
.libPaths('/humgen/gsa-firehose2/pipeline/repositories/StingProduction/R/')
suppressMessages(library(gplots));
suppressMessages(library(ReadImages));
suppressMessages(library(gsalib));
suppressMessages(library(ROracle));
cmdargs = gsa.getargs(
list(
yaml = list(value=NA, doc="pipeline YAML file"),
bamlist = list(value=NA, doc="list of BAM files"),
evalroot = list(value=NA, doc="VariantEval file"),
tearout = list(value=NA, doc="Output path for tearsheet PDF"),
plotout = list(value=NA, doc="Output path for PDF")
),
doc="Creates a tearsheet"
);
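#gsa.getargs presumably reads "-name value" pairs from the command line, so an invocation
#might look like: Rscript <script> -yaml run.yaml -bamlist bams.list -evalroot project.eval -tearout tear.pdf -plotout plots.pdf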
bamlist = scan(cmdargs$bamlist, "character");
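#pull the SQUID project IDs ("C" followed by three characters) out of the pipeline YAML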
squids <- system(paste("grep SQUID ", cmdargs$yaml, ' |grep "C..." -o', sep=""), intern=TRUE)
indexed = c();
nonindexed = c();
for (bam in bamlist) {
bamheader = system(paste("samtools view -H", bam), intern=TRUE);
if (length(bamheader) > 0) {
rgs = bamheader[grep("^@RG", bamheader)];
for (rg in rgs) {
id = grep("PU:", unlist(strsplit(rg, "\t")), value=TRUE);
id = sub("PU:", "", id);
id = gsub("XX......", "XX", id)
if (length(unlist(strsplit(id, "\\.")))==3){
indexed<-c(indexed, id)
}
else{
if(length(unlist(strsplit(id, "\\.")))==2){
nonindexed<-c(nonindexed, id)
}
else{
print(paste(id, "is a strange PU and will result in odd searches"))
}
}
}
} else {
print(sprintf("Could not load '%s'\n", bam));
}
}
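#fetch per-lane Picard metrics and per-sample aggregation status from the SEQPROD reporting database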
drv = dbDriver("Oracle");
con = dbConnect(drv, "REPORTING/REPORTING@ora01:1521/SEQPROD");
rs = dbSendQuery(con, statement = paste("SELECT * FROM ILLUMINA_PICARD_METRICS"));
d = fetch(rs, n=-1);
dbHasCompleted(rs);
dbClearResult(rs);
rs2 = dbSendQuery(con, statement = paste("SELECT * FROM ILLUMINA_SAMPLE_STATUS_AGG"));
d2 = fetch(rs2, n=-1);
dbHasCompleted(rs2);
dbClearResult(rs2);
dbDisconnect(con);
oraCloseDriver(drv);
squid_fclanes = sprintf("%s.%s", d$"Flowcell", d$"Lane");
squid_fclanes_indexed = sprintf("%s.%s.%s", d$"Flowcell", d$"Lane", d$"Barcode");
dproj = d[which(squid_fclanes %in% nonindexed),];
dproj = rbind(dproj, d[which(squid_fclanes_indexed %in% indexed),])
dproj = dproj[which(dproj$"Project" %in% unique(squids)),]
d2proj = d2[which(d2$"Project" %in% unique(dproj$Project) & d2$"Sample" %in% dproj$"External ID"),];
tearsheet<-function(){
tearsheetdrop <- "~Documents/Sting/R/gsalib/data/tearsheetdrop.jpg" #put the path to the tearsheet backdrop here
pdf(file= cmdargs$tearout, width=22, height=17, pagecentre=TRUE, pointsize=24)
#define layout
postable<-matrix(c(1, 1, 1, 1, 1, 1, rep(c(2, 2, 2, 4, 4, 4), 5), rep(c(3, 3, 3, 4, 4, 4), 3), rep(c(3,3,3,5,5,5), 5), 6,6,6,7,7,7), nrow=15, ncol=6, byrow=TRUE)
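#panel 1 spans the full-width title bar; panels 2-5 take the four summary tables in plotting order
#and panels 6-7 are reserved for the plots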
layout(postable, heights=c(1, rep(.18, 13), 2), respect=FALSE)
#prep for title bar
drop<-read.jpeg(tearsheetdrop)
#plot title bar
par(mar=c(0,0,0,0))
plot(drop)
text(155, 50, "testing", family="serif", adj=c(0,0), cex=3, col=gray(.25))
# Project summary
projects = paste(unique(dproj$"Project"), collapse=", ");
used_samples = length(bamlist);
unused_samples = 0;
sequencing_protocol = "Hybrid selection"; #can this be extracted?
bait_design = paste(dimnames(table(dproj$"Bait Set"))[[1]][order(table(dproj$"Bait Set"), decreasing=TRUE)], collapse=", ");
if(nchar(bait_design)>50){
bait_design<-strsplit(bait_design, ", ")[[1]][1]
}
if(nchar(bait_design)>50){
bait_design<-strsplit(bait_design, ".Homo")[[1]][1]
}
callable_target = paste(na.omit(unique(dproj$"Target Territory")), collapse=", ");
table1<-rbind(paste(used_samples," used samples/", unused_samples + used_samples," total samples", sep=""), sequencing_protocol, bait_design, callable_target)
rownames(table1)<-c("Samples","Sequencing Protocol", "Bait Design","Callable Target")
par(mar=c(0,0,1,0))
textplot(table1, col.rownames="darkblue", show.colnames=FALSE, cex=1.25, valign="top")
title(main=sprintf("Project Summary (%s)\n", projects), family="sans", cex.main=1.25, line=-1)
# Bases summary
reads_per_lane_mean = format(mean(dproj$"PF Reads (HS)", na.rm=TRUE), 8, 3,1, scientific=TRUE);
reads_per_lane_sd = format(sd(dproj$"PF Reads (HS)", na.rm=TRUE), 8, 3,1, scientific=TRUE);
lanes<-sprintf("%s +/- %s\n", reads_per_lane_mean, reads_per_lane_sd)
used_bases_per_lane_mean = format(mean(dproj$"PF HQ Aligned Q20 Bases", na.rm=TRUE),8, 3,1, scientific=TRUE);
used_bases_per_lane_sd = format(sd(dproj$"PF HQ Aligned Q20 Bases", na.rm=TRUE), 8, 3,1, scientific=TRUE);
lanes<-c(lanes, sprintf("%s +/- %s\n", used_bases_per_lane_mean, used_bases_per_lane_sd));
target_coverage_mean = mean(na.omit(dproj$"Mean Target Coverage"));
target_coverage_sd = sd(na.omit(dproj$"Mean Target Coverage"));
lanes<-c(lanes, sprintf("%0.2fx +/- %0.2fx\n", target_coverage_mean, target_coverage_sd));
pct_loci_gt_10x_mean = mean(na.omit(dproj$"Target Bases 10x %"));
pct_loci_gt_10x_sd = sd(na.omit(dproj$"Target Bases 10x %"));
lanes<-c(lanes, sprintf("%0.2f%% +/- %0.2f%%\n", pct_loci_gt_10x_mean, pct_loci_gt_10x_sd));
pct_loci_gt_20x_mean = mean(na.omit(dproj$"Target Bases 20x %"));
pct_loci_gt_20x_sd = sd(na.omit(dproj$"Target Bases 20x %"));
lanes<-c(lanes,sprintf("%0.2f%% +/- %0.2f%%\n", pct_loci_gt_20x_mean, pct_loci_gt_20x_sd));
pct_loci_gt_30x_mean = mean(na.omit(dproj$"Target Bases 30x %"));
pct_loci_gt_30x_sd = sd(na.omit(dproj$"Target Bases 30x %"));
lanes<-c(lanes,sprintf("%0.2f%% +/- %0.2f%%\n", pct_loci_gt_30x_mean, pct_loci_gt_30x_sd));
reads_per_sample_mean = format(mean(d2proj$"PF Reads", na.rm=TRUE), 8, 3,1, scientific=TRUE);
reads_per_sample_sd = format(sd(d2proj$"PF Reads",na.rm=TRUE), 8, 3,1, scientific=TRUE);
samps<-sprintf("%s +/- %s\n", reads_per_sample_mean, reads_per_sample_sd);
used_bases_per_sample_mean = format(mean(d2proj$"PF HQ Aligned Q20 Bases", na.rm=TRUE),8, 3,1, scientific=TRUE);
used_bases_per_sample_sd = format(sd(d2proj$"PF HQ Aligned Q20 Bases", na.rm=TRUE), 8, 3,1, scientific=TRUE);
samps<-c(samps, sprintf("%s +/- %s\n", used_bases_per_sample_mean, used_bases_per_sample_sd));
target_coverage_mean = mean(na.omit(d2proj$"Mean Target Coverage"));
target_coverage_sd = sd(na.omit(d2proj$"Mean Target Coverage"));
samps<-c(samps, sprintf("%0.2fx +/- %0.2fx\n", target_coverage_mean, target_coverage_sd));
pct_loci_gt_10x_mean = mean(na.omit(d2proj$"Target Bases 10x %"));
pct_loci_gt_10x_sd = sd(na.omit(d2proj$"Target Bases 10x %"));
samps<-c(samps, sprintf("%0.2f%% +/- %0.2f%%\n", pct_loci_gt_10x_mean, pct_loci_gt_10x_sd));
pct_loci_gt_20x_mean = mean(na.omit(d2proj$"Target Bases 20x %"));
pct_loci_gt_20x_sd = sd(na.omit(d2proj$"Target Bases 20x %"));
samps<-c(samps, sprintf("%0.2f%% +/- %0.2f%%\n", pct_loci_gt_20x_mean, pct_loci_gt_20x_sd));
pct_loci_gt_30x_mean = mean(na.omit(d2proj$"Target Bases 30x %"));
pct_loci_gt_30x_sd = sd(na.omit(d2proj$"Target Bases 30x %"));
samps<-c(samps, sprintf("%0.2f%% +/- %0.2f%%\n", pct_loci_gt_30x_mean, pct_loci_gt_30x_sd));
table2<-cbind(lanes, samps)
colnames(table2)<-c("Per lane", "Per sample")
rownames(table2)<-c("Reads", "Used bases", "Average target coverage", "% loci covered to 10x", "% loci covered to 20x","% loci covered to 30x")
par(mar=c(0,0,1,0))
textplot(table2, rmar=1, col.rownames="dark blue", cex=1.25, valign="top")
title(main="Bases Summary", family="sans", cex.main=1.25, line=0)
# Sequencing summary
instrument <- c();
if(length(grep("AAXX", dproj$Flowcell))>0){
instrument <- c(instrument, "Illumina GA2")
}
if(length(grep("ABXX", dproj$Flowcell))>0){
instrument <- c(instrument, "Illumina HiSeq")
}
if(length(instrument)>1){
instrument<-paste(instrument[1], instrument[2], sep=" and ")
}
used_lanes = nrow(dproj);
unused_lanes_by_sequencing = 0; #can we get this?
unused_lanes_by_analysis = 0;
lanes_per_sample_mean = mean(table(dproj$"External ID"), na.rm=TRUE);
lanes_per_sample_sd = sd(table(dproj$"External ID"), na.rm=TRUE);
lanes_per_sample_median = median(table(dproj$"External ID"));
lanes_paired = nrow(subset(dproj, dproj$"Lane Type" == "Paired"));
lanes_widowed = nrow(subset(dproj, dproj$"Lane Type" == "Widowed"));
lanes_single = nrow(subset(dproj, dproj$"Lane Type" == "Single"));
read_length_mean = mean(dproj$"Mean Read Length (P)");
read_length_sd = sd(dproj$"Mean Read Length (P)");
read_length_median = median(dproj$"Mean Read Length (P)");
date = dproj$"Run Date";
# date = sub("JAN", "01", date);
# date = sub("FEB", "02", date);
# date = sub("MAR", "03", date);
# date = sub("APR", "04", date);
# date = sub("MAY", "05", date);
# date = sub("JUN", "06", date);
# date = sub("JUL", "07", date);
# date = sub("AUG", "08", date);
# date = sub("SEP", "09", date);
# date = sub("OCT", "10", date);
# date = sub("NOV", "11", date);
# date = sub("DEC", "12", date);
date = date[order(as.Date(date, format="%d-%m-%Y"))];
start_date = date[1];
end_date = date[length(date)];
table3<-rbind(paste(instrument), used_lanes, sprintf("%s rejected by sequencing, %s by analysis\n", unused_lanes_by_sequencing, unused_lanes_by_analysis), sprintf("%0.1f +/- %0.1f lanes (median=%0.1f)\n", lanes_per_sample_mean, lanes_per_sample_sd, lanes_per_sample_median), sprintf("%s paired, %s widowed, %s single\n", lanes_paired, lanes_widowed, lanes_single), sprintf("%0.1f +/- %0.1f bases (median=%0.1f)\n", read_length_mean, read_length_sd, read_length_median), sprintf("\tSequencing dates: %s to %s\n", start_date, end_date))
rownames(table3)<-c("Sequencer", "Used lanes", "Unused lanes","Used lanes/sample", "Lane parities", "Read lengths", "Sequencing dates")
par(mar=c(0,0,1,0))
textplot(table3, rmar=1, col.rownames="dark blue", show.colnames=FALSE, cex=1.25, valign="top")
title(main="Sequencing Summary", family="sans", cex.main=1.25, line=0)
eval = gsa.read.gatkreport(cmdargs$evalroot)
# Variant summary
##TODO: Fix this csv reader
eval.counts = eval$CountVariants
eval.counts.all = subset(eval.counts, Novelty == "all")$nVariantLoci;
eval.counts.known = subset(eval.counts, Novelty == "known")$nVariantLoci;
eval.counts.novel = subset(eval.counts, Novelty == "novel")$nVariantLoci;
eval.titv = eval$TiTvVariantEvaluator
eval.titv.all = subset(eval.titv, Novelty == "all")$tiTvRatio;
eval.titv.known = subset(eval.titv, Novelty == "known")$tiTvRatio;
eval.titv.novel = subset(eval.titv, Novelty == "novel")$tiTvRatio;
table4 = matrix(c(eval.counts.all, eval.counts.known, eval.counts.novel, eval.titv.all, eval.titv.known, eval.titv.novel, "3.0 - 3.2", "3.2 - 3.4", "2.7 - 3.0"), nrow=3);
rownames(table4) = c("All", "Known", "Novel");
colnames(table4) = c("Found", "Ti/Tv ratio", "Expected Ti/Tv ratio");
par(mar=c(0,0,0,0))
textplot(table4, rmar=1, col.rownames="dark blue", cex=1.25, valign="top")
title(main="Variant Summary", family="sans", cex.main=1.25, line=-2)
#
# #plots
# #fix this reader
# eval.bysample = read.csv(paste(cmdargs$evalroot, ".SimpleMetricsBySample.csv", sep=""), header=TRUE, comment.char="#");
# eval.bysample.called = subset(eval.bysample, evaluation_name == "eval" & comparison_name == "dbsnp" & jexl_expression == "none" & filter_name == "called");
# eval.bysample.all = subset(eval.bysample.called, novelty_name == "all");
# eval.bysample.known = subset(eval.bysample.called, novelty_name == "known");
# eval.bysample.novel = subset(eval.bysample.called, novelty_name == "novel");
eval.ac = eval$SimpleMetricsByAC.metrics
eval.ac.all = subset(eval.ac, Novelty == "all");
eval.ac.known = subset(eval.ac, Novelty == "known");
eval.ac.novel = subset(eval.ac, Novelty == "novel");
#
# eval.func = read.csv(paste(cmdargs$evalroot, ".Functional_Class_Counts_by_Sample.csv", sep=""), header=TRUE, comment.char="#");
# eval.func.called = subset(eval.func, evaluation_name == "eval" & comparison_name == "dbsnp" & jexl_expression == "none" & filter_name == "called");
# eval.func.all = subset(eval.func.called, novelty_name == "all");
# eval.func.known = subset(eval.func.called, novelty_name == "known");
# eval.func.novel = subset(eval.func.called, novelty_name == "novel");
#boxplot(eval.bysample.all$CountVariants, eval.bysample.known$CountVariants, eval.bysample.novel$CountVariants, names=c("All", "Known", "Novel"), ylab="Variants per sample", main="", cex=1.3, cex.lab=1.3, cex.axis=1.3);
# par(mar=c(5, 4, 4, 2) + 0.1)
# ind = order(eval.bysample.all$CountVariants);
# plot(c(1:length(eval.bysample.all$CountVariants)), eval.bysample.all$CountVariants[ind], col="black", cex=1.1, cex.lab=1.1, cex.axis=1.1, main="Variants per Sample", xlab="Sample", ylab="Number of variants", bty="n", ylim=c(0, max(eval.bysample.all$CountVariants)));
# points(c(1:length(eval.bysample.known$CountVariants)), eval.bysample.known$CountVariants[ind], col="blue", cex=1.3);
# points(c(1:length(eval.bysample.novel$CountVariants)), eval.bysample.novel$CountVariants[ind], col="red", cex=1.3);
# legend("right", max(eval.bysample.all$CountVariants)/2, c("All", "Known", "Novel"), col=c("black", "blue", "red"), pt.cex=1.3, pch=21);
par(mar=c(5, 4, 4, 2) + 0.1)
plot(eval.ac.all$AC, eval.ac.all$n, col="black", type="l", lwd=2, cex=1.1, cex.lab=1.1, cex.axis=1.1, xlab="Allele count", ylab="Number of variants", main="Variants by Allele Count", log="xy", bty="n");
points(eval.ac.known$AC, eval.ac.known$n, col="blue", type="l", lwd=2);
points(eval.ac.novel$AC, eval.ac.novel$n, col="red", type="l", lwd=2);
legend("topright", c("All", "Known", "Novel"), col=c("black", "blue", "red"), lwd=2);
#plot(eval.func.all$Synonymous[ind] / (eval.func.all$Missense + eval.func.all$Nonsense)[ind], ylim=c(0, 2), cex=1.3, cex.lab=1.3, cex.axis=1.3, bty="n", xlab="Sample", ylab="Ratio of synonymous to non-synonymous variants", col="black");
#points(eval.func.known$Synonymous[ind] / (eval.func.known$Missense + eval.func.known$Nonsense)[ind], cex=1.3, col="blue");
#points(eval.func.novel$Synonymous[ind] / (eval.func.novel$Missense + eval.func.novel$Nonsense)[ind], cex=1.3, col="red");
#legend("topright", c("All", "Known", "Novel"), col=c("black", "blue", "red"), pt.cex=1.3, pch=21);
dev.off()
}
tearsheet()
# Plots
plots<-function(){
# eval.bysample = read.csv(paste(cmdargs$evalroot, ".SimpleMetricsBySample.csv", sep=""), header=TRUE, comment.char="#");
# eval.bysample.called = subset(eval.bysample, evaluation_name == "eval" & comparison_name == "dbsnp" & jexl_expression == "none" & filter_name == "called");
# eval.bysample.all = subset(eval.bysample.called, novelty_name == "all");
# eval.bysample.known = subset(eval.bysample.called, novelty_name == "known");
# eval.bysample.novel = subset(eval.bysample.called, novelty_name == "novel");
eval.ac = eval$SimpleMetricsByAC.metrics
eval.ac.all = subset(eval.ac, Novelty == "all");
eval.ac.known = subset(eval.ac, Novelty == "known");
eval.ac.novel = subset(eval.ac, Novelty == "novel");
#
# eval.func = read.csv(paste(cmdargs$evalroot, ".Functional_Class_Counts_by_Sample.csv", sep=""), header=TRUE, comment.char="#");
# eval.func.called = subset(eval.func, evaluation_name == "eval" & comparison_name == "dbsnp" & jexl_expression == "none" & filter_name == "called");
# eval.func.all = subset(eval.func.called, novelty_name == "all");
# eval.func.known = subset(eval.func.called, novelty_name == "known");
# eval.func.novel = subset(eval.func.called, novelty_name == "novel");
pdf(file= cmdargs$plotout, width=22, height=17, pagecentre=TRUE, pointsize=24)
#
# boxplot(eval.bysample.all$CountVariants, eval.bysample.known$CountVariants, eval.bysample.novel$CountVariants, names=c("All", "Known", "Novel"), ylab="Variants per sample", main="", cex=1.3, cex.lab=1.3, cex.axis=1.3);
#
# ind = order(eval.bysample.all$CountVariants);
# plot(c(1:length(eval.bysample.all$CountVariants)), eval.bysample.all$CountVariants[ind], col="black", cex=1.3, cex.lab=1.3, cex.axis=1.3, xlab="Sample", ylab="Number of variants", bty="n", ylim=c(0, max(eval.bysample.all$CountVariants)));
# points(c(1:length(eval.bysample.known$CountVariants)), eval.bysample.known$CountVariants[ind], col="blue", cex=1.3);
# points(c(1:length(eval.bysample.novel$CountVariants)), eval.bysample.novel$CountVariants[ind], col="red", cex=1.3);
# legend(0, max(eval.bysample.all$CountVariants)/2, c("All", "Known", "Novel"), col=c("black", "blue", "red"), pt.cex=1.3, pch=21);
plot(eval.ac.all$AC, eval.ac.all$n, col="black", type="l", lwd=2, cex=1.3, cex.lab=1.3, cex.axis=1.3, xlab="Allele count", ylab="Number of variants", main="", log="xy", bty="n");
points(eval.ac.known$AC, eval.ac.known$n, col="blue", type="l", lwd=2);
points(eval.ac.novel$AC, eval.ac.novel$n, col="red", type="l", lwd=2);
legend("topright", c("All", "Known", "Novel"), col=c("black", "blue", "red"), lwd=2);
#
# plot(eval.func.all$Synonymous[ind] / (eval.func.all$Missense + eval.func.all$Nonsense)[ind], ylim=c(0, 2), cex=1.3, cex.lab=1.3, cex.axis=1.3, bty="n", xlab="Sample", ylab="Ratio of synonymous to non-synonymous variants", col="black");
# points(eval.func.known$Synonymous[ind] / (eval.func.known$Missense + eval.func.known$Nonsense)[ind], cex=1.3, col="blue");
# points(eval.func.novel$Synonymous[ind] / (eval.func.novel$Missense + eval.func.novel$Nonsense)[ind], cex=1.3, col="red");
# legend("topright", c("All", "Known", "Novel"), col=c("black", "blue", "red"), pt.cex=1.3, pch=21);
dev.off();
}


@@ -1,266 +0,0 @@
#New tearsheet generator
.libPaths('/humgen/gsa-pipeline/.repository/R/')
suppressMessages(library(gplots));
suppressMessages(library(ReadImages));
suppressMessages(library(gsalib));
tearsheet<-function(){
def.par <- par(no.readonly = TRUE)
#define layout
postable<-matrix(c(1, 1, 1, 1, rep(c(2, 2, 4, 4), 5), rep(c(3, 3, 4, 4), 3), rep(c(3,3,5,5), 5), 6,7,8,9), nrow=15, ncol=4, byrow=TRUE)
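#panel 1 spans the full-width title bar; panels 2-5 take the summary tables and the bottom row
#(panels 6-9) is reserved for plots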
layout(postable, heights=c(1, rep(.18, 13), 2), respect=FALSE)
#prep for title bar
drop<-read.jpeg(system.file("data", "tearsheetdrop.jpg", package="gsalib"))
#plot title bar
par(mar=c(0,0,0,0))
plot(drop)
text(155, 50, cmdargs$title, family="serif", adj=c(0,0), cex=3, col=gray(.25))
print("Title created...")
# Project summary
projects = paste(squids, collapse=", ");
used_samples = nrow(settable);
unused_samples = 0;
sequencing_protocol = samp$Initiative[1]
bait_design = samp$"Bait Set"[1]
callable_target = samp$"Target Territory"[1]
table1<-rbind(paste(used_samples," used samples/", unused_samples + used_samples," total samples", sep=""), sequencing_protocol, bait_design, callable_target)
rownames(table1)<-c("Samples","Sequencing Initiative", "Bait Design","Callable Target")
par(mar=c(0,0,1,0))
textplot(table1, col.rownames="darkblue", show.colnames=FALSE, cex=1.25, valign="top")
title(main=sprintf("Project Summary (%s)\n", projects), family="sans", cex.main=1.25, line=-1)
print("Project summary created...")
# Bases summary
reads_per_lane_mean = format(mean(lane$"PF Reads (HS)", na.rm=TRUE), 8, 3,1, scientific=TRUE);
reads_per_lane_sd = format(sd(lane$"PF Reads (HS)", na.rm=TRUE), 8, 3,1, scientific=TRUE);
lanessum<-sprintf("%s +/- %s\n", reads_per_lane_mean, reads_per_lane_sd)
used_bases_per_lane_mean = format(mean(lane$"PF HQ Aligned Q20 Bases", na.rm=TRUE),8, 3,1, scientific=TRUE);
used_bases_per_lane_sd = format(sd(lane$"PF HQ Aligned Q20 Bases", na.rm=TRUE), 8, 3,1, scientific=TRUE);
lanessum<-c(lanessum, sprintf("%s +/- %s\n", used_bases_per_lane_mean, used_bases_per_lane_sd));
target_coverage_mean = mean(na.omit(lane$"Mean Target Coverage"));
target_coverage_sd = sd(na.omit(lane$"Mean Target Coverage"));
lanessum<-c(lanessum, sprintf("%0.2fx +/- %0.2fx\n", target_coverage_mean, target_coverage_sd));
pct_loci_gt_10x_mean = mean(na.omit(lane$"Target Bases 10x %"));
pct_loci_gt_10x_sd = sd(na.omit(lane$"Target Bases 10x %"));
lanessum<-c(lanessum, sprintf("%0.2f%% +/- %0.2f%%\n", pct_loci_gt_10x_mean, pct_loci_gt_10x_sd));
pct_loci_gt_20x_mean = mean(na.omit(lane$"Target Bases 20x %"));
pct_loci_gt_20x_sd = sd(na.omit(lane$"Target Bases 20x %"));
lanessum<-c(lanessum,sprintf("%0.2f%% +/- %0.2f%%\n", pct_loci_gt_20x_mean, pct_loci_gt_20x_sd));
pct_loci_gt_30x_mean = mean(na.omit(lane$"Target Bases 30x %"));
pct_loci_gt_30x_sd = sd(na.omit(lane$"Target Bases 30x %"));
lanessum<-c(lanessum,sprintf("%0.2f%% +/- %0.2f%%\n", pct_loci_gt_30x_mean, pct_loci_gt_30x_sd));
reads_per_sample_mean = format(mean(samp$"PF Reads", na.rm=TRUE), 8, 3,1, scientific=TRUE);
reads_per_sample_sd = format(sd(samp$"PF Reads",na.rm=TRUE), 8, 3,1, scientific=TRUE);
sampssum<-sprintf("%s +/- %s\n", reads_per_sample_mean, reads_per_sample_sd);
used_bases_per_sample_mean = format(mean(samp$"PF HQ Aligned Q20 Bases", na.rm=TRUE),8, 3,1, scientific=TRUE);
used_bases_per_sample_sd = format(sd(samp$"PF HQ Aligned Q20 Bases", na.rm=TRUE), 8, 3,1, scientific=TRUE);
sampssum<-c(sampssum, sprintf("%s +/- %s\n", used_bases_per_sample_mean, used_bases_per_sample_sd));
target_coverage_mean = mean(na.omit(samp$"Mean Target Coverage"));
target_coverage_sd = sd(na.omit(samp$"Mean Target Coverage"));
sampssum<-c(sampssum, sprintf("%0.2fx +/- %0.2fx\n", target_coverage_mean, target_coverage_sd));
pct_loci_gt_10x_mean = mean(na.omit(samp$"Target Bases 10x %"));
pct_loci_gt_10x_sd = sd(na.omit(samp$"Target Bases 10x %"));
sampssum<-c(sampssum, sprintf("%0.2f%% +/- %0.2f%%\n", pct_loci_gt_10x_mean, pct_loci_gt_10x_sd));
pct_loci_gt_20x_mean = mean(na.omit(samp$"Target Bases 20x %"));
pct_loci_gt_20x_sd = sd(na.omit(samp$"Target Bases 20x %"));
sampssum<-c(sampssum, sprintf("%0.2f%% +/- %0.2f%%\n", pct_loci_gt_20x_mean, pct_loci_gt_20x_sd));
pct_loci_gt_30x_mean = mean(na.omit(samp$"Target Bases 30x %"));
pct_loci_gt_30x_sd = sd(na.omit(samp$"Target Bases 30x %"));
sampssum<-c(sampssum, sprintf("%0.2f%% +/- %0.2f%%\n", pct_loci_gt_30x_mean, pct_loci_gt_30x_sd));
table2<-cbind(lanessum, sampssum)
used_lanes = length(unique(paste(lane$Flowcell, lane$Lane)));
if(nrow(lane)>used_lanes){
colnames(table2)<-c("Per barcoded readgroup", "Per sample")
}
else{
colnames(table2)<-c("Per lane", "Per sample")
}
rownames(table2)<-c("Reads", "Used bases", "Average target coverage", "% loci covered to 10x", "% loci covered to 20x","% loci covered to 30x")
par(mar=c(0,0,1,0))
textplot(table2, rmar=1, col.rownames="dark blue", cex=1.25, valign="top")
title(main="Bases Summary", family="sans", cex.main=1.25, line=0)
print("Bases summary created...")
# Sequencing summary
instrument <- c();
if(length(grep("AAXX", lane$Flowcell))>0){
instrument <- c(instrument, "Illumina GA2")
}
if(length(grep("ABXX", lane$Flowcell))>0){
instrument <- c(instrument, "Illumina HiSeq")
}
if(length(instrument)>1){
instrument<-paste(instrument[1], instrument[2], sep=" and ")
}
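# Sequencer detection is a naming heuristic: flowcell barcodes ending in AAXX came
# off GA2 runs and ABXX off HiSeq runs, so the suffix identifies the instrument(s).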
used_lanes = length(unique(paste(lane$Flowcell, lane$Lane)));
unused_lanes_by_sequencing = 0; #can we get this?
unused_lanes_by_analysis = 0;
lanes_per_sample_mean = mean(table(lane$"External ID"), na.rm=TRUE);
lanes_per_sample_sd = sd(table(lane$"External ID"), na.rm=TRUE);
lanes_per_sample_median = median(table(lane$"External ID"));
lanes_paired = length(unique(paste(subset(lane, lane$"Lane Type" == "Paired")$Flowcell, subset(lane, lane$"Lane Type" == "Paired")$Lane)));
lanes_widowed = length(unique(paste(subset(lane, lane$"Lane Type" == "Widowed")$Flowcell, subset(lane, lane$"Lane Type" == "Widowed")$Lane)));
lanes_single = length(unique(paste(subset(lane, lane$"Lane Type" == "Single")$Flowcell, subset(lane, lane$"Lane Type" == "Single")$Lane)));
read_length_mean = mean(lane$"Mean Read Length (P)", na.rm=TRUE);
read_length_sd = sd(lane$"Mean Read Length (P)", na.rm=TRUE);
read_length_median = median(lane$"Mean Read Length (P)", na.rm=TRUE);
date = sort(as.Date(lane$"Run Date", format="%d-%b-%y"));
start_date = format(date[1], "%B %d, %Y");
end_date = format(date[length(date)], "%B %d, %Y");
if(nrow(lane)>used_lanes){
used_lanes<-paste(used_lanes, " (multiplexed; ", nrow(lane), " total barcoded readgroups)", sep="")
}
table3<-rbind(
  paste(instrument),
  used_lanes,
  sprintf("%s rejected by sequencing, %s by analysis\n", unused_lanes_by_sequencing, unused_lanes_by_analysis),
  sprintf("%0.1f +/- %0.1f lanes (median=%0.1f)\n", lanes_per_sample_mean, lanes_per_sample_sd, lanes_per_sample_median),
  sprintf("%s paired, %s widowed, %s single\n", lanes_paired, lanes_widowed, lanes_single),
  sprintf("%0.1f +/- %0.1f bases (median=%0.1f)\n", read_length_mean, read_length_sd, read_length_median),
  sprintf("\tSequencing dates: %s to %s\n", start_date, end_date))
rownames(table3)<-c("Sequencer", "Used lanes", "Unused lanes","Used lanes/sample", "Lane parities", "Read lengths", "Sequencing dates")
par(mar=c(0,0,1,0))
textplot(table3, rmar=1, col.rownames="dark blue", show.colnames=FALSE, cex=1.25, valign="top")
title(main="Sequencing Summary", family="sans", cex.main=1.25, line=0)
print("Sequencing summary created...")
# Variant summary
eval.counts = basiceval$CountVariants
if("FunctionalClass" %in% colnames(eval.counts)){
eval.counts= subset(eval.counts, FunctionalClass == "all")
}
if("Sample" %in% colnames(eval.counts)){
eval.counts= subset(eval.counts, Sample == "all")
}
if("Filter" %in% colnames(eval.counts)){
eval.counts= subset(eval.counts, Filter == "called")
}
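# The FunctionalClass/Sample/Filter columns are only present when the eval was
# stratified along those axes, so collapse each to its aggregate stratum first.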
eval.counts.all = subset(eval.counts, Novelty == "all")$nVariantLoci;
eval.counts.known = subset(eval.counts,Novelty == "known")$nVariantLoci;
eval.counts.novel = subset(eval.counts, Novelty == "novel")$nVariantLoci;
eval.titv = basiceval$TiTvVariantEvaluator
if("FunctionalClass" %in% colnames(eval.titv)){
eval.titv= subset(eval.titv, FunctionalClass == "all")
}
if("Sample" %in% colnames(eval.titv)){
eval.titv= subset(eval.titv, Sample == "all")
}
if("Filter" %in% colnames(eval.titv)){
eval.titv= subset(eval.titv, Filter == "called")
}
eval.titv.all = subset(eval.titv, Novelty == "all")$tiTvRatio;
eval.titv.known = subset(eval.titv, Novelty == "known")$tiTvRatio;
eval.titv.novel = subset(eval.titv, Novelty == "novel")$tiTvRatio;
table4 = matrix(c(eval.counts.all, eval.counts.known, eval.counts.novel, eval.titv.all, eval.titv.known, eval.titv.novel, "3.0 - 3.2", "3.2 - 3.4", "2.7 - 3.0"), nrow=3);
rownames(table4) = c("All", "Known", "Novel");
colnames(table4) = c("Found", "Ti/Tv ratio", "Expected Ti/Tv ratio");
print("Variant summary created...")
par(mar=c(0,0,0,0))
textplot(table4, rmar=1, col.rownames="dark blue", cex=1.25, valign="top")
title(main="Variant Summary", family="sans", cex.main=1.25, line=-2)
eval.bysample = SAeval$CountVariants
eval.bysample.all = subset(eval.bysample, Novelty == "all" & Sample != "all");
eval.bysample.known = subset(eval.bysample, Novelty == "known"& Sample != "all");
eval.bysample.novel = subset(eval.bysample, Novelty == "novel"& Sample != "all");
eval.bysampleTITV = SAeval$TiTvVariantEvaluator
eval.bysampleTITV.all = subset(eval.bysampleTITV, Novelty == "all" & Sample != "all");
eval.bysampleTITV.known = subset(eval.bysampleTITV, Novelty == "known"& Sample != "all");
eval.bysampleTITV.novel = subset(eval.bysampleTITV, Novelty == "novel"& Sample != "all");
eval.ac = basiceval$SimpleMetricsByAC.metrics
if("FunctionalClass" %in% colnames(eval.titv)){
eval.ac= subset(eval.ac, FunctionalClass == "all")
}
if("Sample" %in% colnames(eval.titv)){
eval.ac= subset(eval.ac, Sample == "all")
}
if("Filter" %in% colnames(eval.titv)){
eval.ac= subset(eval.ac, Filter == "called")
}
eval.ac.all = subset(eval.ac, Novelty == "all");
eval.ac.known = subset(eval.ac, Novelty == "known");
eval.ac.novel = subset(eval.ac, Novelty == "novel");
eval.func = FCeval$CountVariants
par(mar=c(5, 5, 4, 2) + 0.1)
boxplot(eval.bysampleTITV.all$tiTvRatio, eval.bysampleTITV.known$tiTvRatio, eval.bysampleTITV.novel$tiTvRatio, main="Ti/Tv by Sample", col=c("dark gray", "blue", "red"), names=c("All", "Known", "Novel"), ylab="Ti/Tv per sample", cex=1.3, cex.lab=1.3, cex.axis=1.3);
par(mar=c(7, 5, 4, 2) + 0.1)
ind = order(eval.bysample.all$nVariantLoci);
plot(eval.bysample.all$nVariantLoci[ind], xlab="",pch=16, col="black", xaxt="n", cex=1.1, cex.lab=1.1, cex.axis=1.1, main="Variants per Sample", ylab="Number of variants\n(axis in log space)", bty="n", log="y",ylim=c(1, max(eval.bysample.all$nVariantLoci)));
points(eval.bysample.known$nVariantLoci[ind], pch=16, col="blue", cex=1.3);
points(eval.bysample.novel$nVariantLoci[ind], pch=16,col="red", cex=1.3);
legend("bottomleft", max(eval.bysample.all$nVariantLoci)/2, c("All", "Known", "Novel"), , col=c("black", "blue", "red"), pt.cex=1.3, pch=16);
if(nrow(samp)<25){
axis(1, at=c(1:length(eval.bysample.all$Sample[ind])), lab=eval.bysample.all$Sample[ind], cex=.7, las=2 )
}else{
axis(1, at=c(1:nrow(samp)), lab=rep("", nrow(samp)), cex=0.1, las=2, lwd.ticks=0)
title(xlab="Sample\n(too many individuals to label)")
}
par(mar=c(6, 5, 4, 2) + 0.1)
plot(sort(eval.ac.all$AC), eval.ac.all$n[order(eval.ac.all$AC)], ylim=c(1, max(eval.ac$n)), col="black", type="l", lwd=2, cex=1.1, cex.lab=1.1, cex.axis=1.1, xlab="Allele count\n(axis in log space)", ylab="Number of variants\n(axis in log space)", main="Variants by Allele Count", log="xy", bty="n");
points(sort(eval.ac.known$AC), eval.ac.known$n[order(eval.ac.known$AC)], col="blue", type="l", lwd=2);
points(sort(eval.ac.novel$AC), eval.ac.novel$n[order(eval.ac.novel$AC)], col="red", type="l", lwd=2);
if(nrow(samp)<25){
legend("bottomleft", c("All", "Known", "Novel"), col=c("black", "blue", "red"), lwd=2);
}else{
legend("topright", c("All", "Known", "Novel"), col=c("black", "blue", "red"), lwd=2);
}
par(mar=c(5, 5, 4, 2) + 0.1)
barplot(eval.func$nVariantLoci[4:nrow(eval.func)], col=c("dark gray", "blue", "red"), space=c(.2,0,0), log="y", main="Variants by Functional Class", xlab="Functional Class", ylab="Number of variants\n(axis in log space)")
axis(1, at=c(1.5,5,8.5), lab=c("Missense", "Nonsense", "Silent"), cex=.5, tick=FALSE)
legend("top", c("All", "Known", "Novel"), fill=c("dark gray", "blue", "red"), cex=.7);
print("Graphs created...")
print("All done!")
par(def.par)#- reset to default
}

View File

@ -1,41 +0,0 @@
source("/humgen/gsa-pipeline/.repository/R/DataProcessingReport/qcplots.r")
suppressMessages(library(gplots));
def.par <- par(no.readonly = TRUE)
cmdargs = gsa.getargs(
list(
tsv = list(value=NA, doc="pipeline tsv file"),
evalroot = list(value=NA, doc="VariantEval file base (everything before the .eval)"),
reportout = list(value=NA, doc="Output path for report PDF")
),
doc="Creates a variant report"
);
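# Example invocation (hypothetical script name; assumes gsa.getargs reads
# "-name value" pairs matching the keys declared above):
#   Rscript QCReport.R -tsv samples.tsv -evalroot out/callset -reportout report.pdf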
read.delim(cmdargs$tsv, header=FALSE)->settable
squids<-unique(settable[,1])
gsa.read.gatkreport(paste(cmdargs$evalroot, ".eval", sep=""))->basiceval
gsa.read.gatkreport(paste(cmdargs$evalroot, ".extraSA.eval", sep=""))->SAeval
print("Evals read")
pdf(file= cmdargs$reportout, width=22, height=17, pagecentre=TRUE, pointsize=24)
print("PDF created...")
path="."
weirdos<-which(SAeval$TiTvVariantEvaluator$Sample %in% SAeval$TiTvVariantEvaluator$Sample[which(SAeval$TiTvVariantEvaluator$tiTvRatio <2)])
novelAC(SAeval)
knownAC(SAeval)
AllAC(SAeval)
layout(matrix(c(6,1, 2,3, 4, 5), nrow=6), heights=c(1, 1, 1, 1, 1,1))
textplot("Sample Novel TiTv ranges should be above 2, as they are in previous datasets. \nSamples with lower TiTv data are flagged in subsequent plots with hot pink labels, and listed below:")
textplot(paste(unique(SAeval$TiTvVariantEvaluator$Sample[weirdos]), collapse=", "), halign="left")
textplot("Problem Samples frequently have unusually high or low numbers of variants.")
textplot("Samples with unusually high numbers of novel variants may be from different populations, and, as such, should have higher heterozygosity. \nIf this is not the case, there may be problems with the samples.")
textplot("Unusually high numbers of variants with low allele counts may indicate variants generated from problematic samples.")
textplot("Notes for interpreting QC data:")
dev.off()

View File

@ -1,177 +0,0 @@
#preqc.r
library(gplots)
.libPaths('/humgen/gsa-pipeline/.repository/R/')
library(gsalib)
cmdargs = gsa.getargs(
list(
tsv = list(value=NA, doc="pipeline tsv file"),
qcout=list(value=NA, doc="path to output root")
),
doc="Creates a tearsheet"
);
read.delim(cmdargs$tsv, header=FALSE)->settable
squids<-unique(settable[,1])
print(paste(nrow(settable), "samples in tsv"))
lane<-data.frame()
samp<-data.frame()
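# For each SQUID project named in the tsv, pull lane-level metrics (second argument
# TRUE) and aggregated sample-level metrics (FALSE), keeping only rows for samples
# listed in the tsv; the TRUE/FALSE granularity toggle is inferred from how the two
# tables are used below.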
for(squid in squids){
gsa.read.squidmetrics(squid, TRUE)->lanemetrics
print(paste("Got lane metrics for", squid))
addlanes<-lanemetrics[which(lanemetrics$"External ID" %in% settable[,2]),]
gsa.read.squidmetrics(squid, FALSE)->samplemetrics
print(paste("Got sample metrics for", squid))
addsamps<-samplemetrics[which(samplemetrics$Sample %in% settable[,2]),]
lane<-rbind(lane, addlanes)
samp<-rbind(samp, addsamps)
}
print(paste(nrow(samp), "samples in samp"))
print(paste(length(unique(lane$"External ID")), "samples in lane"))
print(paste(setdiff(settable[,2], samp$Sample), "do not overlap between samp and tsv"))
print(paste(setdiff(settable[,2], lane$"External ID"), "do not overlap between lane and tsv"))
print(paste(setdiff(samp$Sample, lane$"External ID"), "do not overlap between lane and samp"))
missingSamp<-setdiff(settable[,2], samp$Sample)
missingLane<-setdiff(settable[,2], lane$"External ID")
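# Pull site-wide Picard metrics from the reporting database so the current set can
# be benchmarked against every earlier set run with the same bait design (the
# "All Sets" boxplots below). The queries use DBI with the ROracle driver.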
drv = dbDriver("Oracle");
con = dbConnect(drv, "REPORTING/REPORTING@ora01:1521/SEQPROD");
rs = dbSendQuery(con, statement = paste("SELECT * FROM ILLUMINA_PICARD_METRICS"));
d = fetch(rs, n=-1);
dbHasCompleted(rs);
dbClearResult(rs);
rs2 = dbSendQuery(con, statement = paste("SELECT * FROM ILLUMINA_SAMPLE_STATUS_AGG"));
d2 = fetch(rs2, n=-1);
dbHasCompleted(rs2);
dbClearResult(rs2);
dbDisconnect(con);
dbUnloadDriver(drv);
compsamp=d2[which(d2$"Bait Set" %in% samp$"Bait Set"),]
complane=d[which(d$"Bait Set" %in% lane$"Bait Set"),]
pdf(paste(cmdargs$qcout, "pdf", sep="."), width=11, height=8.5)
plot(samp$"Target Bases 20x %", main="Coverage to 20x", ylab="% Targets Covered to 20x", xlab="Sample", ylim=c(0,100))
abline(h=80, lty=2)
legend("bottomright", lty=2, legend="80% coverage to 20x")
lowcoverage<-samp$Sample[which(samp$"Target Bases 20x %"<80)]
if(length(lowcoverage)>0){
text(which(samp$"Target Bases 20x %"<80),samp$"Target Bases 20x %"[which(samp$"Target Bases 20x %"<80)], labels=samp$Sample[which(samp$"Target Bases 20x %"<80)], pos=2, srt=270, cex=.6, col="hotpink")
}
plot(samp$"Zero Coverage Targets %", main="Zero Coverage", ylab="% Targets with zero coverage", log="y", xlab="Sample", ylim=c(0.01,100))
abline(h=3, lty=2)
legend("bottomright", lty=2, legend="3% Targets Zero Coverage")
lowcoverage<-c(lowcoverage,samp$Sample[which(samp$"Zero Coverage">3)])
if(length(which(samp$"Zero Coverage Targets %">3))>0){
text(which(samp$"Zero Coverage Targets %">3), samp$"Zero Coverage Targets %"[which(samp$"Zero Coverage Targets %">3)], labels=samp$Sample[which(samp$"Zero Coverage Targets %">3)], pos=2, srt=270, cex=.6, col="hotpink")
}
print("Coverage stats done")
nofp<-lane$"External ID"[which(is.na(lane$"FP LOD"))]
if(length(which(is.na(lane$"FP LOD")))< nrow(lane)){
plot(lane$"FP Confident Calls"~as.factor(lane$"External ID"), xlab="sample", ylab="Multiplex level # FP calls", main="Fingerprint Calls/Sample Instance", xaxt="n")
medians<-tapply(lane$"FP Confident Calls",lane$"External ID", median, na.rm=TRUE)
points(as.factor(dimnames(medians)[[1]]),medians,col="red", lwd=2)
legend("topleft", legend="Median across sample instances", pch=1, lwd=2, col="red", lty=0)
poorFPcov<-dimnames(medians)[[1]][which(medians<5 )]
if(length(poorFPcov)>0){
text(which(medians<5), medians[which(medians<5)],poorFPcov, pos=2, srt=270, cex=.6, col="hotpink")
}
print("1 fp plot")
plot(100*(lane$"FP Confident Matching SNPs"/lane$"FP Confident Calls")~as.factor(lane$"External ID"), xlab="sample", ylab="Multiplex level % matching FP calls", main="% Confident calls matching for samples with low confident calls", xaxt="n", ylim=c(0,110))
print("2 fp plot")
plot(lane$"FP LOD"~as.factor(lane$"External ID"), xlab="sample", ylab="Sample Fingerprint LOD", main="Fingerprint Pass:Samples", xaxt="n")
offsamps<-lane$"External ID"[which(lane$"FP LOD"<(-3))]
lowfpLOD<-lane$"External ID"[which(lane$"FP LOD"<6)]
if(length(lowfpLOD)>0){
text(which(lane$"External ID" %in% lowfpLOD), lane$"FP_LOD"[which(lane$"FP LOD"<6)], labels=lowfpLOD, pos=2, srt=270, cex=.6, col="hotpink")
}
print("3 fp plot")
if(length(lowfpLOD)>0){
plot((lane$"FP Confident Calls"-lane$"FP Confident Matching SNPs")~as.factor(lane$"External ID"), main="Calls vs Matching Calls for Samples failing FP QC", ylab="# Mismatches", xlab="")
}
if(length(lowfpLOD)>0){
text(which(lane$"FP LOD"<6), lane$"FP_LOD"[which(lane$"FP LOD"<6)], labels=lowfpLOD, pos=2, srt=270, cex=.6, col="RED")
}
}else{
offsamps<-"NO FPDATA"
lowfpLOD<-"NO FP DATA"
poorFPcov<-"NO FP DATA"
}
print("FP stats done")
boxplot(samp$"Total SNPs", compsamp$"Total SNPs", names=c("Current Set", "All Sets"), ylab="Total SNPs per sample", main="Total SNPs")
standardQuants<-boxplot.stats(compsamp$"Total SNPs")$stats
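# boxplot.stats()$stats is c(lower whisker, Q1, median, Q3, upper whisker), so
# samples falling outside the historical whiskers are flagged as outliers.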
offSNPs<-samp$Sample[which(samp$"Total SNPs" <standardQuants[1])]
offSNPs<-c(offSNPs, samp$Sample[which(samp$"Total SNPs" >standardQuants[5])])
if(length(offSNPs) > 0){
text(1, samp$"Total SNPs"[which(samp$Sample %in% offSNPs)], labels=offSNPs, pos=2, col="hot pink")
}
print("SNP stats done")
boxplot(samp$"dbSNP %", compsamp$"dbSNP %", names=c("Current Set", "All Sets"), ylab="% SNPs in dbSNP per sample", main="dbSNP Percentage")
standardQuants<-boxplot.stats(compsamp$"dbSNP %")$stats
offdbSNP<-samp$Sample[which(samp$"dbSNP %" <standardQuants[1])]
offdbSNP<-c(offdbSNP, samp$Sample[which(samp$"dbSNP %" >standardQuants[5])])
if(length(offdbSNP) > 0){
text(1, samp$"dbSNP %"[which(samp$Sample %in% offdbSNP)], labels=offdbSNP, pos=2, col="hot pink")
}
print("DBSNP stats done")
sampDuplication<-sub(pattern="Catch-.*: ", "",samp$"Library Duplication %")
sampDuplication<-as.numeric(sub("%", "", sampDuplication))
compsampDuplication<-sub(pattern="Catch-.*: ", "",compsamp$"Library Duplication %")
compsampDuplication<-as.numeric(sub("%", "", compsampDuplication))
boxplot(sampDuplication, compsampDuplication, names=c("Current Set", "All Sets"), ylab="% Duplication", main="Library Duplication")
standardQuants<-boxplot.stats(compsampDuplication)$stats
offDup<-samp$Sample[which(sampDuplication <standardQuants[1])]
offDup<-c(offDup, samp$Sample[which(sampDuplication >standardQuants[5])])
if(length(offDup) > 0){
text(1, sampDuplication[which(samp$Sample %in% offDup)], labels=offDup, pos=2, col="hot pink")
}
print("Duplication stats done")
allproblemsamples<-unique(c(lowcoverage, poorFPcov, offsamps, lowfpLOD, offSNPs, offdbSNP, offDup, missingLane, missingSamp))
problemMat<-matrix(c(rep("PASS", length(allproblemsamples)*9)), nrow=length(allproblemsamples))
rownames(problemMat)<-allproblemsamples
colnames(problemMat)<-c("low coverage", "low fp cov", "Identity Fail", "low FP LOD", "weird SNP count", "weird dbSNP %", "Duplicated", "Missing lane data", "missing agg data")
problemMat[which(rownames(problemMat) %in% lowcoverage),1]<-"FAIL"
problemMat[which(rownames(problemMat) %in% poorFPcov),2]<-"FAIL"
problemMat[which(rownames(problemMat) %in% offsamps),3]<-"FAIL"
problemMat[which(rownames(problemMat) %in% lowfpLOD),4]<-"FAIL"
problemMat[which(rownames(problemMat) %in% offSNPs),5]<-"FAIL"
problemMat[which(rownames(problemMat) %in% offdbSNP),6]<-"FAIL"
problemMat[which(rownames(problemMat) %in% offDup),7]<-"FAIL"
problemMat[which(rownames(problemMat) %in% missingLane),8]<-"FAIL"
problemMat[which(rownames(problemMat) %in% missingSamp),9]<-"FAIL"
textplot(problemMat, cex=.5)
write.table(problemMat, file=paste(cmdargs$qcout,"qc.table",sep="."), quote=FALSE, sep="\t")
print("no fp")
print(unique(nofp))
dev.off()
print("All stats done")

View File

@ -1,181 +0,0 @@
.libPaths(c('/humgen/gsa-firehose2/pipeline/repositories/StingProduction/R/', '~/Documents/Sting/R/')) # set both library paths at once; .libPaths() replaces rather than appends
library(gsalib)
def.par <- par(no.readonly = TRUE)
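# These helpers assume two inputs supplied by the caller: a global 'path' naming a
# directory of twelve historical eval snapshots saved as exome.1 .. exome.12 (each
# an .RData file whose load() defines 'data'), and the current set's eval tables
# passed in as 'current'.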
titvplot<-function(current){
par(mfcol=c(1,2))
titvs<-c()
status<-c()
for(i in c(1:12)){
load(sprintf("%sexome.%i", path, i));
info<-subset(data$TiTvVariantEvaluator, Sample!="all")
titvs<-c(titvs, info$tiTvRatio)
status<-c(status, info$Novelty)
print(length(titvs))
print(length(status))
}
print(length(unique(current$TiTvVariantEvaluator$Sample))-1)
length(unique(current$TiTvVariantEvaluator$Sample))-1+length(titvs[which(status=="novel")])->nvalues
print(length(titvs[which(status=="novel")]))
print(nvalues)
plot(current$TiTvVariantEvaluator$tiTvRatio[which(current$TiTvVariantEvaluator$Sample!="all" & current$TiTvVariantEvaluator$Novelty=="novel")], xlim=c(0,nvalues), ylim=c(0,4), col="red", main="Current samples compared to previous samples from 12 sets", ylab="Per sample Ti/Tv", xlab="sample")
points(current$TiTvVariantEvaluator$tiTvRatio[which(current$TiTvVariantEvaluator$Sample!="all" & current$TiTvVariantEvaluator$Novelty=="known")], col="blue")
points(current$TiTvVariantEvaluator$tiTvRatio[which(current$TiTvVariantEvaluator$Sample!="all" & current$TiTvVariantEvaluator$Novelty=="all")], col="black")
points(c(length(unique(current$TiTvVariantEvaluator$Sample)):nvalues), titvs[which(status=="novel")], pch=16, col="red")
points(c(length(unique(current$TiTvVariantEvaluator$Sample)):nvalues), titvs[which(status=="known")], pch=16, col="blue")
points(c(length(unique(current$TiTvVariantEvaluator$Sample)):nvalues), titvs[which(status=="all")], pch=16, col="black")
legend("bottomleft", col=c("red", "blue", "black"), pch=c(1,1,1,16,16, 16),legend=c("novel variants:current set", "known variants:current set", "all varaints:current set", "novel variants:previous sets", "known variants:previous sets", "all variants: previous sets"))
weirdos<-which(current$TiTvVariantEvaluator$Sample %in% current$TiTvVariantEvaluator$Sample[which(current$TiTvVariantEvaluator$tiTvRatio <2.0)])
if(length(weirdos)>0){
text(weirdos[c(1:(length(weirdos)/3))],current$TiTvVariantEvaluator$tiTvRatio[weirdos], labels=current$TiTvVariantEvaluator$Sample[weirdos], pos=4, cex=.7, col="hot pink")
}
boxplot(current$TiTvVariantEvaluator$tiTvRatio[which(current$TiTvVariantEvaluator$Sample!="all" & current$TiTvVariantEvaluator$Novelty=="novel")],titvs[which(status=="novel")], current$TiTvVariantEvaluator$tiTvRatio[which(current$TiTvVariantEvaluator$Sample!="all" & current$TiTvVariantEvaluator$Novelty=="known")],titvs[which(status=="known")], current$TiTvVariantEvaluator$tiTvRatio[which(current$TiTvVariantEvaluator$Sample!="all" & current$TiTvVariantEvaluator$Novelty=="all")], titvs[which(status=="all")], col=rep(c("red", "blue", "black"), each=2), main="Current v. Previous per sample Ti/TV", xlab="Sample Sets",ylab="Ti/Tv per sample", xaxt="n" )
axis(side=1, at=c(1:6)-.2, labels=rep(c("current", "previous"), 3), cex.axis=.7)
legend("bottomleft",legend=c("novel", "known", "all"), fill=c("red", "blue", "black"))
if(length(weirdos)>0){
text(rep(c(5,3,1), each=(length(weirdos)/3)),current$TiTvVariantEvaluator$tiTvRatio[weirdos], labels=current$TiTvVariantEvaluator$Sample[weirdos], pos=4, cex=.7, col="hot pink")
}
par(def.par)#- reset to default
}
variantplots<-function(current){
par(mfcol=c(1,2))
variants<-c()
status<-c()
for(i in c(1:12)){
load(sprintf("%s/exome.%i", path, i));
info<-subset(data$CountVariants, Sample!="all")
variants<-c(variants, info$nSNPs)
status<-c(status, info$Novelty)
}
length(unique(current$CountVariants$Sample))-1+length(variants[which(status=="novel")])->nvalues
plot(current$CountVariants$nSNPs[which(current$CountVariants$Sample!="all" & current$CountVariants$Novelty=="novel")], xlim=c(0,nvalues), ylim=c(1,25000), log="y", col="red", main="Current samples compared to previous samples from 12 sets", ylab="Per sample #SNPs", xlab="sample")
points(current$CountVariants$nSNPs[which(current$CountVariants$Sample!="all" & current$CountVariants$Novelty=="known")], col="blue")
points(current$CountVariants$nSNPs[which(current$CountVariants$Sample!="all" & current$CountVariants$Novelty=="all")], col="black")
points(c(length(unique(current$CountVariants$Sample)):nvalues), variants[which(status=="novel")], pch=16, col="red")
points(c(length(unique(current$CountVariants$Sample)):nvalues), variants[which(status=="known")], pch=16, col="blue")
points(c(length(unique(current$CountVariants$Sample)):nvalues), variants[which(status=="all")], pch=16, col="black")
legend("bottomleft", col=c("red", "blue", "black"), pch=c(1,1,1,16,16, 16),legend=c("novel variants:current set", "known variants:current set", "all varaints:current set", "novel variants:previous sets", "known variants:previous sets", "all variants: previous sets"))
weirdos<-which(current$CountVariants$Sample %in% current$TiTvVariantEvaluator$Sample[which(current$TiTvVariantEvaluator$tiTvRatio <2.0)])
if(length(weirdos)>0){
text(weirdos[c(1:(length(weirdos)/3))],current$CountVariants$nSNPs[weirdos], labels=current$CountVariants$Sample[weirdos], pos=4, cex=.7, col="hot pink")
}
boxplot(current$CountVariants$nSNPs[which(current$CountVariants$Sample!="all" & current$CountVariants$Novelty=="novel")],variants[which(status=="novel")], current$CountVariants$nSNPs[which(current$CountVariants$Sample!="all" & current$CountVariants$Novelty=="known")],variants[which(status=="known")], current$CountVariants$nSNPs[which(current$CountVariants$Sample!="all" & current$CountVariants$Novelty=="all")], variants[which(status=="all")], col=rep(c("red", "blue", "black"), each=2), main="Current v. Previous per sample #SNPs", xlab="Sample Sets",ylab="SNPs per sample", xaxt="n", ylim=c(10,25000), log="y")
axis(side=1, at=c(1:6)-.2, labels=rep(c("current", "previous"), 3), cex.axis=.7)
if(length(weirdos)>0){
text(rep(c(5,3,1), each=(length(weirdos)/3)),current$CountVariants$nSNPs[weirdos], labels=current$CountVariants$Sample[weirdos], pos=4, cex=.7, col="hot pink")
}
legend("topleft",legend=c("novel", "known", "all"), fill=c("red", "blue", "black"))
par(def.par)#- reset to default
}
heteroplots<-function(current){
par(mfcol=c(1,2))
hets<-c()
status<-c()
for(i in c(1:12)){
load(sprintf("%s/exome.%i", path, i));
info<-subset(data$CountVariants, Sample!="all")
hets<-c(hets, info$heterozygosity)
status<-c(status, info$Novelty)
}
length(unique(current$CountVariants$Sample))-1+length(hets[which(status=="novel")])->nvalues
plot(current$CountVariants$heterozygosity[which(current$CountVariants$Sample!="all" & current$CountVariants$Novelty=="novel")], xlim=c(0,nvalues), ylim=c(-0.0005, 0.0005), col="red", main="Current samples compared to previous samples from 12 sets", ylab="Per sample heterozygosity", xlab="sample")
points(current$CountVariants$heterozygosity[which(current$CountVariants$Sample!="all" & current$CountVariants$Novelty=="known")], col="blue")
points(current$CountVariants$heterozygosity[which(current$CountVariants$Sample!="all" & current$CountVariants$Novelty=="all")], col="black")
points(c(length(unique(current$CountVariants$Sample)):nvalues), hets[which(status=="novel")], pch=16, col="red")
points(c(length(unique(current$CountVariants$Sample)):nvalues), hets[which(status=="known")], pch=16, col="blue")
points(c(length(unique(current$CountVariants$Sample)):nvalues), hets[which(status=="all")], pch=16, col="black")
legend("bottomleft", col=c("red", "blue", "black"), pch=c(1,1,1,16,16, 16),legend=c("novel variants:current set", "known variants:current set", "all varaints:current set", "novel variants:previous sets", "known variants:previous sets", "all variants: previous sets"))
weirdos<-which(current$CountVariants$Sample %in% current$TiTvVariantEvaluator$Sample[which(current$TiTvVariantEvaluator$tiTvRatio <2.0)])
if(length(weirdos)>0){
text(weirdos[c(1:(length(weirdos)/3))],current$CountVariants$heterozygosity[weirdos], labels=current$CountVariants$Sample[weirdos], pos=4, cex=.7, col="hot pink")
}
boxplot(current$CountVariants$heterozygosity[which(current$CountVariants$Sample!="all" & current$CountVariants$Novelty=="novel")],hets[which(status=="novel")], current$CountVariants$heterozygosity[which(current$CountVariants$Sample!="all" & current$CountVariants$Novelty=="known")],hets[which(status=="known")], current$CountVariants$heterozygosity[which(current$CountVariants$Sample!="all" & current$CountVariants$Novelty=="all")], hets[which(status=="all")], col=rep(c("red", "blue", "black"), each=2), main="Current v. Previous per sample Heterozygosity", xlab="Sample Sets",ylab="Heterozygosity per sample", xaxt="n")
axis(side=1, at=c(1:6)-.2, labels=rep(c("current", "previous"), 3), cex.axis=.7)
if(length(weirdos)>0){
text(rep(c(5,3,1), each=(length(weirdos)/3)),current$CountVariants$heterozygosity[weirdos], labels=current$CountVariants$Sample[weirdos], pos=4, cex=.7, col="hot pink")
}
legend("topleft",legend=c("novel", "known", "all"), fill=c("red", "blue", "black"))
par(def.par)#- reset to default
}
novelAC<-function(current){
ACs<-sort(current$SimpleMetricsByAC.metrics$AC[which(current$SimpleMetricsByAC.metrics$Novelty=="novel")])
orderbyAC<-order(current$SimpleMetricsByAC.metrics$AC[which(current$SimpleMetricsByAC.metrics$Novelty=="novel")])
varbyAC<-current$SimpleMetricsByAC.metrics$n[which(current$SimpleMetricsByAC.metrics$Novelty=="novel")][orderbyAC]
plot(ACs, varbyAC, type="l", log="xy", lwd=4, col="dark red", main="Novel AC", ylab="# variants (log scale)", xlab="AC (log scale)")
for(i in c(1:12)){
load(sprintf("%s/exome.%i", path, i));
info<-data$SimpleMetricsByAC.metrics
ACs<-sort(info$AC[which(info$Novelty=="novel")])
orderbyAC<-order(info$AC[which(info$Novelty=="novel")])
varbyAC<-info$n[which(info$Novelty=="novel")][orderbyAC]
lines(ACs, varbyAC, col="red")
}
legend("topright",legend=c("current", "previous"), lwd=c(4,1), col=c("dark red", "red"))
}
knownAC<-function(current){
ACs<-sort(current$SimpleMetricsByAC.metrics$AC[which(current$SimpleMetricsByAC.metrics$Novelty=="known")])
orderbyAC<-order(current$SimpleMetricsByAC.metrics$AC[which(current$SimpleMetricsByAC.metrics$Novelty=="known")])
varbyAC<-current$SimpleMetricsByAC.metrics$n[which(current$SimpleMetricsByAC.metrics$Novelty=="known")][orderbyAC]
plot(ACs, varbyAC, type="l", log="xy", lwd=4, col="dark blue", main="Known AC", ylab="# variants (log scale)", xlab="AC (log scale)")
for(i in c(1:12)){
load(sprintf("%s/exome.%i", path, i));
info<-data$SimpleMetricsByAC.metrics
ACs<-sort(info$AC[which(info$Novelty=="known")])
orderbyAC<-order(info$AC[which(info$Novelty=="known")])
varbyAC<-info$n[which(info$Novelty=="known")][orderbyAC]
lines(ACs, varbyAC, col="light blue")
}
legend("topright",legend=c("current", "previous"), lwd=c(4,1), col=c("dark blue", "light blue"))
}
AllAC<-function(current){
ACs<-sort(current$SimpleMetricsByAC.metrics$AC[which(current$SimpleMetricsByAC.metrics$Novelty=="all")])
orderbyAC<-order(current$SimpleMetricsByAC.metrics$AC[which(current$SimpleMetricsByAC.metrics$Novelty=="all")])
varbyAC<-current$SimpleMetricsByAC.metrics$n[which(current$SimpleMetricsByAC.metrics$Novelty=="all")][orderbyAC]
plot(ACs, varbyAC, type="l", log="xy", lwd=4, col="Black", main="All AC", ylab="# variants (log scale)", xlab="AC (log scale)")
for(i in c(1:12)){
load(sprintf("%s/exome.%i", path, i));
info<-data$SimpleMetricsByAC.metrics
ACs<-sort(info$AC[which(info$Novelty=="all")])
orderbyAC<-order(info$AC[which(info$Novelty=="all")])
varbyAC<-info$n[which(info$Novelty=="all")][orderbyAC]
lines(ACs, varbyAC, col="dark grey")
}
legend("topright",legend=c("current", "previous"), lwd=c(4,1), col=c("black", "dark grey"))
}

View File

@ -1,34 +0,0 @@
source("/humgen/gsa-pipeline/.repository/R/DataProcessingReport/Tearsheet.R")
cmdargs = gsa.getargs(
list(
title = list(value=NA, doc="Title for the tearsheet"),
tsv = list(value=NA, doc="pipeline tsv file"),
evalroot = list(value=NA, doc="VariantEval file base (everything before the .eval)"),
tearout = list(value=NA, doc="Output path for tearsheet PDF")
),
doc="Creates a tearsheet"
);
read.delim(cmdargs$tsv, header=FALSE)->settable
squids<-unique(settable[,1])
lane<-data.frame()
samp<-data.frame()
for(squid in squids){
gsa.read.squidmetrics(squid, TRUE)->lanemetrics
addlanes<-lanemetrics[which(lanemetrics$"External ID" %in% settable[,2]),]
gsa.read.squidmetrics(squid, FALSE)->samplemetrics
addsamps<-samplemetrics[which(samplemetrics$"Sample" %in% settable[,2]),]
lane<-rbind(lane, addlanes)
samp<-rbind(samp, addsamps)
}
print("Picard Data Obtained...")
gsa.read.gatkreport(paste(cmdargs$evalroot, ".eval", sep=""))->basiceval
gsa.read.gatkreport(paste(cmdargs$evalroot, ".extraFC.eval", sep=""))->FCeval
gsa.read.gatkreport(paste(cmdargs$evalroot, ".extraSA.eval", sep=""))->SAeval
print("Evals read")
pdf(file= cmdargs$tearout, width=22, height=17, pagecentre=TRUE, pointsize=24)
print("PDF created...")
tearsheet()
dev.off()

View File

@ -1,194 +0,0 @@
require("plotrix")
args = commandArgs(TRUE);
onCMDLine = ! is.na(args[1])
if (! is.na(args[3]) ) { name = args[3] } else { name = "" }
if ( onCMDLine ) {
print(paste("Reading data from", args[1]))
d = read.table(args[1], header=T, sep="\t")
#d$start.time = as.Date(d$start.time)
d$end.time = as.Date(d$end.time)
} # only read into d if it's available; otherwise assume the data is already loaded
# The unknown records are from the Broad
d$domain.name[d$domain.name == "unknown"] = "broadinstitute.org"
noRecords <- function(name) {
print(paste("No records", name))
frame()
title(paste("No records of", name), cex=2)
}
reportCountingPlot <- function(values, name, moreMargin = 0, ...) {
#print(length(values))
if ( length(values) > 0 ) {
par(las=2) # make label text perpendicular to axis
oldMar <- par("mar")
par(mar=c(5,8+moreMargin,4,2)) # increase y-axis margin.
t = table(factor(values))
barplot(sort(t), horiz=TRUE, cex.names = 0.5, main = name, xlab="Counts", log="x", ...)
par("mar" = oldMar)
par("las" = 1)
} else {
noRecords(name)
}
}
reportConditionalCountingPlot <- function(values, conditions, name, moreMargin = 0, ...) {
if ( length(values) > 0 ) {
t = table(values, conditions)
t = t[, order(colSums(t))]
#print(list(t = t))
if ( ! is.null(dim(t)) ) {
par(las=2) # make label text perpendicular to axis
oldMar <- par("mar")
par(mar=c(5,8+moreMargin,4,2)) # increase y-axis margin.
nconds = dim(t)[2]
cols = rainbow(nconds)
barplot(t, legend.text = T, horiz=TRUE, cex.names = 0.5, main = name, xlab="Counts", col=cols, cex=0.5, ...)
par("mar" = oldMar)
par("las" = 1)
} else {
noRecords(name)
}
} else {
noRecords(name)
}
}
reportHist <- function(values, name, ...) {
if ( ! all(is.na(values) ) )
hist(values, main=name, breaks=20, xlab="", col="cornflowerblue", ...)
}
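# myTable builds a column-normalized contingency table (prop.table over margin 2);
# with reqRowNonZero it drops factor levels whose row total is zero so empty rows
# don't clutter the per-week plots.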
myTable <- function(x, y, reqRowNonZero = F) {
table <- prop.table(table(x, y), 2)
ncols = dim(table)[2]
#print(table)
if ( reqRowNonZero )
table = table[addmargins(table)[1:dim(table)[1], ncols + 1] > 0,] # margin (row-sum) column is ncols+1
return(table)
}
# todo -- must be robust to smaller sizes
plotTable <- function(table, name, ...) {
ncols = dim(table)[2]
nrows = dim(table)[1]
if ( ! is.null(nrows) ) {
cols = rainbow(nrows)
tableMin = min(apply(table, 2, min))
tableMax = max(apply(table, 2, max))
plot( as.numeric(apply(table, 2, sum)), ylim=c(tableMin, tableMax), type="n", main = name, ylab="Frequency", xlab="Date", xaxt="n", ...)
axis(1, 1:ncols, labels=colnames(table))
for ( i in 1:nrows )
points(table[i,], type="b", col=cols[i])
legend("topright", row.names(table), fill=cols, cex=0.5)
#return(table)
}
}
RUNNING_GATK_RUNTIME <- 60 * 5 # 5 minutes => bad failure
if ( onCMDLine ) pdf(args[2])
successfulRuns <- function(d) {
x <- rep("Successful", length(d$exception.msg))
x[d$exception.msg != "NA" & d$is.user.exception == "true"] <- "Failed with UserException"
x[d$exception.msg != "NA" & d$is.user.exception == "false"] <- "Failed with StingException"
x[d$exception.msg != "NA" & (d$is.user.exception == "NA" | is.na(d$is.user.exception))] <- "Failed with StingException before UserException code"
return(x)
}
addSection <- function(name) {
par("mar", c(5, 4, 4, 2))
frame()
title(name, cex=2)
}
dropit <- function (d, columns = names(d), ...)
{
d[columns] = lapply(d[columns], "[", drop=TRUE, ...)
d
}
generateOneReport <- function(d, header, includeByWeek = T) {
head <- function(s) {
return(paste("Section:", header, "\n", s))
}
excepted <- dropit(subset(d, exception.msg != "NA"))
UserExceptions <- dropit(subset(excepted, is.user.exception == "true"))
StingExceptions <- dropit(subset(excepted, is.user.exception == "false" | is.user.exception == "NA" | is.na(is.user.exception)))
addSection(paste("GATK run report", name, "for", Sys.Date(), "\nwith", dim(d)[1], "run repository records"))
reportCountingPlot(d$walker.name, head("Walker invocations"))
reportConditionalCountingPlot(d$user.name, d$walker.name, head("Walker invocations by user"))
reportCountingPlot(d$svn.version, head("SVN version"))
reportConditionalCountingPlot(d$svn.version, d$user.name, head("SVN by user"))
# cuts by time
if ( includeByWeek ) {
plotTable(table(rep("GATK Invocations", length(d$end.time)), cut(d$end.time, "weeks")), head("GATK Invocations by week"))
plotTable(myTable(successfulRuns(d), cut(d$end.time, "weeks")), head("Successful and failing GATK invocations per week"))
plotTable(myTable(d$svn.version, cut(d$end.time, "weeks")), head("SVN version by week"))
}
plotTable(table(rep("GATK Invocations", length(d$end.time)), d$end.time), head("GATK Invocations by day"))
plotTable(myTable(d$svn.version, d$end.time), head("SVN version by day"))
#
# Exception handling
#
addExceptionSection <- function(subd, subname, exceptionColor) {
addSection(paste(subname))
#print(list(subd = length(subd$end.time), name=subname))
reportCountingPlot(subd$walker.name, head(paste("Walkers with", subname)), col=exceptionColor)
reportCountingPlot(subd$exception.at, head(paste(subname, "locations")), 12, col=exceptionColor)
#reportCountingPlot(subd$exception.msg, head(paste(subname, "messages")), 12, col=exceptionColor)
reportConditionalCountingPlot(subd$user.name, subd$exception.at, head(paste("Walker invocations by user for", subname)), 12)
if ( includeByWeek && length(subd$end.time) > 0 ) {
plotTable(myTable(subd$walker.name, cut(subd$end.time, "weeks"), reqRowNonZero = T), head(paste("Walkers with", subname,"by week")), col=exceptionColor)
}
}
addExceptionSection(excepted, "Exceptions", "grey")
reportCountingPlot(excepted$user.name, head("Usernames generating exceptions"), col="grey")
addExceptionSection(StingExceptions, "StingExceptions", "red")
addExceptionSection(UserExceptions, "UserExceptions", "blue")
Gb <- 1024^3
reportHist(d$total.memory / Gb, head("Used memory"))
reportHist(d$max.memory / Gb, head("Max memory"))
min <- 60
reportHist(log10(d$run.time / min), head("Run time (log10[min])"))
reportCountingPlot(d$user.name, head("user"))
reportCountingPlot(d$domain.name, head("Domain name"))
#reportCountingPlot(d$host.name, head("host"))
reportCountingPlot(d$java, head("Java version"))
#reportCountingPlot(d$machine, head("Machine"))
#reportCountingPlot(d$working.directory, head("Working directory"))
}
RUNME = T
if ( RUNME ) {
lastWeek = levels(cut(d$end.time, "weeks"))[-1]
generateOneReport(d, "Overall")
#generateOneReport(subset(d, end.time >= lastWeek), "Just last week to date", includeByWeek = F)
}
if ( onCMDLine ) dev.off()

View File

@ -1,144 +0,0 @@
args <- commandArgs(TRUE)
docBase <- args[1]
## APPEND THE SUFFIXES ##
locusStats <- paste(docBase,".sample_locus_statistics",sep="")
targetStats <- paste(docBase,".sample_interval_statistics",sep="")
sampleSum <- paste(docBase,".sample_summary_statistics",sep="")
sampleStats <- paste(docBase,".sample_statistics",sep="")
targetSum <- paste(docBase,".sample_interval_summary",sep="")
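## NOTE: only the sample_statistics, sample_summary_statistics and
## sample_locus_statistics files are plotted below; the two interval-level
## files are named for completeness but never read.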
## DEFINE THE PLOTTING FUNCTIONS ##
PlotDepths <- function(X) {
pdf("Depth_Histogram_All_Samples.pdf")
Y <- as.matrix(X)
colors <- rainbow(nrow(Y),gamma=0.8)
plot(Y[1,],col=colors[1],type="b",xlab="",xaxt="n",ylab="Number of Loci")
axis(1,labels=FALSE)
labels <- colnames(X)
text(1:ncol(Y),par("usr")[3]-(100/6000)*par("usr")[4],srt=45,adj=1,labels=labels,xpd=TRUE,cex=0.7)
for ( jj in 2:nrow(Y) ) {
points(Y[jj,],col=colors[jj],type="b")
}
ymax = par("usr")[4]
xmax = par("usr")[2]
legend(y=0.95*ymax,x=0.8*xmax,col=colors,rownames(X),lty=c(1),cex=0.5)
dev.off()
}
PlotLocusQuantiles <- function(X) {
pdf("Per_Sample_Coverage_Quantiles.pdf")
Y <- as.matrix(X)
Y <- Y/sum(Y[1,])
Z <- matrix(nrow=nrow(Y),ncol=ncol(Y))
for ( ii in 1:nrow(Y) ) {
for ( jj in 1:ncol(Y) ) {
# see how much density is in the remaining columns
Z[ii,jj] = sum(Y[ii,jj:ncol(Y)])
}
}
medians = matrix(nrow=1,ncol=ncol(Z))
quan90 = matrix(nrow=1,ncol=ncol(Z))
for ( cc in 1:ncol(Z) ) {
medians[cc] = quantile(Z[,cc],0.75)
quan90[cc] = quantile(Z[,cc],1)
}
plot(t(medians),xlab="",xaxt="n",ylab="Proportion of loci with >X coverage",type="b",col="blue",yaxp=c(0,1,10))
axis(1,labels=FALSE)
parseColNames <- function(K) {
M = matrix(nrow=1,ncol=length(K))
number = 0
for ( lab in K ) {
number = 1 + number
g = unlist(strsplit(lab,split="_"))
M[1,number] = g[2]
}
return(M)
}
labels <- parseColNames(colnames(X))
text(1:length(labels),par("usr")[3]-0.025,srt=90,adj=1,labels=labels,xpd=TRUE,cex=(0.8/32)*length(labels),lheight=(0.8/32)*length(labels))
points(t(quan90),type="b",col="red")
legend(x=floor(0.6*length(labels)),y=1,c("75% of samples","100% of samples"),col=c("red","blue"),lty=c(1,1))
dev.off()
}
HistogramMedians <- function(X) {
pdf("Per_Sample_Median_Histogram.pdf")
hist(as.numeric(as.matrix(unlist(X[1:nrow(X)-1,5]))),floor(nrow(X)/2),xlab="Median Coverage",ylab="Number of Samples", main="Median coverage across samples",col="grey")
dev.off()
}
HeatmapLocusTable <- function(X) {
pdf("Locus_Coverage_HeatMap.pdf")
Y <- as.matrix(X)
heatmap(Y,Rowv=NA,Colv=NA)
dev.off()
}
PlotMeanMedianQuartiles <- function(X) {
pdf("Per_Sample_Mean_Quantile_Coverage.pdf")
colors <- rainbow(4,start=0.6,end=0.9,gamma=1)
means = as.numeric(as.matrix(unlist(X[1:nrow(X)-1,3])))
medians = as.numeric(as.matrix(unlist(X[1:nrow(X)-1,5])))
thirdQ = as.numeric(as.matrix(unlist(X[1:nrow(X)-1,4])))
firstQ = as.numeric(as.matrix(unlist(X[1:nrow(X)-1,6])))
plot(means,xlab="",ylab="Depth of Coverage",xaxt="n",col=colors[1],pch=3,type="b",ylim=c(0,max(thirdQ)))
points(firstQ,col=colors[2],pch=2,type="b")
points(medians,col=colors[3],pch=1,type="b")
points(thirdQ,col=colors[4],pch=2,type="b")
axis(1,labels=FALSE)
labels <- X[1:nrow(X)-1,1]
text(1:nrow(X)-1,par("usr")[3]-(50/2500)*par("usr")[4],srt=90,adj=1,labels=labels,xpd=TRUE,cex=0.5)
text(5*nrow(X)/8,par("usr")[3]-(350/2500)*par("usr")[4],adj=1,labels="SAMPLE_ID",xpd=TRUE)
legend(x=nrow(X)/10,y=par("usr")[4]-(200/2500)*par("usr")[4],c("Mean","25% Quantile","Median","75% Quantile"),col=colors,lty=c(1),cex=0.8,pch=c(3,2,1,2))
dev.off()
}
PlotOnlyMeanMedian <- function(X) {
pdf("Per_Sample_Mean_Median_Only.pdf")
colors <- rainbow(2,start=0.6,end=0.9,gamma=1)
means = as.numeric(as.matrix(unlist(X[1:nrow(X)-1,3])))
medians = as.numeric(as.matrix(unlist(X[1:nrow(X)-1,5])))
plot(means,xlab="",ylab="Depth of Coverage",xaxt="n",col=colors[1],pch=3,type="b",ylim=c(0,max(c(max(means),max(medians)))))
points(medians,col=colors[2],pch=1,type="b")
axis(1,labels=FALSE)
labels <- X[1:nrow(X)-1,1]
text(1:nrow(X)-1,par("usr")[3]-(50/2500)*par("usr")[4],srt=90,adj=1,labels=labels,xpd=TRUE,cex=0.5)
text(5*nrow(X)/8,par("usr")[3]-(350/2500)*par("usr")[4],adj=1,labels="SAMPLE_ID",xpd=TRUE)
legend(x=nrow(X)/10,y=par("usr")[4]-(200/2500)*par("usr")[4],c("Mean","Median"),col=colors,lty=c(1),cex=0.8,pch=c(3,2))
dev.off()
}
PlotOnlyQuartiles <- function(X) {
pdf("Per_Sample_Quartiles_Only.pdf")
colors <- rainbow(2,start=0.6,end=0.9,gamma=1)
thirdQ = as.numeric(as.matrix(unlist(X[1:nrow(X)-1,4])))
firstQ = as.numeric(as.matrix(unlist(X[1:nrow(X)-1,6])))
plot(thirdQ,xlab="",ylab="Depth of Coverage",xaxt="n",col=colors[1],pch=3,type="b",ylim=c(0,max(thirdQ)))
points(firstQ,col=colors[2],pch=2,type="b")
axis(1,labels=FALSE)
labels <- X[1:nrow(X)-1,1]
text(1:nrow(X)-1,par("usr")[3]-(50/2500)*par("usr")[4],srt=90,adj=1,labels=labels,xpd=TRUE,cex=0.5)
text(5*nrow(X)/8,par("usr")[3]-(350/2500)*par("usr")[4],adj=1,labels="SAMPLE_ID",xpd=TRUE)
legend(x=nrow(X)/10,y=par("usr")[4]-(200/2500)*par("usr")[4],c("75% Quantile","25% Quantile"),col=colors,lty=c(1),cex=0.8,pch=c(3,2))
dev.off()
}
## PLOT SAMPLE STATISTICS
TO_PLOT <- read.table(sampleStats)
PlotDepths(TO_PLOT)
PlotLocusQuantiles(TO_PLOT)
## PLOT SAMPLE SUMMARY
TO_PLOT <- read.table(sampleSum,header=TRUE)
PlotMeanMedianQuartiles(TO_PLOT)
PlotOnlyMeanMedian(TO_PLOT)
PlotOnlyQuartiles(TO_PLOT)
HistogramMedians(TO_PLOT)
## PLOT LOCUS STATISTICS
TO_PLOT <- read.table(locusStats)
HeatmapLocusTable(TO_PLOT)

View File

@ -1,254 +0,0 @@
library(ellipse);
library(hexbin);
getAnnIndex <- function(d, ann) {
index = -1;
for (i in c(1:length(names(d)))) {
if (names(d)[i] == ann) {
index = i;
}
}
index;
}
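# Equivalent to match(ann, names(d)), except a missing annotation yields -1
# rather than NA.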
getClusterAnnIndex <- function(c, ann) {
index = -1;
for (i in c(1:length(c[[1]]$anns))) {
if (c[[1]]$anns[i] == ann) {
index = i;
}
}
index;
}
plotAnn <- function(d.known, d.novel, d.loci, ann) {
index = getAnnIndex(d.known, ann);
k = hist(d.known[,index], breaks=100, plot=FALSE);
n = hist(d.novel[,index], breaks=100, plot=FALSE);
plot(k$mids, k$density, type="b", col="blue", ylim=c(0, max(k$density)), lwd=2, xlab=ann, ylab="Density", bty="n");
points(n$mids, n$density, type="b", col="red", lwd=2);
if (is.data.frame(d.loci)) { # d.loci is NA when no suspicious-loci file was supplied
legend("topright", c("Known", "Novel", "Suspicious loci"), col=c("blue", "red", "yellow3"), pch=c(21, 21, 18));
} else {
legend("topright", c("Known", "Novel"), col=c("blue", "red"), pch=21);
}
if (is.data.frame(d.loci)) {
for (i in c(1:nrow(d.loci))) {
points(d.loci[i, index], 0, col="yellow3", pch=18, cex=2.0);
}
}
}
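# read.clusters parses the Gaussian-mixture text output, whose layout is inferred
# from the parsing below: "ANNOTATION,<name>,<offset>,<multiplier>" lines declare
# each normalized annotation, and "CLUSTER,<weight>,<mu_1..k>,<k*k covariance
# values>" lines give one mixture component in that normalized annotation space.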
read.clusters <- function(filename) {
con = file(filename, "r", blocking = FALSE)
lines = readLines(con)
close(con);
anns = c();
annIndex = 1;
clusterIndex = 1;
clusters = c();
conversions = c();
for (line in lines) {
if (length(grep("ANNOTATION", line)) > 0) {
linePieces = unlist(strsplit(line, ","));
anns = c(anns, linePieces[2]);
conversions[[annIndex]] = list(ann = linePieces[2], offset = as.numeric(linePieces[3]), multiplier = as.numeric(linePieces[4]));
annIndex = annIndex + 1;
} else if (length(grep("CLUSTER", line)) > 0) {
linePieces = unlist(strsplit(line, ","));
mixtureWeight = linePieces[2];
mu = linePieces[3:(3+length(anns)-1)];
cov = linePieces[(3+length(anns)):length(linePieces)];
clusters[[clusterIndex]] = list(
anns = anns,
conversions = conversions,
mixtureWeight = as.numeric(mixtureWeight),
means = as.numeric(mu),
cov = matrix(cov, nrow=length(anns), ncol=length(anns))
);
clusterIndex = clusterIndex + 1;
}
}
clusters;
}
clusterLimits <- function( vals, defaultMin, defaultMax ) {
x = c(max(defaultMin, min(vals, -2)), min(defaultMax, max(vals, 2)))
print(x)
x
}
getClusterColor <- function(clusterIndex, nClusters) {
clusterColors(nClusters)[clusterIndex]
}
clusterColors <- function(nClusters) {
rainbow(nClusters)
}
makeAxis <- function( num, vals, off1, mult1, xmin, xmax ) {
#labels=as.integer(seq(from=min(vals), to=max(vals), by=(abs(min(vals)) + abs(max(vals)))/5))
#at=seq(from=min((vals - off1)/mult1), to=max((vals - off1)/mult1), by=(abs(min((vals - off1)/mult1)) + abs(max((vals - off1)/mult1)))/5)
#from = xmin * mult1 + off1
#to = xmax * mult1 + off1
#print(list(off1=off1, mult1=mult1, xmin=xmin, xmax=xmax))
at = as.integer(seq(from=xmin, to=xmax, by=(abs(xmin) + abs(xmax))/5))
labels = as.integer(at * mult1 + off1)
#print(list(from=from, to=to, by=(abs(from) + abs(to))/5))
#print(list(labels=labels, at=at))
axis(num, labels=labels, at=at);
# axis(num,
# labels=as.integer(seq(from=min(vals), to=max(vals), by=(abs(min(vals)) + abs(max(vals)))/5)),
# at=seq(from=min((vals - off1)/mult1), to=max((vals - off1)/mult1), by=(abs(min((vals - off1)/mult1)) + abs(max((vals - off1)/mult1)))/5)
# );
}
plotClusters <- function(d.known, d.novel, d.loci, c, ann1, ann2, filename, maxVariants = -1) {
index1 = getAnnIndex(d.known, ann1);
index2 = getAnnIndex(d.known, ann2);
cindex1 = getClusterAnnIndex(c, ann1);
cindex2 = getClusterAnnIndex(c, ann2);
mult1 = c[[1]]$conversions[[cindex1]]$multiplier;
off1 = c[[1]]$conversions[[cindex1]]$offset;
mult2 = c[[1]]$conversions[[cindex2]]$multiplier;
off2 = c[[1]]$conversions[[cindex2]]$offset;
xvalsForLims = clusterLimits(d.known[,index1], -4, 4)
yvalsForLims = clusterLimits(d.known[,index2], -4, 4)
xlims = c(min(xvalsForLims), 1.2*max(xvalsForLims));
ylims = c(min(yvalsForLims), max(yvalsForLims));
# par(mar=c(5, 6, 2, 5));
plot(0, 0, type="n", xaxt="n", yaxt="n", xlim=xlims, ylim=ylims, xlab=ann1, ylab=ann2, bty="n");
mv.known = if (maxVariants == -1 | maxVariants >= nrow(d.known)) { seq(1, nrow(d.known)) } else { as.integer(runif(maxVariants, 1, nrow(d.known)+1))}
mv.novel = if (maxVariants == -1 | maxVariants >= nrow(d.novel)) { 1:nrow(d.novel) } else { as.integer(runif(maxVariants, 1, nrow(d.novel)+1)) }
print(length(mv.known))
print(maxVariants)
points(((d.known[,index1] - off1)/mult1)[mv.known], ((d.known[,index2] - off2)/mult2)[mv.known], pch=19, cex=0.3, col="#0000FF33");
points(((d.novel[,index1] - off1)/mult1)[mv.novel], ((d.novel[,index2] - off2)/mult2)[mv.novel], pch=19, cex=0.3, col="#FF000033");
nClusters = length(c)
for (clusterIndex in c(1:nClusters)) {
mu = c(c[[clusterIndex]]$means[cindex1], c[[clusterIndex]]$means[cindex2]);
cov = matrix(as.numeric(
matrix(
c(
c[[clusterIndex]]$cov[cindex1,cindex1],
c[[clusterIndex]]$cov[cindex2,cindex1],
c[[clusterIndex]]$cov[cindex1,cindex2],
c[[clusterIndex]]$cov[cindex2,cindex2]
),
nrow=2, ncol=2
)
), nrow=2, ncol=2
);
weight = c[[clusterIndex]]$mixtureWeight;
color = getClusterColor(clusterIndex, nClusters);
lineweight = ifelse(weight > 0.50, 4, 3);
points(mu[1], mu[2], pch=21, col=color, cex=0.5);
points(ellipse(t(cov), centre=mu), type="l", lwd=lineweight, col=color);
}
makeAxis(1, d.novel[,index1], off1, mult1, xvalsForLims[1], xvalsForLims[2])
makeAxis(2, d.novel[,index2], off2, mult2, yvalsForLims[1], yvalsForLims[2])
# add points legend on the lower left
if (is.data.frame(d.loci)) {
legend("bottomleft", c("Known", "Novel", "Suspicious loci"), col=c("blue", "red", "yellow3"), pch=19);
} else {
legend("bottomleft", c("Known", "Novel"), col=c("blue", "red"), pch=19);
}
# add upper right legend with cluster id and weights
weights = round(sapply(c, function(x) x$mixtureWeight),2)
clusterNames = paste("C", paste(1:nClusters), sep="")
clusterLegendNames = paste(clusterNames, weights, sep="-W=")
legend("topright", clusterLegendNames, fill=clusterColors(nClusters))
if (is.data.frame(d.loci)) {
points((d.loci[,index1] - off1)/mult1, (d.loci[,index2] - off2)/mult2, pch=19, cex=0.8, col="yellow3");
}
}
args = commandArgs(TRUE);
plotRoot = args[1];
if (is.na(plotRoot)) { plotRoot = "test"; }
clusterFile = args[2];
if (is.na(clusterFile)) { clusterFile = "/Volumes/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesProcessingPaper/wgs.v8/GA2.WEx.cleaned.ug.snpfiltered.indelfiltered.optimized"; }
vcfTable = args[3];
if (is.na(vcfTable)) { vcfTable = "/Volumes/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesProcessingPaper/wgs.v8/GA2.WEx.cleaned.ug.snpfiltered.indelfiltered.optimized.table"; }
lociFile = args[4];
if (is.na(lociFile) | lociFile == "NA" ) { lociFile = NA; }
maxVariants = args[5];
if (is.na(maxVariants)) { maxVariants = 5000; }
maxVariants = as.integer(maxVariants)
greedy = args[6]
if (is.na(greedy)) { greedy = -1; }
greedy = as.integer(greedy)
l = c();
if (!is.na(lociFile)) {
t = read.table(lociFile, header=TRUE);
l = t$POS;
}
print("Greedy reading")
d = read.table(vcfTable, header=TRUE, nrows = greedy);
c = read.clusters(clusterFile);
d.known = d[which(d$DB == 1 | d$ID != "."),];
d.novel = d[which(d$DB == 0 | d$ID == "."),];
d.loci = NA;
if (length(l) > 0) {
d.loci = d[which(d$POS %in% l),];
}
pdf(paste(plotRoot, ".clusterReport.pdf", sep=""));
for (ann1 in c[[1]]$anns) {
print(ann1)
plotAnn(d.known, d.novel, d.loci, ann1);
for (ann2 in c[[1]]$anns) {
if (ann1 != ann2) {
print(paste("-- v ", ann2))
plotClusters(d.known, d.novel, d.loci, c, ann1, ann2, maxVariants=maxVariants);
}
}
}
dev.off();

View File

@ -1,442 +0,0 @@
suppressPackageStartupMessages(library(gsalib));
suppressPackageStartupMessages(library(gplots));
eval.getMetrics <- function(eval, jexl_expression) {
callset.counts = eval$CountVariants[which(eval$CountVariants$evaluation_name == "eval" & eval$CountVariants$comparison_name == "dbsnp" & eval$CountVariants$jexl_expression == jexl_expression),];
callset.counts.titv = eval$TiTv[which(eval$TiTv$evaluation_name == "eval" & eval$TiTv$comparison_name == "dbsnp" & eval$TiTv$jexl_expression == jexl_expression),];
callset.calledCounts = callset.counts[which(callset.counts$filter_name == "called" & callset.counts$novelty_name == "all"),]$nVariantLoci;
callset.calledCounts.titv = callset.counts.titv[which(callset.counts.titv$filter_name == "called" & callset.counts.titv$novelty_name == "all"),]$ti.tv_ratio;
callset.knownCounts = callset.counts[which(callset.counts$filter_name == "called" & callset.counts$novelty_name == "known"),]$nVariantLoci;
callset.knownCounts.titv = callset.counts.titv[which(callset.counts.titv$filter_name == "called" & callset.counts.titv$novelty_name == "known"),]$ti.tv_ratio;
callset.novelCounts = callset.counts[which(callset.counts$filter_name == "called" & callset.counts$novelty_name == "novel"),]$nVariantLoci;
callset.novelCounts.titv = callset.counts.titv[which(callset.counts.titv$filter_name == "called" & callset.counts.titv$novelty_name == "novel"),]$ti.tv_ratio;
callset.allFilteredCounts = callset.counts[which(callset.counts$filter_name == "filtered" & callset.counts$novelty_name == "all"),]$nVariantLoci;
callset.allFilteredCounts.titv = callset.counts.titv[which(callset.counts.titv$filter_name == "filtered" & callset.counts.titv$novelty_name == "all"),]$ti.tv_ratio;
callset.knownFilteredCounts = callset.counts[which(callset.counts$filter_name == "filtered" & callset.counts$novelty_name == "known"),]$nVariantLoci;
callset.knownFilteredCounts.titv = callset.counts.titv[which(callset.counts.titv$filter_name == "filtered" & callset.counts.titv$novelty_name == "known"),]$ti.tv_ratio;
callset.novelFilteredCounts = callset.counts[which(callset.counts$filter_name == "filtered" & callset.counts$novelty_name == "novel"),]$nVariantLoci;
callset.novelFilteredCounts.titv = callset.counts.titv[which(callset.counts.titv$filter_name == "filtered" & callset.counts.titv$novelty_name == "novel"),]$ti.tv_ratio;
metrics = list(
all = callset.calledCounts,
all.titv = callset.calledCounts.titv,
known = callset.knownCounts,
known.titv = callset.knownCounts.titv,
novel = callset.novelCounts,
novel.titv = callset.novelCounts.titv,
filtered.all = callset.allFilteredCounts,
filtered.all.titv = callset.allFilteredCounts.titv,
filtered.known = callset.knownFilteredCounts,
filtered.known.titv = callset.knownFilteredCounts.titv,
filtered.novel = callset.novelFilteredCounts,
filtered.novel.titv = callset.novelFilteredCounts.titv
);
}
.plot.callsetConcordance.getLabelText <- function(name, othername, metrics, filtered.metrics=NA, union) {
if (!is.list(filtered.metrics)) { # NA is passed when there are no filtered metrics
text = sprintf("%s (%0.01f%% of union)\nCalled:\nAll: %d, Ti/Tv: %0.2f\nKnown: %d, Ti/Tv: %0.2f\nNovel: %d, Ti/Tv: %0.2f",
name, 100*metrics$all/union$all.withfiltered,
metrics$all, metrics$all.titv,
metrics$known, metrics$known.titv,
metrics$novel, metrics$novel.titv
);
} else {
text = sprintf("%s (%0.01f%% of union)\nCalled in %s, filtered in %s:\nAll: %d, Ti/Tv: %0.2f\nKnown: %d, Ti/Tv: %0.2f\nNovel: %d, Ti/Tv: %0.2f\n\nCalled in %s, absent in %s:\nAll: %d, Ti/Tv: %0.2f\nKnown: %d, Ti/Tv: %0.2f\nNovel: %d, Ti/Tv: %0.2f",
name, 100*(metrics$all + filtered.metrics$all)/union$all.withfiltered,
name, othername,
filtered.metrics$all, filtered.metrics$all.titv,
filtered.metrics$known, filtered.metrics$known.titv,
filtered.metrics$novel, filtered.metrics$novel.titv,
name, othername,
metrics$all, metrics$all.titv,
metrics$known, metrics$known.titv,
metrics$novel, metrics$novel.titv
);
}
}
plot.titlePage <- function(title, author) {
textplot(sprintf("Automated Variant Report\n\n%s\n%s\n%s\n", title, author, Sys.Date()));
}
.plot.variantTable.getRowText <- function(eval, jexl_expression) {
allVariants = eval$CountVariants[which(eval$CountVariants$jexl_expression == jexl_expression & eval$CountVariants$filter_name == "called" & eval$CountVariants$novelty_name == "all"),]$nVariantLoci;
knownVariants = eval$CountVariants[which(eval$CountVariants$jexl_expression == jexl_expression & eval$CountVariants$filter_name == "called" & eval$CountVariants$novelty_name == "known"),]$nVariantLoci;
novelVariants = eval$CountVariants[which(eval$CountVariants$jexl_expression == jexl_expression & eval$CountVariants$filter_name == "called" & eval$CountVariants$novelty_name == "novel"),]$nVariantLoci;
allTiTv = eval$TiTv[which(eval$TiTv$jexl_expression == jexl_expression & eval$TiTv$filter_name == "called" & eval$TiTv$novelty_name == "all"),]$ti.tv_ratio;
knownTiTv = eval$TiTv[which(eval$TiTv$jexl_expression == jexl_expression & eval$TiTv$filter_name == "called" & eval$TiTv$novelty_name == "known"),]$ti.tv_ratio;
novelTiTv = eval$TiTv[which(eval$TiTv$jexl_expression == jexl_expression & eval$TiTv$filter_name == "called" & eval$TiTv$novelty_name == "novel"),]$ti.tv_ratio;
cbind(allVariants, knownVariants, sprintf("%0.2f", knownTiTv), novelVariants, sprintf("%0.2f", novelTiTv));
}
plot.variantTable <- function(eval, title) {
aonly.row = .plot.variantTable.getRowText(eval, eval$CallsetOnlyNames[1]);
aonly.filtered.row = .plot.variantTable.getRowText(eval, eval$CallsetFilteredNames[1]);
intersection.row = .plot.variantTable.getRowText(eval, "Intersection");
bonly.row = .plot.variantTable.getRowText(eval, eval$CallsetOnlyNames[2]);
bonly.filtered.row = .plot.variantTable.getRowText(eval, eval$CallsetFilteredNames[2]);
variantsummary = as.data.frame(rbind(bonly.row, bonly.filtered.row, intersection.row, aonly.filtered.row, aonly.row));
rownames(variantsummary) = c(
sprintf("Called in %s, absent in %s", eval$CallsetOnlyNames[2], eval$CallsetOnlyNames[1]),
sprintf("Called in %s, filtered in %s", eval$CallsetOnlyNames[2], eval$CallsetOnlyNames[1]),
"Intersection",
sprintf("Called in %s, filtered in %s", eval$CallsetOnlyNames[1], eval$CallsetOnlyNames[2]),
sprintf("Called in %s, absent in %s", eval$CallsetOnlyNames[1], eval$CallsetOnlyNames[2])
);
colnames(variantsummary) = c("counts (all)", "counts (known)", "ti/tv (known)", "counts (novel)", "ti/tv (novel)");
textplot(variantsummary);
}
plot.callsetConcordance <- function(eval, col=c("#FF6342", "#63C6DE", "#ADDE63")) {
aonly = eval.getMetrics(eval, eval$CallsetOnlyNames[1]);
aonly.filtered = eval.getMetrics(eval, eval$CallsetFilteredNames[1]);
intersection = eval.getMetrics(eval, "Intersection");
bonly = eval.getMetrics(eval, eval$CallsetOnlyNames[2]);
bonly.filtered = eval.getMetrics(eval, eval$CallsetFilteredNames[2]);
union = list(
all = intersection$all + aonly$all + bonly$all,
all.withfiltered = intersection$all + aonly$all + bonly$all + aonly.filtered$all + bonly.filtered$all
);
gsa.plot.venn(aonly$all + intersection$all + aonly.filtered$all, bonly$all + intersection$all + bonly.filtered$all, 0, intersection$all, 0, 0, pos=c(0.32, 0.32, 0.68, 0.70), col=col);
text(0, 0.45, cex=1.2, pos=4, .plot.callsetConcordance.getLabelText(eval$CallsetNames[1], eval$CallsetNames[2], aonly, aonly.filtered, union));
text(0.5, 0.75, cex=1.2, adj=c(0.5, 0.33), .plot.callsetConcordance.getLabelText("Intersection", NA, intersection, NA, union));
text(1, 0.45, cex=1.2, pos=2, .plot.callsetConcordance.getLabelText(eval$CallsetNames[2], eval$CallsetNames[1], bonly, bonly.filtered, union));
}
plot.callsetConcordanceByAC <- function(eval, normalize=TRUE, novelty_name="all", col=c("#FF6342", "#FF9675", "#5C92A4", "#88EEFF", "#55BBFF")) {
aonly = eval.getMetricsByAc(eval, eval$CallsetOnlyNames[1], novelty_name);
aonly.filtered = eval.getMetricsByAc(eval, eval$CallsetFilteredNames[1], novelty_name);
intersection = eval.getMetricsByAc(eval, "Intersection", novelty_name);
bonly = eval.getMetricsByAc(eval, eval$CallsetOnlyNames[2], novelty_name);
bonly.filtered = eval.getMetricsByAc(eval, eval$CallsetFilteredNames[2], novelty_name);
title = paste("Callset concordance per allele count (", novelty_name, " variants)", sep="");
if (length(intersection$AC) > 0 && length(aonly$AC) == 0) {
aonly = intersection;
aonly$n = 0;
}
if (length(intersection$AC) > 0 && length(bonly$AC) == 0) {
bonly = intersection;
bonly$n = 0;
}
if (length(intersection$AC) > 0 && length(aonly.filtered$AC) == 0) {
aonly.filtered = intersection;
aonly.filtered$n = 0;
}
if (length(intersection$AC) > 0 && length(bonly.filtered$AC) == 0) {
bonly.filtered = intersection;
bonly.filtered$n = 0;
}
#par.def = par(no.readonly = TRUE);
#par(mar=c(5, 5, 3, 5));
if (normalize == TRUE) {
norm = aonly$n + aonly.filtered$n + intersection$n + bonly$n + bonly.filtered$n;
matnorm = rbind(aonly$n/norm, aonly.filtered$n/norm, intersection$n/norm, bonly.filtered$n/norm, bonly$n/norm);
barplot(matnorm, col=col, xlab="Allele count", ylab="", main=title, names.arg=intersection$AC, xlim=c(1, 1.2*max(intersection$AC)), ylim=c(0, 1.3), border=NA, yaxt="n", cex=1.3, cex.axis=1.3, cex.lab=1.3);
axis(2, at=seq(from=0, to=1, by=0.2), seq(from=0, to=1, by=0.2), cex=1.3, cex.axis=1.3);
mtext("Fraction", side=2, at=0.5, padj=-3.0, cex=1.3);
} else {
mat = rbind(aonly$n, aonly.filtered$n, intersection$n, bonly.filtered$n, bonly$n);
#barplot(mat, col=col, xlab="Allele count", ylab="counts", main=title, names.arg=intersection$AC, xlim=c(1, max(intersection$AC)), ylim=c(0, 1), border=NA, cex=1.3, cex.axis=1.3, cex.lab=1.3);
barplot(mat, col=col, xlab="Allele count", ylab="counts", main=title, names.arg=intersection$AC, xlim=c(1, 1.2*max(intersection$AC)), border=NA, cex=1.3, cex.axis=1.3, cex.lab=1.3);
#axis(2, at=seq(from=0, to=1, by=0.2), seq(from=0, to=1, by=0.2), cex=1.3, cex.axis=1.3);
#mtext("Fraction", side=2, at=0.5, padj=-3.0, cex=1.3);
}
legend(
"topright",
c(
sprintf("Called in %s, absent in %s", eval$CallsetOnlyNames[2], eval$CallsetOnlyNames[1]),
sprintf("Called in %s, filtered in %s", eval$CallsetOnlyNames[2], eval$CallsetOnlyNames[1]),
"Intersection",
sprintf("Called in %s, filtered in %s", eval$CallsetOnlyNames[1], eval$CallsetOnlyNames[2]),
sprintf("Called in %s, absent in %s", eval$CallsetOnlyNames[1], eval$CallsetOnlyNames[2])
),
fill=rev(col),
cex=1.3
);
#par(par.def);
}
plot.alleleCountSpectrum <- function(eval, novelty_name="all", col=c("#FF6342", "#FF9675", "#5C92A4", "#88EEFF", "#55BBFF")) {
aonly = eval.getMetricsByAc(eval, eval$CallsetOnlyNames[1], novelty_name);
aonly.filtered = eval.getMetricsByAc(eval, eval$CallsetFilteredNames[1], novelty_name);
intersection = eval.getMetricsByAc(eval, "Intersection", novelty_name);
bonly = eval.getMetricsByAc(eval, eval$CallsetOnlyNames[2], novelty_name);
bonly.filtered = eval.getMetricsByAc(eval, eval$CallsetFilteredNames[2], novelty_name);
title = paste("Allele count spectrum (", novelty_name, " variants)", sep="");
if (length(intersection$AC) > 0 && length(aonly$AC) == 0) {
aonly = intersection;
aonly$n = 0;
}
if (length(intersection$AC) > 0 && length(bonly$AC) == 0) {
bonly = intersection;
bonly$n = 0;
}
if (length(intersection$AC) > 0 && length(aonly.filtered$AC) == 0) {
aonly.filtered = intersection;
aonly.filtered$n = 0;
}
if (length(intersection$AC) > 0 && length(bonly.filtered$AC) == 0) {
bonly.filtered = intersection;
bonly.filtered$n = 0;
}
loci = (unique(eval$CountVariants$nProcessedLoci))[1];
ymax = 10*max((1/1000)*loci*(1/c(1:max(intersection$AC))));
suppressWarnings(plot(0, 0, type="n", xlim=c(1, length(intersection$AC)), ylim=c(1, ymax), xlab="Allele count", ylab="Number of variants", main=title, log="xy", bty="n", cex=1.3, cex.lab=1.3, cex.axis=1.3));
suppressWarnings(points(intersection$AC, aonly$n + aonly.filtered$n + intersection$n, type="l", lwd=2, col=col[1]));
suppressWarnings(points(intersection$AC, aonly$n + intersection$n, type="l", lwd=2, lty=2, col=col[2]));
suppressWarnings(points(intersection$AC, intersection$n, type="l", lwd=2, col=col[3]));
suppressWarnings(points(intersection$AC, bonly$n + intersection$n, type="l", lwd=2, lty=2, col=col[4]));
suppressWarnings(points(intersection$AC, bonly$n + bonly.filtered$n + intersection$n, type="l", lwd=2, col=col[5]));
#points(c(1:max(intersection$AC)), 0.9*(1/1000)*loci*(1/c(1:max(intersection$AC))), type="l", lwd=2, lty=2, col="black");
legend(
"bottomleft",
c(
sprintf("Intersection + called in %s, absent or filtered in %s", eval$CallsetOnlyNames[2], eval$CallsetOnlyNames[1]),
sprintf("Intersection + called in %s, absent in %s", eval$CallsetOnlyNames[2], eval$CallsetOnlyNames[1]),
"Intersection",
sprintf("Intersection + called in %s, absent in %s", eval$CallsetOnlyNames[1], eval$CallsetOnlyNames[2]),
sprintf("Intersection + called in %s, absent or filtered in %s", eval$CallsetOnlyNames[1], eval$CallsetOnlyNames[2])#,
#sprintf("Neutral expectation ( 0.9*(1/1000)*%0.1f*(1/c(1:max(%d))) )", loci, max(intersection$AC))
),
lwd=c(2, 2, 3, 2, 2),
lty=c(1, 2, 1, 2, 1),
col=rev(col),
cex=1.3
);
}
eval.getMetricsByAc <- function(eval, jexl, novelty="all") {
subset(eval$MetricsByAc,
evaluation_name == "eval" &
comparison_name == "dbsnp" &
as.character(jexl_expression) == as.character(jexl) &
filter_name == "called" &
novelty_name == novelty
);
}
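# e.g. eval.getMetricsByAc(eval, "Intersection", "novel") returns the per-AC
# rows for novel variants shared by both callsets.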
plot.titvSpectrum <- function(eval, novelty_name="all", col=c("#FF6342", "#FF9675", "#5C92A4", "#88EEFF", "#55BBFF")) {
aonly = eval.getMetricsByAc(eval, eval$CallsetOnlyNames[1], novelty_name);
aonly.filtered = eval.getMetricsByAc(eval, eval$CallsetFilteredNames[1], novelty_name);
intersection = eval.getMetricsByAc(eval, "Intersection", novelty_name);
bonly = eval.getMetricsByAc(eval, eval$CallsetOnlyNames[2], novelty_name);
bonly.filtered = eval.getMetricsByAc(eval, eval$CallsetFilteredNames[2], novelty_name);
title = paste("Ti/Tv spectrum (", novelty_name, " variants)", sep="");
if (length(intersection$AC) > 0 && length(aonly$AC) == 0) {
aonly = intersection;
aonly$n = 0;
aonly$nTi = 0;
aonly$nTv = 0;
}
if (length(intersection$AC) > 0 && length(bonly$AC) == 0) {
bonly = intersection;
bonly$n = 0;
bonly$nTi = 0;
bonly$nTv = 0;
}
if (length(intersection$AC) > 0 && length(aonly.filtered$AC) == 0) {
aonly.filtered = intersection;
aonly.filtered$n = 0;
aonly.filtered$nTi = 0;
aonly.filtered$nTv = 0;
}
if (length(intersection$AC) > 0 && length(bonly.filtered$AC) == 0) {
bonly.filtered = intersection;
bonly.filtered$n = 0;
bonly.filtered$nTi = 0;
bonly.filtered$nTv = 0;
}
titv.aonly.withfiltered = (aonly$nTi + aonly.filtered$nTi + intersection$nTi)/(aonly$nTv + aonly.filtered$nTv + intersection$nTv);
titv.aonly.withfiltered.finite = titv.aonly.withfiltered[which(is.finite(titv.aonly.withfiltered))];
titv.aonly = (aonly$nTi + intersection$nTi)/(aonly$nTv + intersection$nTv);
titv.aonly.finite = titv.aonly[which(is.finite(titv.aonly))];
titv.intersection.finite = intersection$Ti.Tv[which(is.finite(intersection$Ti.Tv))];
titv.bonly = (bonly$nTi + intersection$nTi)/(bonly$nTv + intersection$nTv);
titv.bonly.finite = titv.bonly[which(is.finite(titv.bonly))];
titv.bonly.withfiltered = (bonly$nTi + bonly.filtered$nTi + intersection$nTi)/(bonly$nTv + bonly.filtered$nTv + intersection$nTv);
titv.bonly.withfiltered.finite = titv.bonly.withfiltered[which(is.finite(titv.bonly.withfiltered))];
titv.min = min(titv.aonly.withfiltered.finite, titv.aonly.finite, titv.intersection.finite, titv.bonly.finite, titv.bonly.withfiltered.finite);
titv.max = max(titv.aonly.withfiltered.finite, titv.aonly.finite, titv.intersection.finite, titv.bonly.finite, titv.bonly.withfiltered.finite);
plot(0, 0, type="n", xlim=c(1, length(intersection$AC)), ylim=c(0, 4), xlab="Allele count", ylab="Transition/transversion (Ti/Tv) ratio", main=title, bty="n", cex=1.3, cex.lab=1.3, cex.axis=1.3);
# plot the precomputed Ti/Tv series so the curves match the legend below
points(intersection$AC, titv.aonly.withfiltered, type="l", lwd=2, col=col[1]);
points(intersection$AC, titv.aonly, type="l", lwd=2, lty=2, col=col[2]);
points(intersection$AC, intersection$Ti.Tv, type="l", lwd=2, col=col[3]);
points(intersection$AC, titv.bonly, type="l", lwd=2, lty=2, col=col[4]);
points(intersection$AC, titv.bonly.withfiltered, type="l", lwd=2, col=col[5]);
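# Dashed reference lines: rule-of-thumb expected Ti/Tv values (~2.3 genome-wide,
# ~3.3 in exonic targets); treat the exact numbers as conventions, not ground truth.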
abline(h=2.3, lty=2);
mtext("2.3", side=4, at=2.3, cex=0.9);
abline(h=3.3, lty=2);
mtext("3.3", side=4, at=3.3, cex=0.9);
#legend("topleft", c(eval$CallsetOnlyNames[1], "Intersection", eval$CallsetOnlyNames[2]), fill=col);
legend(
"topleft",
c(
sprintf("Intersection + called in %s, absent or filtered in %s", eval$CallsetOnlyNames[2], eval$CallsetOnlyNames[1]),
sprintf("Intersection + called in %s, absent in %s", eval$CallsetOnlyNames[2], eval$CallsetOnlyNames[1]),
"Intersection",
sprintf("Intersection + called in %s, absent in %s", eval$CallsetOnlyNames[1], eval$CallsetOnlyNames[2]),
sprintf("Intersection + called in %s, absent or filtered in %s", eval$CallsetOnlyNames[1], eval$CallsetOnlyNames[2])
),
lwd=c(2, 2, 3, 2, 2),
lty=c(1, 2, 1, 2, 1),
col=rev(col),
cex=1.3
);
}
plot.variantsPerSample2 <- function(eval) {
if (!all(is.na(eval$MetricsBySample))) { # the table may be a data frame or a single NA
metrics.all = eval$MetricsBySample[which(eval$MetricsBySample$evaluation_name == "eval" & eval$MetricsBySample$comparison_name == "dbsnp" & as.character(eval$MetricsBySample$jexl_expression) == "none" & eval$MetricsBySample$filter_name == "called" & eval$MetricsBySample$novelty_name == "all"),];
metrics.known = eval$MetricsBySample[which(eval$MetricsBySample$evaluation_name == "eval" & eval$MetricsBySample$comparison_name == "dbsnp" & as.character(eval$MetricsBySample$jexl_expression) == "none" & eval$MetricsBySample$filter_name == "called" & eval$MetricsBySample$novelty_name == "known"),];
metrics.novel = eval$MetricsBySample[which(eval$MetricsBySample$evaluation_name == "eval" & eval$MetricsBySample$comparison_name == "dbsnp" & as.character(eval$MetricsBySample$jexl_expression) == "none" & eval$MetricsBySample$filter_name == "called" & eval$MetricsBySample$novelty_name == "novel"),];
title = "Calls per sample";
indices = order(metrics.all$nVariants, decreasing=TRUE);
plot(0, 0, type="n", xaxt="n", xlim=c(1, length(metrics.all$sample)), ylim=c(0, max(metrics.all$nVariants)), xlab="", ylab="Number of variants", main=title, bty="n");
points(c(1:length(metrics.all$sample)), (metrics.all$nVariants)[indices], pch=21, col="black");
points(c(1:length(metrics.known$sample)), (metrics.known$nVariants)[indices], pch=21, col="blue");
points(c(1:length(metrics.novel$sample)), (metrics.novel$nVariants)[indices], pch=21, col="red");
legend("topright", c("All", "Known", "Novel"), pch=21, col=c("black", "blue", "red"));
axis(1, at=c(1:length(metrics.all$sample)), labels=(metrics.all$sample)[indices], las=2, cex.axis=0.4);
}
}
plot.variantsPerSample <- function(eval, novelty_name="all") {
if (!all(is.na(eval$SimpleMetricsBySample))) { # the table may be a data frame or a single NA
metrics = eval$SimpleMetricsBySample[which(eval$SimpleMetricsBySample$evaluation_name == "eval" & eval$SimpleMetricsBySample$comparison_name == "dbsnp" & as.character(eval$SimpleMetricsBySample$jexl_expression) == "none" & eval$SimpleMetricsBySample$filter_name == "called" & eval$SimpleMetricsBySample$novelty_name == novelty_name),];
title = paste("Calls per sample (", novelty_name, ")", sep="");
indices = order(metrics$CountVariants, decreasing=TRUE);
par.def = par(no.readonly = TRUE);
par(mar=c(5, 4, 4, 4));
plot(0, 0, type="n", xaxt="n", xlim=c(1, length(metrics$row)), ylim=c(0, max(metrics$CountVariants)), xlab="", ylab="Number of variants", main=title, bty="n");
points(c(1:length(metrics$row)), (metrics$CountVariants)[indices], pch=21, col="black");
axis(1, at=c(1:length(metrics$row)), labels=(metrics$row)[indices], las=2, cex.axis=0.4);
par(new=TRUE);
plot(0, 0, type="n", xaxt="n", yaxt="n", xlim=c(1, length(metrics$row)), ylim=c(min(metrics$TiTvRatio), 1.2*max(metrics$TiTvRatio)), xlab="", ylab="", main=title, bty="n");
points(c(1:length(metrics$row)), (metrics$TiTvRatio)[indices], pch=19, col="black");
titvaxis = c(min(metrics$TiTvRatio), max(metrics$TiTvRatio));
axis(4, at=titvaxis, labels=titvaxis, las=2);
par(par.def);
}
}
argspec = list(
evalRoot = list(value = NA, doc = "Path to the VariantEval R-output (omit the '.Analysis_Type.csv' part of the filename)"),
plotOut = list(value = NA, doc = "Path to the output PDF file"),
title = list(value = NA, doc = "The title of the report"),
author = list(value = NA, doc = "The author of the report")
);
cmdargs = gsa.getargs(argspec, doc="Take VariantEval R-output and generate a series of plots summarizing the contents");
eval = gsa.read.eval(cmdargs$evalRoot);
pdf(cmdargs$plotOut, width=10, height=10);
plot.titlePage(cmdargs$title, cmdargs$author);
plot.variantTable(eval);
if (length(eval$CallsetNames) > 0) {
# Venn diagram
plot.callsetConcordance(eval);
# Venn by AC (normalized)
plot.callsetConcordanceByAC(eval, novelty_name="all");
plot.callsetConcordanceByAC(eval, novelty_name="known");
plot.callsetConcordanceByAC(eval, novelty_name="novel");
# Venn by AC (unnormalized)
plot.callsetConcordanceByAC(eval, novelty_name="all", normalize=FALSE);
plot.callsetConcordanceByAC(eval, novelty_name="known", normalize=FALSE);
plot.callsetConcordanceByAC(eval, novelty_name="novel", normalize=FALSE);
# Allele count spectrum
plot.alleleCountSpectrum(eval, novelty_name="all");
plot.alleleCountSpectrum(eval, novelty_name="known");
plot.alleleCountSpectrum(eval, novelty_name="novel");
# Ti/Tv spectrum
plot.titvSpectrum(eval, novelty_name="all");
plot.titvSpectrum(eval, novelty_name="known");
plot.titvSpectrum(eval, novelty_name="novel");
# Per-sample
#plot.variantsPerSample(eval);
} else {
#plot.variantsPerSample(eval, novelty_name="all");
#plot.variantsPerSample(eval, novelty_name="known");
#plot.variantsPerSample(eval, novelty_name="novel");
}
dev.off();


@ -1,61 +0,0 @@
#!/usr/bin/env Rscript
args <- commandArgs(TRUE)
base_name = args[1]
input = args[2]
d <- read.table(input, header=T)
# separate the data into filtered and unfiltered
d.filtered <- d[d$filter_type=="filtered",]
d.unfiltered <- d[d$filter_type=="unfiltered",]
if (nrow(d.filtered) > 0) {
d.display <- d.filtered
} else {
d.display <- d.unfiltered
}
#
# Plot histograms of the known versus novel Ti/Tv
#
outfile = paste(base_name, ".histograms.png", sep="")
if (nrow(d.filtered) > 0) {
nFilterTypes <- 2
} else {
nFilterTypes <- 1
}
bitmap(outfile, width=600, height=(300 * nFilterTypes), units="px")
par(cex=1.1, mfrow=c(1 * nFilterTypes,2))
nbreaks <- 20
color <- "grey"
xlim <- c(0,4)
hist(d.unfiltered$known_titv, nbreaks, col=color, xlim=xlim)
hist(d.unfiltered$novel_titv, nbreaks, col=color, xlim=xlim)
if (nrow(d.filtered) > 0) {
hist(d.filtered$known_titv, nbreaks, col=color, xlim=xlim)
hist(d.filtered$novel_titv, nbreaks, col=color, xlim=xlim)
}
dev.off()
#
# Plot samples in order of novel Ti/Tv versus known Ti/Tv
#
outfile = paste(base_name, ".novel_vs_known_titv.png", sep="")
bitmap(outfile, width=600, height=600, units="px")
d.display <- d.display[order(d.display$novel_titv),]
plot(1:length(d.display$known_titv),d.display$known_titv,type="b",col="blue",ylim=c(0,4), xlab="Sample #", ylab="Ti / Tv")
points(1:length(d.display$novel_titv),d.display$novel_titv,type="b",col="red",ylim=c(0,4))
legend("bottomright", c("known","novel"), col=c("blue","red"), pch=21)
dev.off()


@ -1,15 +0,0 @@
#!/bin/env Rscript
args <- commandArgs(TRUE)
verbose = TRUE
d = read.table(args[1],head=T)
outfile = args[2]
title = args[3]
# -----------------------------------------------------------------------------------------------
# plot timing
# -----------------------------------------------------------------------------------------------
pdf(outfile, height=5, width=8)
boxplot(d$walltime ~ d$operation, ylab = "Elapsed wall time in seconds [Log10 Scale]", log="y", main=title, cex.axis=0.75)
dev.off()


@ -1,72 +0,0 @@
require("plotrix")
args = commandArgs(TRUE);
onCMDLine = ! is.na(args[1])
file = "sim_calls.table"
info = "interactive R"
if ( onCMDLine ) {
file = args[1]
pdf(args[2])
info = args[3]
}
# read the table in both batch and interactive mode so d is always defined
d <- read.table(file, header=T)
d$sim.VAR <- d$sim.AC > 0
d$called.VAR <- d$called.AC > 0
QS = unique(d$sim.Q)
MODES = unique(d$sim.MODE)
NS = unique(d$called.AN / 2)
DEPTHS = unique(d$sim.DP)
addSection <- function(name) {
par("mar", c(5, 4, 4, 2))
frame()
title(name, cex=2)
}
addSection(paste("Calling performance report: nSamples = ", NS, "\n info:", info))
results <- expand.grid(Q = QS, mode = MODES, nSamples = NS, depth = DEPTHS)
results$sensitivity = 0
results$specificity = 0
determineRates <- function(raw, Q, mode, depth) {
sub <- subset(raw, sim.Q == Q & sim.MODE == mode & sim.DP == depth)
print(c(Q,mode,depth, dim(sub)))
ct <- table(sub$called.VAR, sub$sim.VAR, dnn = c("called.VAR", "sim.VAR"), useNA = "always")
print(ct)
sensitivity = ct[2,2] / sum(ct[,2])
specificity = ct[1,1] / sum(ct[,1])
list(sensitivity = sensitivity, specificity = specificity, ct = ct)
}
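# Note: table(..., useNA="always") adds an NA row/column, so sites with an NA
# call still count in the denominators; an NA call is scored as a miss.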
for ( i in 1:(dim(results)[1]) ) {
r <- results[i,]
x <- determineRates(d, r$Q, r$mode, r$depth)
results[i,]$sensitivity = x$sensitivity
results[i,]$specificity = x$specificity
}
for ( depth in DEPTHS ) {
boxplot(called.AC ~ sim.AC, data = subset(d, called.DP == depth * NS), main = paste("Depth of coverage ", depth), xlab = "Simulation AC", ylab = "Called AC", outwex=0.5, col = "cornflowerblue")
abline(a=0,b=1,col="red",lwd=3)
}
print(results)
par(mfcol=c(2,1))
for ( Qt in QS ) {
x <- subset(results, Q == Qt)
print(x)
plot(x$depth, x$sensitivity, type="b", main = paste("Q score", Qt), xlab = "Depth", ylab="Sensitivity")
plot(x$depth, x$specificity, type="b", xlab = "Depth", ylab="Specificity")
}
par(mfcol=c(1,1))
plot(0,0, type="n", frame.plot=F, ann=F, axes=F)
addtable2plot(-1, -1, data.frame(Q=results$Q, mode=results$mode, depth=results$depth, sensitivity=format(results$sensitivity, digits=2), specificity = format(results$specificity, digits=2)))
if ( onCMDLine ) dev.off()


@ -1,58 +0,0 @@
args = commandArgs(TRUE)
onCMDLine = ! is.na(args[1])
if ( onCMDLine ) {
reference_dataset = '/Users/mhanna/metrics.perSample.formatted.table'
inputTSV = args[1]
outputPDF = args[2]
} else {
reference_dataset = '/Users/mhanna/metrics.perSample.formatted.table'
inputTSV = 'GoT2D_exomes_batch_005.tsv'
outputPDF = 'T2D.pdf'
}
require('ggplot2')
data <- read.table(inputTSV,header=T)
complete <- read.table(reference_dataset,header=T)
novel <- subset(complete,exon_intervals == "whole_exome_agilent_1.1_refseq_plus_3_boosters"&Novelty=="novel"&FunctionalClass=="all")
selected_samples <- novel$Sample %in% data$sample
novel_with_highlights <- cbind(novel,selected_samples)
if(onCMDLine) {
fingerprint_lods = list()
for(i in 1:nrow(data)) {
fingerprint_lods[[as.character(data$sample[i])]] <- eval(parse(text=data$FINGERPRINT_LODS[i]))
}
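# (Each FINGERPRINT_LODS cell is assumed to hold a literal R vector expression,
# e.g. the string "c(3.2, 5.1)"; eval(parse(...)) turns it into a numeric vector.)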
fingerprint_lod_order = order(unlist(lapply(fingerprint_lods,median),use.names=F))
pdf(outputPDF)
boxplot(fingerprint_lods[fingerprint_lod_order],las=3,main='Fingerprint LOD Scores By Sample',xlab='Sample',ylab='LOD Score Distribution',cex.axis=0.65)
qplot(Sample,Selected_Bases_Pct,data=novel_with_highlights,color=selected_samples) + opts(title='On+Near Bait Bases/PF Bases Aligned per Sample')
qplot(Sample,Mean_Target_Coverage,data=novel_with_highlights,color=selected_samples) + opts(title='Mean Target Coverage per Sample')
qplot(Sample,Zero_Coverage_Targets_Pct,data=novel_with_highlights,color=selected_samples) + opts(title='% of Targets with <2x Coverage per Sample')
qplot(Sample,Fold_80_Base_Penalty,data=novel_with_highlights,color=selected_samples) + opts(title='Fold 80 Base Penalty per Sample')
qplot(Sample,Target_Bases_20x_Pct,data=novel_with_highlights,color=selected_samples) + opts(title='% Target Bases Achieving >20x Coverage per Sample')
qplot(Sample,PF_Reads_Pct,data=novel_with_highlights,color=selected_samples) + opts(title='% PF Reads Aligned per Sample')
qplot(Sample,PF_HQ_Error_Rate,data=novel_with_highlights,color=selected_samples) + opts(title='% HQ Bases mismatching the Reference per Sample')
qplot(Sample,Mean_Read_Length,data=novel_with_highlights,color=selected_samples) + opts(title='Mean Read Length per Sample')
qplot(Sample,Bad_Cycles,data=novel_with_highlights,color=selected_samples) + opts(title='# Bad Cycles per Sample')
qplot(Sample,Strand_Balance_Pct,data=novel_with_highlights,color=selected_samples) + opts(title='% PF Reads Aligned to the + Strand per Sample')
qplot(Sample,Total_SNPs,data=novel_with_highlights,color=selected_samples) + opts(title='# SNPs called per Sample')
qplot(Sample,dbSNP_Pct,data=novel_with_highlights,color=selected_samples) + opts(title='% SNPs in dbSNP per Sample')
qplot(PCT_DBSNP,data=data,geom="histogram") + opts(title='% SNPs in dbSNP per Sample')
dev.off()
} else {
print('Plotting command-line arguments')
qplot(Sample,PF_Reads_Pct,data=novel_with_highlights,color=selected_samples) + opts(title='% PF Reads Aligned per Sample')
}
#qplot(Sample,Library_Size_HS,data=novel_with_highlights,color=selected_samples) + opts(title='Hybrid Sequencing Library Size per Sample')
#qplot(Sample,MEDIAN_INSERT_SIZE,data=novel_with_highlights,color=selected_samples) + opts(title='Median Insert Size per Sample')
#qplot(Sample,PCT_CHIMERAS,data=novel_with_highlights,color=selected_samples) + opts(title='% Chimera Read Pairs per Sample')
#qplot(Sample,PCT_ADAPTER,data=novel_with_highlights,color=selected_samples) + opts(title='% Unaligned Reads Matching an Adapter Sequence per Sample')
#qplot(Sample,NOVEL_SNPS,data=novel_with_highlights,color=selected_samples) + opts(title='# Novel SNPs called per Sample')
#qplot(Sample,DBSNP_TITV,data=novel_with_highlights,color=selected_samples) + opts(title='TiTv of SNPs in dbSNP per Sample')


@ -1,298 +0,0 @@
library("gsalib", lib.loc="/Users/depristo/Desktop/broadLocal/GATK/trunk/R/")
require("ggplot2")
require("gplots")
# TODOs:
# Assumes you have indels in your call set. If not you will get errors
# Create pre/post calling sections
# Allow conditional use of the preQCFile (where it's not available)
args = commandArgs(TRUE)
onCMDLine = ! is.na(args[1])
LOAD_DATA = T
# creates an array of c(sampleName1, ..., sampleNameN)
parseHighlightSamples <- function(s) {
return(unlist(strsplit(s, ",", fixed=T)))
}
preQCFile = NA
if ( onCMDLine ) {
ProjectName = args[1]
VariantEvalRoot = args[2]
outputPDF = args[3]
if ( ! is.na(args[4]) )
preQCFile = args[4]
if ( ! is.na(args[5]) )
highlightSamples = parseHighlightSamples(args[5])
else
highlightSamples = c()
} else {
ProjectName = "InDevelopmentInR"
preQCFile <- NA # "~/Desktop/broadLocal/GATK/trunk/qcTestData/GoT2D_exomes_batch_005_per_sample_metrics.tsv"
#VariantEvalRoot <- "qcTestData//ESPGO_Gabriel_NHLBI_eomi_june_2011_batch1"
VariantEvalRoot <- "qcTestData/MC_Engle_11_Samples_06092011"
outputPDF = "bar.pdf"
highlightSamples = c() # parseHighlightSamples("29029,47243")
}
print("Report")
print(paste("Project :", ProjectName))
print(paste("VariantEvalRoot :", VariantEvalRoot))
print(paste("outputPDF :", outputPDF))
print(paste("preQCFile :", preQCFile))
print(paste("highlightSamples :", highlightSamples))
expandVEReport <- function(d) {
d$TiTvVariantEvaluator$tiTvRatio = round(d$TiTvVariantEvaluator$tiTvRatio,2)
d$CountVariants$deletionInsertionRatio = round(d$CountVariants$deletionInsertionRatio,2)
d$CountVariants$nIndels = d$CountVariants$nInsertions + d$CountVariants$nDeletions
return(d)
}
# -------------------------------------------------------
# Utilities for displaying multiple plots per page
# -------------------------------------------------------
# Viewport (layout 2 graphs top to bottom)
distributePerSampleGraph <- function(distgraph, perSampleGraph, heights = c(2,1)) {
Layout <- grid.layout(nrow = 2, ncol = 1, heights=heights)
grid.newpage()
pushViewport(viewport(layout = Layout))
subplot <- function(x) viewport(layout.pos.row = x, layout.pos.col = 1)
print(perSampleGraph, vp = subplot(1))
print(distgraph, vp = subplot(2))
}
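# e.g. distributePerSampleGraph(densityPlot, perSamplePlot) prints perSamplePlot
# in the taller top panel (2/3 of the page height by default) with densityPlot below.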
createMetricsBySites <- function(VariantEvalRoot, PreQCMetrics) {
# Metrics by sites:
# bySite -> counts of SNPs and Indels by novelty, with expectations
# byAC -> snps and indels (known / novel)
r = list( bySite = expandVEReport(gsa.read.gatkreport(paste(VariantEvalRoot, ".summary.eval", sep=""))),
byAC = gsa.read.gatkreport(paste(VariantEvalRoot, ".byAC.eval", sep="")))
r$byAC$CountVariants$nIndels = r$byAC$CountVariants$nInsertions + r$byAC$CountVariants$nDeletions
r$byAC$TiTvVariantEvaluator$nSNPs = r$byAC$TiTvVariantEvaluator$nTi + r$byAC$TiTvVariantEvaluator$nTv
r$byAC$CountVariants$AC = r$byAC$CountVariants$AlleleCount
r$byAC$TiTvVariantEvaluator$AC = r$byAC$TiTvVariantEvaluator$AlleleCount
return(r)
}
summaryTable <- function(metricsBySites, metricsBySamples) {
# SNP summary statistics
merged = merge(metricsBySites$bySite$CountVariants, metricsBySites$bySite$TiTvVariantEvaluator)
sub <- subset(merged, FunctionalClass=="all")
raw = melt(sub, id.vars=c("Novelty"), measure.vars=c("nProcessedLoci", "nSNPs", "tiTvRatio", "nIndels", "deletionInsertionRatio"))
table = cast(raw, Novelty ~ ...)
# doesn't work with textplot
colnames(table) <- c("Novelty", "Target size (bp)", "No. SNPs", "Ti/Tv", "No. Indels", "deletion/insertion ratio")
return(table)
}
sampleSummaryTable <- function(metricsBySamples) {
# SNP summary statistics
raw <- melt(metricsBySamples, id.vars=c("Novelty", "Sample"), measure.vars=c("nProcessedLoci", "nSNPs", "tiTvRatio", "nIndels", "deletionInsertionRatio"))
table = cast(raw, Novelty ~ variable, mean)
table$nSNPs <- round(table$nSNPs, 0)
table$nIndels <- round(table$nIndels, 0)
table$tiTvRatio <- round(table$tiTvRatio, 2)
table$deletionInsertionRatio <- round(table$deletionInsertionRatio, 2)
colnames(table) <- c("Novelty", "Target size (bp)", "No. SNPs", "Ti/Tv", "No. Indels", "deletion/insertion ratio")
return(table)
}
overallSummaryTable <- function(metricsBySites, metricsBySamples) {
sitesSummary <- as.data.frame(summaryTable(metricsBySites, metricsBySamples))
sitesSummary$Metric.Type <- "Sites"
sampleSummary <- as.data.frame(sampleSummaryTable(metricsBySamples))
sampleSummary$Metric.Type <- "Per-sample avg."
# that last item puts the metric.type second in the list
return(rbind(sitesSummary, sampleSummary)[, c(1,7,2,3,4,5,6)])
}
summaryPlots <- function(metricsBySites) {
name = "SNP and Indel count by novelty and allele frequency"
molten = melt(subset(metricsBySites$byAC$CountVariants, Novelty != "all" & AC > 0), id.vars=c("Novelty", "AC"), measure.vars=c(c("nSNPs", "nIndels")))
p <- ggplot(data=molten, aes(x=AC, y=value+1, color=Novelty, fill=Novelty), group=variable)
p <- p + opts(title = name)
p <- p + scale_y_log10("Number of variants")
p <- p + geom_point(alpha=0.5, size=3)
p <- p + geom_line(size=1)
p <- p + facet_grid(variable ~ ., scales="free")
p <- p + scale_x_continuous("Allele count (AC)")
p2 <- p + scale_x_log10("Allele count (AC)")
p2 <- p2 + opts(title = "")
distributePerSampleGraph(p2, p, c(1,1))
# Counts vs. Allele frequency
name = "Variant counts by allele count"
for ( measure in c("nSNPs", "nIndels")) {
molten = melt(subset(metricsBySites$byAC$CountVariants, AC > 0), id.vars=c("Novelty", "AC"), measure.vars=c(measure))
p <- ggplot(data=molten, aes(x=AC, y=value+1, color=Novelty), group=variable)
p <- p + opts(title = paste(name, ":", measure))
p <- p + scale_y_log10("Number of variants")
p <- p + scale_x_log10("Allele count (AC)")
p <- p + geom_point(alpha=0.5, size=4)
p <- p + geom_smooth(aes(weight=value), size=1, method="lm", formula = y ~ x)
p <- p + facet_grid(Novelty ~ ., scales="free")
print(p)
}
name = "Transition / transversion ratio by allele count"
# nSNPs > 0 => requires that we have some data here, otherwise Ti/Tv is zero from VE
minSNPsToInclude = 0
byACNoAll = subset(metricsBySites$byAC$TiTvVariantEvaluator, Novelty != "all" & AC > 0 & nSNPs > minSNPsToInclude)
p <- ggplot(data=byACNoAll, aes(x=AC, y=tiTvRatio, color=Novelty))
p <- p + scale_y_continuous("Transition / transversion ratio", limits=c(0,4))
p <- p + opts(title = name)
p <- p + geom_smooth(size=2)
p <- p + geom_point(aes(size=log10(nSNPs), weight=nSNPs), alpha=0.5)
p <- p + scale_x_continuous("Allele count (AC)")
p2 <- p + scale_x_log10("Allele count (AC)")
p2 <- p2 + opts(title = "")
distributePerSampleGraph(p2, p, c(1,1))
# SNPs to indels ratio by allele frequency
name = "SNPs to indels ratio by allele frequency"
metricsBySites$byAC$CountVariants$SNP.Indel.Ratio = metricsBySites$byAC$CountVariants$nSNPs / metricsBySites$byAC$CountVariants$nIndels
metricsBySites$byAC$CountVariants$SNP.Indel.Ratio[metricsBySites$byAC$CountVariants$nIndels == 0] = NaN
p <- ggplot(data=subset(metricsBySites$byAC$CountVariants, Novelty == "all" & nSNPs > 0), aes(x=AC, y=SNP.Indel.Ratio))
p <- p + opts(title = name)
p <- p + scale_y_continuous("SNP to indel ratio")
#p <- p + scale_y_log10()
p <- p + geom_point(alpha=0.5, aes(size=log10(nIndels)))
p <- p + geom_smooth(size=2, aes(weight=nIndels))
print(p)
name = "SNP counts by functional class"
molten = melt(subset(metricsBySites$bySite$CountVariants, Novelty != "all" & FunctionalClass != "all"), id.vars=c("Novelty", "FunctionalClass"), measure.vars=c(c("nSNPs")))
p <- ggplot(data=molten, aes(x=FunctionalClass, y=value, fill=Novelty), group=FunctionalClass)
p <- p + opts(title = name)
p <- p + scale_y_log10("No. of SNPs")
p <- p + geom_bar(position="dodge")
print(p)
}
addSection <- function(name) {
par("mar", c(5, 4, 4, 2))
frame()
title(name, cex=2)
}
# -------------------------------------------------------
# read functions
# -------------------------------------------------------
createMetricsBySamples <- function(VariantEvalRoot) {
bySampleEval <- expandVEReport(gsa.read.gatkreport(paste(VariantEvalRoot, ".bySample.eval", sep="")))
r = merge(bySampleEval$TiTvVariantEvaluator, bySampleEval$CountVariants)
r = merge(r, bySampleEval$CompOverlap)
if ( ! is.na(preQCFile) ) {
preQCMetrics <- read.table(preQCFile, header=T)
r = merge(r, preQCMetrics)
}
# order the samples by nSNPs -- it's the natural ordering.
x = subset(r, Novelty=="all")
r$Sample <- factor(r$Sample, levels=unique(x$Sample[order(x$nSNPs)]))
# add highlight info
r$highlight = r$Sample %in% highlightSamples
#r = merge(merge(preQCMetrics, byACEval$TiTvVariantEvaluator), byACEval$CountVariants)
return(subset(r, Sample != "all"))
}
# -------------------------------------------------------
# Per sample plots
# -------------------------------------------------------
perSamplePlots <- function(metricsBySamples) {
metricsBySamples$highlightTextSizes = c(1,2)[metricsBySamples$highlight+1]
sampleTextLabel <- geom_text(aes(label=Sample, size=highlightTextSizes))
sampleTextLabelScale <- scale_size("Highlighted samples", to=c(3,5), breaks=c(1,2), labels=c("regular", "highlighted"))
xAxis <- scale_x_discrete("Sample (ordered by nSNPs)", formatter=function(x) "")
measures = c("nSNPs", "tiTvRatio", "nSingletons", "nIndels", "deletionInsertionRatio")
name = "by sample"
for ( measure in measures ) {
molten = melt(metricsBySamples, id.vars=c("Novelty", "Sample", "highlightTextSizes"), measure.vars=c(measure))
# distribution
p1 <- ggplot(data=molten, aes(x=value, group=Novelty, fill=Novelty))
#p1 <- p1 + opts(title = paste(measure, name))
p1 <- p1 + geom_density(alpha=0.5)
p1 <- p1 + geom_rug(aes(y=NULL, color=Novelty, position="jitter"))
p1 <- p1 + scale_x_continuous(measure)
p2 <- ggplot(data=molten, aes(x=Sample, y=value, group=Novelty, color=Novelty), y=value)
p2 <- p2 + opts(title = paste(measure, name))
p2 <- p2 + geom_smooth(alpha=0.5, aes(group=Novelty))
p2 <- p2 + sampleTextLabel + sampleTextLabelScale
p2 <- p2 + facet_grid(Novelty ~ ., scales="free")
p2 <- p2 + xAxis
distributePerSampleGraph(p1, p2)
}
# known / novel ratio by sample
# TODO -- would ideally not conflate SNPs and Indels
d = subset(metricsBySamples, Novelty == "all" & CompRod == "dbsnp")
title <- opts(title = "Novelty rate by sample")
# distribution
p1 <- ggplot(data=d, aes(x=compRate))
p1 <- p1 + geom_density(alpha=0.5)
p1 <- p1 + geom_rug(aes(y=NULL, position="jitter"))
p1 <- p1 + scale_x_continuous("Percent of variants in dbSNP")
p2 <- ggplot(data=d, aes(x=Sample, y=compRate))
p2 <- p2 + title
p2 <- p2 + geom_smooth(alpha=0.5, aes(group=Novelty))
p2 <- p2 + sampleTextLabel + sampleTextLabelScale
p2 <- p2 + geom_rug(aes(x=NULL, position="jitter"))
p2 <- p2 + xAxis
p2 <- p2 + scale_y_continuous("Percent of variants in dbSNP")
distributePerSampleGraph(p1, p2)
for ( novelty in c("all", "known", "novel") ) {
# TODO -- how can I color it as before?
# TODO -- add marginal distributions?
molten = melt(subset(metricsBySamples, Novelty==novelty), id.vars=c("Sample", "highlightTextSizes"), measure.vars=measures)
p <- ggplot(data=molten, aes(x=Sample, y=value))
p <- p + opts(title = paste(name, ":", novelty))
# p <- p + scale_y_log10("Number of variants")
# p <- p + geom_point(alpha=0.5, size=4)
p <- p + sampleTextLabel + sampleTextLabelScale
p <- p + facet_grid(variable ~ ., scales="free")
# how do we remove the labels?
p <- p + xAxis
print(p)
}
}
# -------------------------------------------------------
# Actually invoke the above plotting functions
# -------------------------------------------------------
# load the data.
if ( onCMDLine || LOAD_DATA ) {
metricsBySites <- createMetricsBySites(VariantEvalRoot)
metricsBySamples <- createMetricsBySamples(VariantEvalRoot)
}
if ( ! is.na(outputPDF) ) {
pdf(outputPDF, height=8.5, width=11)
}
# Table of overall counts and quality
textplot(overallSummaryTable(metricsBySites, metricsBySamples), show.rownames=F)
title(paste("Summary metrics for project", ProjectName), cex=3)
# textplot(as.data.frame(sampleSummaryTable(metricsBySamples)), show.rownames=F)
# title(paste("Summary metrics per sample for project", ProjectName), cex=3)
summaryPlots(metricsBySites)
perSamplePlots(metricsBySamples)
if ( ! is.na(outputPDF) ) {
dev.off()
}


@ -1,45 +0,0 @@
#########################################################################
# this script generates a plot of sample depth of coverage over the MHC.
# It's rather specific to that use case, but is a good example of getting
# Loess curve generation to work given a X/Y dataset.
#
# 12/9/2009
# -Aaron
#########################################################################
# setup our output PNG
png(filename="bySampleJPName.png",width=1500,height=700,bg="white")
# input our data set
tbl <- read.csv("docOutJP.csv",header=TRUE) # doc_JP_SN_totalled_clean.csv
par(las=1) # make all labels horizontal
par(xpd=T, mar=par()$mar+c(0,0,-2,4)) # adjust the margins to accommodate our legend
# do the initial plot of one column of data
plot(tbl[,1],tbl[,5],xlim=c(18517983,41461957),ylim=c(0,7),type="p",cex=0.2,axes=F,ylab="Average Read Depth Of Coverage",xlab="MHC Location",col=rgb(0,0,0,0.1))
# add the custom x and y axis, so we can control their layout
axis(1,pos=0,at=seq(18517983,42061957,by=500000),col.axis="black")
axis(2,pos=18517983,at=seq(0,7,by=1),col="black")
# setup two color schemes, both with the same colors. One has an alpha of 0.08 for the background points,
# and the other is alpha=1 for the lines (which we want to be vibrant in the foreground)
myColors <- rainbow(30,alpha=0.08)
myColors2 <- rainbow(30)
# add a legend. There is a better way to do this besides hard-coding it, but it wouldn't render correctly on my machine
legend(x=41000000,y=5,c("NA18940","NA18942","NA18943","NA18944","NA18945","NA18947","NA18948","NA18949","NA18951","NA18952","NA18953","NA18956","NA18959","NA18960","NA18961","NA18964","NA18965","NA18967","NA18968","NA18969","NA18970","NA18971","NA18972","NA18973","NA18974","NA18975","NA18976","NA18980","NA18981","NA19005"),horiz=FALSE,lty=c(1),col=c(myColors2),cex=0.8)
# loop over the remaining data sets, adding first the points to the graph, then calculating the loess points, and finally combining the points into a line
# the loess smoothing parts were inspired by: http://research.stowers-institute.org/efg/R/Statistics/loess.htm
# adjust the span value to adjust the sensitivity of curve to the local fit.
for (i in 4:33) {
points(tbl[,1],tbl[,i],col=myColors[i-3],cex=0.2) # columns 4:33 hold the 30 samples, so shift by 3 to index the 30-color palettes
y.loess <- loess(y ~ x, span=0.05, data.frame(x=tbl[,1], y=tbl[,i]))
y.predict <- predict(y.loess, data.frame(x=tbl[,1]))
lines(tbl[,1],y.predict,col=myColors2[i-3])
}
# close our png
dev.off()


@ -1,323 +0,0 @@
# pOneSiteIsHom = p(top chromosome is ref AND bottom chromosome is ref) + p(top chromosome is var AND bottom chromosome is var)
# = (1-theta)^2 + theta^2
#
# pOneSiteIsHet = p(top chromosome is ref AND bottom chromosome is var) + p(top chromosome is var AND bottom chromosome is ref)
# = (1-theta)*theta + theta*(1-theta) = 2*theta*(1-theta)
pOneSiteIsHet <- function(theta) {
2 * theta * (1 - theta)
}
# p = 2 * theta * (1 - theta)
# and mean intra-het distance = 1/p, or d = 1/p
# or: p = 1/d
# or: 2 * theta * (1 - theta) = 1/d
# theta * (1 - theta) = 1/2d
# - theta^2 + theta - 1/2d = 0
#
# Using the quadratic equation:
# (- b + (b^2 - 4*a*c)^0.5) / 2a
# (-1 + (1 - 2/d)^0.5) / -2
meanIntraHetDistanceToTheta <- function(d) {
(-1 + (1 - 2/d)^0.5) / -2
}
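# Worked example (an illustrative addition, not part of the original script):
# the two functions above are inverses of one another. For theta = 1e-3 the
# mean intra-het distance is d = 1/pOneSiteIsHet(theta) ~ 500.5 bases, and
# converting back recovers theta.
example.theta = 1e-3
example.d = 1 / pOneSiteIsHet(example.theta)
stopifnot(abs(meanIntraHetDistanceToTheta(example.d) - example.theta) < 1e-9)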
# For consecutive diploid het sites x and y, P(distance(x,y) = k)
# = P(site y is the first het site downstream of x at distance = k | het site x exists at its location).
# That is, het site x already "exists", and we want to know what the probability that the NEXT het site (y) is k bases away.
pHetPairAtDistance <- function(k, theta) {
pOneSiteIsHetTheta = pOneSiteIsHet(theta)
dexp(k, pOneSiteIsHetTheta)
}
# Since the geometric/exponential distribution is "memory-free", can simply multiply the (independent) probabilities for the distances:
pHetPairsAtDistances <- function(dists, theta) {
prod(pHetPairAtDistance(dists, theta))
}
# Sample numDists distances from the intra-het distance distribution.
# [since the geometric/exponential distribution is "memory-free", can simply **independently** sample from the distribution]:
sampleIntraHetDistances <- function(numDists, theta) {
pOneSiteIsHetTheta = pOneSiteIsHet(theta)
ceiling(rexp(numDists, pOneSiteIsHetTheta)) # round up to get whole-number distances starting from 1
}
# For consecutive diploid het sites x and y, P(distance(x,y) <= k)
pHetPairLteDistance <- function(k, theta) {
# Although the real minimum distance starts with 1 (geometric distribution), the exponential distribution approximation starts with 0:
MIN_DISTANCE = 0
Vectorize(function(maxDist) integrate(function(dist) pHetPairAtDistance(dist, theta), lower=MIN_DISTANCE, upper=maxDist)$value)(k)
}
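# Equivalent closed form (added as a sanity check): the integral above is just
# the exponential CDF, so pexp with rate pOneSiteIsHet(theta) gives the same value.
stopifnot(isTRUE(all.equal(pHetPairLteDistance(75, 1e-3), pexp(75, pOneSiteIsHet(1e-3)), tolerance=1e-6)))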
# Probability (over locations of x on the read) that a paired-end read ALREADY covering site x [with 2 mates of length L reading a fragment of length F] will ALSO cover site y (k bases downstream of x):
#
# If read 1 in mate spans [s1, e1] and read 2 spans [s2, e2], where length(read 1) = e1 - s1 + 1 = length(read 2) = e2 - s2 + 1 = L, then i = s2 - e1 - 1 [BY DEFINITION of i].
# i == "insert size" is DEFINED AS: F - 2 * L
#
#
# FOR i >= 0:
#
# Assume that read is equally likely to cover x at any of the 2L positions, so uniform probability of 1/2L at each of them.
# P(read r covers (x,y) | r covers x, r = [L,i,L], distance(x,y) = k)
# = sum_p=1^p=L {1/2L * 1{k <= L-p OR L-p+i+1 <= k <= 2L+i-p}} + sum_p=1^p=L {1/2L * 1{k <= L-p}}
# = 1/2L * [2 * sum_p=1^p=L {1{k <= L-p}} + sum_p=1^p=L {1{L-p+i+1 <= k <= 2L+i-p}}]
# = 1/2L * [2 * max(0, L-k) + max(0, min(L, max(0, k-i)) - max(0, k-i-L))]
#
#
pPairedEndReadsOfSpecificFragmentCanCoverHetPairAtDistance <- function(L, F, k) {
if (min(F) < 1) {
stop("Cannot have fragments of size < 1")
}
# if F < L, then set the effective read length to be F:
L = pmin(L, F)
i = F - 2 * L
#print(paste("pPairedEndReadsOfSpecificFragmentCanCoverHetPairAtDistance(L= (", paste(L, collapse=", "), "), F= (", paste(F, collapse=", "), "), k= (", paste(k, collapse=", "), ")), i= (", paste(i, collapse=", "), ")", sep=""))
# If i < 0, then ASSUMING that overlapping region is identical, we can "pretend" to have 2 reads of length L and L+i, with no insert between them.
# Otherwise, leave i alone and L1 = L2 = L:
L1 = L
L2 = L + pmin(0, i) # set effective length of second read to L+i if i < 0
i = pmax(0, i) # set effective insert size to be >= 0
pWithinSameMate = pmax(0, L1 - k) + pmax(0, L2 - k)
#maxValueFor_p = pmin(L1, pmax(0, k - i))
#minValueFor_p_minusOne = pmax(0, k - i - L2)
maxValueFor_p = pmin(L1, L1 + L2 + i - k)
minValueFor_p_minusOne = pmax(0, L1 - k + i)
pInDifferentMates = pmax(0, maxValueFor_p - minValueFor_p_minusOne)
(pWithinSameMate + pInDifferentMates) / (L1 + L2)
}
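# Quick sanity checks (illustrative additions, with assumed L=76, F=452): a pair
# covering x trivially covers y at distance 0, and cannot reach y far beyond the
# fragment span.
stopifnot(pPairedEndReadsOfSpecificFragmentCanCoverHetPairAtDistance(76, 452, 0) == 1)
stopifnot(pPairedEndReadsOfSpecificFragmentCanCoverHetPairAtDistance(76, 452, 10000) == 0)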
# Probability of having a fragment of size fragmentSize, where the fragment sizes are normally distributed with mean Fm and standard deviation Fs:
pFragmentSize <- function(fragmentSize, Fm, Fs) {
dnorm(fragmentSize, mean = Fm, sd = Fs)
}
# Probability (over locations of x on the read, and fragment sizes) that there could exist a paired-end read [with 2 mates of length L covering a fragment] covers both sites x and y (at distance k):
# Integral_from_0^to_INFINITY { pFragmentSize(s, Fm, Fs) * pPairedEndReadsOfSpecificFragmentCanCoverHetPairAtDistance(L, s, k) ds }
pFragmentsReadsCanCoverHetPairAtDistance <- function(L, k, Fm, Fs) {
if (Fs != 0) {
pCoverageBySpecificFragment <- function(s) {pFragmentSize(s, Fm, Fs) * pPairedEndReadsOfSpecificFragmentCanCoverHetPairAtDistance(L, s, k)}
MAX_NUM_SD = 10
maxDistance = MAX_NUM_SD * Fs
minFragmentSize = max(1, Fm - maxDistance) # NOT meaningful to have fragment size < 1
maxFragmentSize = Fm + maxDistance
integrate(pCoverageBySpecificFragment, lower=minFragmentSize, upper=maxFragmentSize)$value
}
else {# All fragments are of size exactly Fm:
pPairedEndReadsOfSpecificFragmentCanCoverHetPairAtDistance(L, Fm, k)
}
}
# Probability (over locations of x on the read, fragment sizes, and read depths) that there exist at least nReadsToPhase paired-end reads covering both sites x and y (at distance k):
# = Sum_from_d=0^to_d=2*meanDepth { p(having d reads | poisson with meanDepth) * p(at least nReadsToPhase of them succeed in phasing x,y | d reads in total) }
# p(having d reads | poisson with meanDepth) = dpois(d, meanDepth)
# p(at least nReadsToPhase succeed | d reads in total) = pbinom(nReadsToPhase - 1, d, pFragmentsReadsCanCoverHetPairAtDistance(L, k, Fm, Fs), lower.tail = FALSE)
pDirectlyPhaseHetPairAtDistanceUsingDepth_SINGLE_k <- function(meanDepth, nReadsToPhase, L, k, Fm, Fs) {
THRESH = 10^-8
p = pFragmentsReadsCanCoverHetPairAtDistance(L, k, Fm, Fs)
# deal with numerical issues:
if (abs(1 - p) < THRESH) {
p = 1
}
else if (abs(p) < THRESH) {
p = 0
}
pAtLeastNreadsToPhaseGivenDepth <- function(d) pbinom(nReadsToPhase - 1, d, p, lower.tail = FALSE)
pAtLeastNreadsToPhaseAndDepth <- function(d) dpois(d, meanDepth) * pAtLeastNreadsToPhaseGivenDepth(d)
minDepth = 0
maxDepth = 2 * meanDepth
sum(apply(as.matrix(minDepth:maxDepth), 1, pAtLeastNreadsToPhaseAndDepth))
}
pDirectlyPhaseHetPairAtDistanceUsingDepth <- function(meanDepth, nReadsToPhase, L, k, Fm, Fs) {
Vectorize(function(dist) pDirectlyPhaseHetPairAtDistanceUsingDepth_SINGLE_k(meanDepth, nReadsToPhase, L, dist, Fm, Fs))(k)
}
pDirectlyPhaseHetPairAndDistanceUsingDepth <- function(meanDepth, nReadsToPhase, L, k, theta, Fm, Fs) {
pDirectlyPhaseHetPairAtDistanceUsingDepth(meanDepth, nReadsToPhase, L, k, Fm, Fs) * pHetPairAtDistance(k, theta)
}
# Probability (over locations of x on the read, fragment sizes, read depths, and het-het distances) that there exist at least nReadsToPhase paired-end reads covering both sites x and y (where the distance between x and y is as per the geometric/exponential distribution):
pDirectlyPhaseHetPair <- function(meanDepth, nReadsToPhase, L, theta, Fm, Fs) {
# Although the real minimum distance starts with 1 (geometric distribution), the exponential distribution approximation starts with 0:
MIN_DISTANCE = 0
MAX_DISTANCE = Inf
iRes = integrate(function(k) pDirectlyPhaseHetPairAndDistanceUsingDepth(meanDepth, nReadsToPhase, L, k, theta, Fm, Fs), lower=MIN_DISTANCE, upper=MAX_DISTANCE, subdivisions=1000, stop.on.error = FALSE)
if (iRes$message != "OK") {
print(paste("DISTANCE INTEGRATION WARNING: ", iRes$message, sep=""))
}
iRes$value
}
# Probability (over locations of sites on reads, fragment sizes, and read depths) that paired-end reads can TRANSITIVELY phase phaseIndex relative to phaseIndex - 1, given a window of length(windowDistances)+1 het sites at distances given by windowDistances (where an edge in the transitive path requires at least nReadsToPhase reads):
pPhaseHetPairAtDistanceUsingDepthAndWindow <- function(windowDistances, phaseIndex, meanDepth, nReadsToPhase, L, Fm, Fs, MIN_PATH_PROB = 10^-6) {
n = length(windowDistances) + 1 # the window size
if (phaseIndex < 2 || phaseIndex > n) {
stop("phaseIndex < 2 || phaseIndex > n")
}
#print(paste("windowDistances= (", paste(windowDistances, collapse=", "), ")", sep=""))
# A. Pre-compute the upper diagonal of square matrix of n CHOOSE 2 values of:
# pDirectlyPhaseHetPairAtDistanceUsingDepth(meanDepth, nReadsToPhase, L, dist(i,j), Fm, Fs)
#
# NOTE that the probabilities of phasing different pairs are NOT truly independent, but assume this for convenience...
#
pPhasePair = matrix(data = 0, nrow = n, ncol = n)
for (i in seq(from=1, to=n-1, by=1)) {
for (j in seq(from=i+1, to=n, by=1)) {
dist = distanceBetweenPair(i, j, windowDistances)
#print(paste("distanceBetweenPair(", i, ", ", j, ", windowDistances) = ", dist, sep=""))
pPhaseIandJ = pDirectlyPhaseHetPairAtDistanceUsingDepth(meanDepth, nReadsToPhase, L, dist, Fm, Fs)
pPhasePair[i, j] = pPhaseIandJ
pPhasePair[j, i] = pPhaseIandJ
}
}
#print(pPhasePair)
# B. We need to consider ALL possible paths from phaseIndex - 1 ---> phaseIndex
# There are: sum_i=0^to_n-2 {n-2 CHOOSE i * i!} such paths.
# Multiply the phasing probs along the path, and sum over all such paths:
#
startNode = phaseIndex - 1
endNode = phaseIndex
possibleIntermediateNodes = vector()
if (startNode > 1) possibleIntermediateNodes = c(possibleIntermediateNodes, seq(from=1, to=startNode-1, by=1))
if (endNode < n) possibleIntermediateNodes = c(possibleIntermediateNodes, seq(from=endNode+1, to=n, by=1))
#print(paste("possibleIntermediateNodes= {", paste(possibleIntermediateNodes, collapse=", "), "}", sep=""))
pWindowNotPhasing = 1
library(gtools)
for (subset in powerSet(length(possibleIntermediateNodes))) {
subset = possibleIntermediateNodes[subset]
#print((paste("subset = {", paste(subset, collapse=", "), "}", sep="")))
if (length(subset) == 0) {
paths = c()
}
else {
paths = permutations(length(subset), length(subset), v=subset)
}
# Add on the start and the end:
paths = cbind(startNode, paths, endNode)
for (i in 1:nrow(paths)) {
path = paths[i,]
pSpecificPathPhases = 1
for (j in seq(from=1, to=length(path)-1, by=1)) {
pSpecificPathPhases = pSpecificPathPhases * pPhasePair[path[j], path[j+1]]
if (pSpecificPathPhases < MIN_PATH_PROB) { # Do a "bounded" calculation [any path that is ALREADY of low probability can be discarded]:
#print(paste("pSpecificPathPhases= ", pSpecificPathPhases, sep=""))
pSpecificPathPhases = 0
break
}
}
pWindowNotPhasing = pWindowNotPhasing * (1 - pSpecificPathPhases)
#print((paste("path = (", paste(path, collapse=", "), "), pSpecificPathPhases= ", pSpecificPathPhases, sep="")))
}
}
1 - pWindowNotPhasing
}
# distance(i,j) = distance(i,i+1) + ... + distance(j-1,j), where distance(i,i+1) is given by windowDistances(i):
distanceBetweenPair <- function(i, j, windowDistances) {
if (i > j) {
tmp = i
i = j
j = tmp
}
if (i < 1 || j > length(windowDistances) + 1) {
stop(paste(i, " = i < 1 || ", j, " = j > length(windowDistances) + 1 = ", length(windowDistances) + 1, sep=""))
}
sum(windowDistances[i:(j-1)])
}
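# e.g. distanceBetweenPair(1, 3, c(100, 200)) == 300: the consecutive gaps between
# sites 1-2 and 2-3 are summed.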
# n = size of set for which power set is to be returned
powerSet <- function(n) {
library(sfsmisc)
subsets = list()
for (i in seq(from=0, to=(2^n)-1, by=1)) {
subsets[i+1] = list(which(digitsBase(i, base = 2, ndigits = n) == 1))
}
subsets
}
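# e.g. powerSet(2) enumerates the four index subsets in binary-counting order:
# integer(0), 2, 1, c(1, 2).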
pPhaseHetPairAndDistancesUsingDepthAndWindow <- function(windowDistances, phaseIndex, meanDepth, nReadsToPhase, L, Fm, Fs, theta) {
p = pPhaseHetPairAtDistanceUsingDepthAndWindow(windowDistances, phaseIndex, meanDepth, nReadsToPhase, L, Fm, Fs) * pHetPairsAtDistances(windowDistances, theta)
#print(paste(p, " = pPhaseHetPairAndDistancesUsingDepthAndWindow(windowDistances= (", paste(windowDistances, collapse=", "), "), phaseIndex= ", phaseIndex, ", meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", L= ", L, ", Fm= ", Fm, ", Fs= ", Fs, ", theta= ", theta, ") * pHetPairsAtDistances(windowDistances= ", paste(windowDistances, collapse=", "), ", theta= ", theta, ")", sep=""))
p
}
# Probability (over locations of sites on reads, fragment sizes, and read depths) that paired-end reads can TRANSITIVELY phase phaseIndex relative to phaseIndex - 1, given a window of n het sites at distances distributed as determined by theta (where an edge in the transitive path requires at least nReadsToPhase reads):
pDirectlyPhaseHetPairUsingWindow <- function(meanDepth, nReadsToPhase, L, theta, Fm, Fs, n, phaseIndex) {
if (n < 2) {
stop("n < 2")
}
ndim = n-1
integrandFunction <- function(windowDistances) {pPhaseHetPairAndDistancesUsingDepthAndWindow(windowDistances, phaseIndex, meanDepth, nReadsToPhase, L, Fm, Fs, theta)}
MIN_DISTANCE = 0
#
#MAX_DISTANCE = Inf
#
MAX_TAIL_PROB = 10^-6
MAX_DISTANCE = 7500 # Only 3e-07 [= 1 - pHetPairLteDistance(7500, 10^-3)] of the het-het pairs are at a distance > 7500
while (1 - pHetPairLteDistance(MAX_DISTANCE, theta) > MAX_TAIL_PROB) {
MAX_DISTANCE = MAX_DISTANCE * 2
}
lower = as.vector(matrix(data=MIN_DISTANCE, nrow=1, ncol=ndim))
upper = as.vector(matrix(data=MAX_DISTANCE, nrow=1, ncol=ndim))
N = 10^4 * ndim^2
high_dimensional_integrate(ndim, lower, upper, integrandFunction, N, DEBUG = TRUE, PRINT_EVERY = 10^2)
}
# Use the simplest version of the Monte Carlo method to integrate over a high-dimensional function:
high_dimensional_integrate <- function(ndim, lower, upper, integrandFunction, N = 10^4, DEBUG = FALSE, PRINT_EVERY = 10^3) {
rectangularVolume = prod(upper - lower)
sum = 0
for (i in 1:N) {
randVals = as.vector(matrix(data = NA, nrow=1, ncol=ndim))
for (j in 1:ndim) {
randVals[j] = runif(1, min=lower[j], max=upper[j])
}
#print(randVals)
evalFuncVal = integrandFunction(randVals)
sum = sum + evalFuncVal
if (DEBUG && (i-1) %% PRINT_EVERY == 0) {
estimate = rectangularVolume * (sum / i)
print(paste("high_dimensional_integrate: iteration ", i, ", estimate= ", estimate, sep=""))
}
}
rectangularVolume * (sum / N)
}
middleOfWindowIndex <- function(windowSize) {
floor(windowSize/2 + 1)
}
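# e.g. middleOfWindowIndex(5) == 3 and middleOfWindowIndex(6) == 4: the site to
# phase sits just right of center for even window sizes.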


@ -1,47 +0,0 @@
calcPhasingProbsForWindowDistances <- function(distances, MAX_WINDOW_SIZE, meanDepth, nReadsToPhase, L, Fm, Fs, FILE_NAME = NULL) {
WINDOW_SIZES = 2:MAX_WINDOW_SIZE
phaseProbsPositionWindow = matrix(data = NA, nrow=length(distances), ncol=length(WINDOW_SIZES))
for (i in 1:length(distances)) {
# Try to phase (i+1)-st position [relative to i] using varying window sizes:
for (j in 1:length(WINDOW_SIZES)) {
windowSize = WINDOW_SIZES[j]
remainingSize = windowSize - 2 # exclude i, i+1
numOnLeft = i - 1
numOnRight = (length(distances) + 1) - (i + 2) + 1
if (numOnLeft <= numOnRight) {
halfToUse = floor(remainingSize / 2) # skimp on the left [floor], and be generous with the right side
useOnLeft = min(halfToUse, numOnLeft)
useOnRight = min(remainingSize - useOnLeft, numOnRight)
}
else {
halfToUse = ceiling(remainingSize / 2) # be generous with the right side [ceiling]
useOnRight = min(halfToUse, numOnRight)
useOnLeft = min(remainingSize - useOnRight, numOnLeft)
}
startInd = i - useOnLeft # go left from position i
stopInd = i + 1 + useOnRight # go right from position i + 1
usePositionRange = seq(from=startInd, to=stopInd, by=1)
useDistancesRange = seq(from=startInd, to=stopInd-1, by=1) # since there are N-1 distances between N consecutive positions
phaseIndex = which(usePositionRange == i+1)
if (length(phaseIndex) != 1) stop("NO phaseIndex!")
windowDistances = distances[useDistancesRange]
print(paste("Try to phase position ", i+1, " [relative to ", i, "] using positions: (", paste(usePositionRange, collapse=", "), "), windowDistances= (", paste(windowDistances, collapse=", "), "), [phaseIndex= ", phaseIndex, ", i=", i, "]", sep=""))
p = pPhaseHetPairAtDistanceUsingDepthAndWindow(windowDistances, phaseIndex, meanDepth, nReadsToPhase, L, Fm, Fs)
print(paste("phase prob: ", p, sep=""))
phaseProbsPositionWindow[i, j] = p
}
if (!is.null(FILE_NAME)) {
save(list = ls(all=TRUE), file = paste(FILE_NAME, ".RData", sep=""))
}
}
list(phaseProbsPositionWindow=phaseProbsPositionWindow, WINDOW_SIZES=WINDOW_SIZES)
}


@ -1,54 +0,0 @@
#
#options(warn=2)
#options(error=recover)
#
HALF = high_dimensional_integrate(1, -200, 0, dnorm)
print(paste("Should be ~ HALF: ", HALF, sep=""))
k = 75
#theta = 10^-2
theta = 10^-3
p = pHetPairLteDistance(k, theta)
print(paste(p, " = pHetPairLteDistance(k= ", k, ", theta= ", theta, ")", sep=""))
L = 76
fragmentSize = 452
p = pPairedEndReadsOfSpecificFragmentCanCoverHetPairAtDistance(L, fragmentSize, k)
print(paste(p, " = pPairedEndReadsOfSpecificFragmentCanCoverHetPairAtDistance(L= ", L, ", fragmentSize= ", fragmentSize, ", k= ", k, ")", sep=""))
Fm = 392
Fs = 44
p = pFragmentSize(300, Fm, Fs)
print(paste(p, " = pFragmentSize(300, Fm= ", Fm, ", Fs= ", Fs, ")", sep=""))
p = pFragmentsReadsCanCoverHetPairAtDistance(L, k, Fm, Fs)
print(paste(p, " = pFragmentsReadsCanCoverHetPairAtDistance(L= ", L, ", k= ", k, ", Fm= ", Fm, ", Fs= ", Fs, ")", sep=""))
meanDepth = 65
nReadsToPhase = 1
p = pDirectlyPhaseHetPairAtDistanceUsingDepth(meanDepth, nReadsToPhase, L, k, Fm, Fs)
print(paste(p, " = pDirectlyPhaseHetPairAtDistanceUsingDepth(meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", L= ", L, ", k= ", k, ", Fm= ", Fm, ", Fs= ", Fs, ")", sep=""))
p = pDirectlyPhaseHetPair(meanDepth, nReadsToPhase, L, theta, Fm, Fs)
print(paste(p, " = pDirectlyPhaseHetPair(meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", L= ", L, ", theta= ", theta, ", Fm= ", Fm, ", Fs= ", Fs, ")", sep=""))
windowDistances = c(100, 100, 100, 100, 100)
phaseIndex = 2
p = pPhaseHetPairAtDistanceUsingDepthAndWindow(windowDistances, phaseIndex, meanDepth, nReadsToPhase, L, Fm, Fs)
print(paste(p, " = pPhaseHetPairAtDistanceUsingDepthAndWindow(windowDistances= (", paste(windowDistances, collapse=", "), "), phaseIndex= ", phaseIndex, ", meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", L= ", L, ", Fm= ", Fm, ", Fs= ", Fs, ")", sep=""))
traceback()
warnings()


@ -1,13 +0,0 @@
theta = 10^-3
params = paste("theta= ", theta, sep="")
MIN_DIST = 1
MAX_DIST = 10^4
BY_DIST = 10
DISTANCES = seq(from=MIN_DIST, to=MAX_DIST+BY_DIST, by=BY_DIST)
freqAtLteDist = pHetPairLteDistance(DISTANCES, theta)
scatter(DISTANCES, freqAtLteDist, "intraHetDistancesDistrib", xlab="Intra-het distance", ylab="Cumulative Frequency", log="x", main=params)
save(list = ls(all=TRUE), file = "intraHetDistancesDistrib.RData")


@ -1,39 +0,0 @@
theta = 10^-3
Fm_BASE = 392 - 2 * 101 # The mean insert size == 190
Fs = 44
nReadsToPhase = 1
params = paste("nReadsToPhase= ", nReadsToPhase, ", theta= ", theta, ", Fm_BASE= ", Fm_BASE, ", Fs= ", Fs, sep="")
MEAN_DEPTHS = 0:65
NUM_DEPTHS = length(MEAN_DEPTHS)
READ_LENGTHS = c(18, 36, 76, 101, 125, 150, 175, 200, 400, 800, 1000)
READ_LENGTHS = rev(READ_LENGTHS)
NUM_READ_LENGTHS = length(READ_LENGTHS)
depthsX = list()
depthsY = list()
depthsLeg = vector()
for (i in 1:NUM_READ_LENGTHS) {
pPhaseDepth = as.vector(matrix(data = -1, nrow = 1, ncol = NUM_DEPTHS))
Fm = Fm_BASE + 2 * READ_LENGTHS[i]
for (j in 1:NUM_DEPTHS) {
pPhaseDepth[j] = pDirectlyPhaseHetPair(MEAN_DEPTHS[j], nReadsToPhase, READ_LENGTHS[i], theta, Fm, Fs)
}
depthsX[i] = list(MEAN_DEPTHS)
depthsY[i] = list(pPhaseDepth)
depthsLeg[i] = paste("L= ", READ_LENGTHS[i], sep="")
}
scatter(depthsX, depthsY, "testDepths", xlab="Mean depth", ylab="Phaseability", main=params, leg=depthsLeg, legPos="topleft", width=14, height=7, type="b")
save(list = ls(all=TRUE), file = "testDepths.RData")


@ -1,47 +0,0 @@
theta = 10^-3
L = 101
meanDepth = 65
nReadsToPhase = 1
params = paste("meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", L= ", L, ", theta= ", theta, sep="")
MEAN_SIZES = seq(1,2000,20)
STD_SIZES = seq(0,200,5)
testFragments = matrix(nrow=length(MEAN_SIZES), ncol=length(STD_SIZES))
for (i in 1:length(MEAN_SIZES)) {
test_mean_fragment_size = MEAN_SIZES[i]
print(paste("test_mean_fragment_size: ", test_mean_fragment_size, sep=""))
for (j in 1:length(STD_SIZES)) {
test_std_fragment_size = STD_SIZES[j]
print(paste("test_std_fragment_size: ", test_std_fragment_size, sep=""))
testFragments[i,j] = pDirectlyPhaseHetPair(meanDepth, nReadsToPhase, L, theta, test_mean_fragment_size, test_std_fragment_size)
}
}
pdf('testFragments.pdf')
library(gplots)
heatmap.2(testFragments, ylab = "Mean fragment size", xlab = "Standard deviation fragment size", labRow = MEAN_SIZES, labCol = STD_SIZES, Rowv = NA, Colv = NA, dendrogram = "none", scale="none", revC = FALSE, density.info="none", trace="none", main=params)
library(scatterplot3d)
xMeans = as.vector(t(matrix(rep.int(MEAN_SIZES, length(STD_SIZES)), ncol = length(STD_SIZES))))
yStds = rep.int(STD_SIZES, length(MEAN_SIZES))
zPhaseRate = as.vector(t(testFragments))
scatterplot3d(xMeans, yStds, zPhaseRate, xlab = "Mean fragment size", ylab = "Standard deviation fragment size", zlab = "Phasing rate", main=params)
bestCombo = which.max(zPhaseRate)
print(paste("For ", params, ", BEST choice gives phaseability of ", zPhaseRate[bestCombo], " using mean fragment = ", xMeans[bestCombo], ", std. fragment = ", yStds[bestCombo], sep = ""))
dev.off()
save(list = ls(all=TRUE), file = "testFragments.RData")


@ -1,25 +0,0 @@
L = 101
Fm = 392
Fs = 44
meanDepth = 65
nReadsToPhase = 1
params = paste("meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", L= ", L, ", Fm= ", Fm, ", Fs= ", Fs, sep="")
MEAN_INTRA_HET_DISTANCES = seq(from=2, to=20002, by=50)
THETAS = meanIntraHetDistanceToTheta(MEAN_INTRA_HET_DISTANCES)
NUM_THETAS = length(THETAS)
pPhaseTheta = as.vector(matrix(data = -1, nrow = 1, ncol = NUM_THETAS))
for (i in 1:NUM_THETAS) {
pPhaseTheta[i] = pDirectlyPhaseHetPair(meanDepth, nReadsToPhase, L, THETAS[i], Fm, Fs)
}
scatter(MEAN_INTRA_HET_DISTANCES, pPhaseTheta, "testIntraHetDistances", xlab="Mean intra-het distance", ylab="Phaseability", main=params, type="b")
save(list = ls(all=TRUE), file = "testIntraHetDistances.RData")


@ -1,24 +0,0 @@
theta = 10^-3
Fm_BASE = 392 - 2 * 101 # The mean insert size == 190
Fs = 44
meanDepth = 65
nReadsToPhase = 1
params = paste("meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", theta= ", theta, ", Fm_BASE= ", Fm_BASE, ", Fs= ", Fs, sep="")
READ_LENGTHS = seq(from=30, to=1000, by=10)
NUM_READ_LENGTHS = length(READ_LENGTHS)
pPhaseReadLength = as.vector(matrix(data = -1, nrow = 1, ncol = NUM_READ_LENGTHS))
for (i in 1:NUM_READ_LENGTHS) {
Fm = Fm_BASE + 2 * READ_LENGTHS[i]
pPhaseReadLength[i] = pDirectlyPhaseHetPair(meanDepth, nReadsToPhase, READ_LENGTHS[i], theta, Fm, Fs)
}
scatter(READ_LENGTHS, pPhaseReadLength, "testReadLengths", xlab="Read length", ylab="Phaseability", main=params, type="b")
save(list = ls(all=TRUE), file = "testReadLengths.RData")


@ -1,19 +0,0 @@
L = 101
Fm = 392
Fs = 44
meanDepth = 65
nReadsToPhase = 1
params = paste("meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", L= ", L, ", Fm= ", Fm, ", Fs= ", Fs, sep="")
DISTANCES = 0:1000
pPhaseHetPairAtDistWithRead = pDirectlyPhaseHetPairAtDistanceUsingDepth(meanDepth, nReadsToPhase, L, DISTANCES, Fm, Fs)
scatter(DISTANCES, pPhaseHetPairAtDistWithRead, "testSpecificDistances", xlab="Intra-het distance", ylab="Phaseability", main=params)
save(list = ls(all=TRUE), file = "testSpecificDistances.RData")


@ -1,8 +0,0 @@
L = 76
k = 75
params = paste("L= ", L, ", k= ", k, sep="")
FRAGMENT_SIZES = 0:100 + 2 * L
pCoverHetPairWithRead = pPairedEndReadsOfSpecificFragmentCanCoverHetPairAtDistance(L, FRAGMENT_SIZES, k)
scatter(FRAGMENT_SIZES, pCoverHetPairWithRead, "testSpecificFragments", xlab="Fragment size", ylab="Probability of covering het pair", main=params)


@ -1,32 +0,0 @@
theta = 10^-3
Fm = 392
Fs = 44
L = 101
meanDepth = 65
nReadsToPhase = 1
params = paste("meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", L= ", L, ", theta= ", theta, ", Fm= ", Fm, ", Fs= ", Fs, sep="")
#
#options(warn=2)
#options(error=recover)
#
MAX_WINDOW_SIZE = 10
WINDOW_SIZES = 2:MAX_WINDOW_SIZE
NUM_WINDOW_SIZES = length(WINDOW_SIZES)
pPhaseWindow = as.vector(matrix(data = -1, nrow = 1, ncol = NUM_WINDOW_SIZES))
for (i in 1:NUM_WINDOW_SIZES) {
n = WINDOW_SIZES[i]
phaseIndex = middleOfWindowIndex(n)
pPhaseWindow[i] = pDirectlyPhaseHetPairUsingWindow(meanDepth, nReadsToPhase, L, theta, Fm, Fs, n, phaseIndex)
save(list = ls(all=TRUE), file = "testWindows.RData")
}
scatter(WINDOW_SIZES, pPhaseWindow, "testWindows", xlab="Window size", ylab="Phaseability", main=params, type="b")


@ -1,28 +0,0 @@
L = 101
Fm = 392
Fs = 44
meanDepth = 65
nReadsToPhase = 1
theta = 10^-3
params = paste("meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", theta= ", theta, ", L= ", L, ", Fm= ", Fm, ", Fs= ", Fs, sep="")
MAX_NUM_DISTS = 10^4
distances = sampleIntraHetDistances(MAX_NUM_DISTS, theta)
print(paste("Using ", MAX_NUM_DISTS, " THEORETICAL distances...", sep=""))
MAX_WINDOW_SIZE = 10
FILE_NAME = "theoretical_window"
phaseWindowResult = calcPhasingProbsForWindowDistances(distances, MAX_WINDOW_SIZE, meanDepth, nReadsToPhase, L, Fm, Fs, FILE_NAME)
phaseProbsPositionWindow = phaseWindowResult$phaseProbsPositionWindow
WINDOW_SIZES = phaseWindowResult$WINDOW_SIZES
phaseProbsWindow = colMeans(phaseProbsPositionWindow)
scatter(WINDOW_SIZES, phaseProbsWindow, FILE_NAME, xlab="Window size", ylab="Mean theoretical phasing rate on empirical distances", main=params, type="b")


@ -1,30 +0,0 @@
L = 101
Fm = 392
Fs = 44
meanDepth = 65
nReadsToPhase = 1
params = paste("meanDepth= ", meanDepth, ", nReadsToPhase= ", nReadsToPhase, ", L= ", L, ", Fm= ", Fm, ", Fs= ", Fs, sep="")
distances = scan("~fromer/storage/phase.NA12878/COMPLETE_LIST.het_distances.txt", what=list(dist=0))
distances = distances$dist
MAX_NUM_DISTS = 10^4
NUM_DISTS_TO_USE = min(MAX_NUM_DISTS, length(distances))
distances = distances[1:NUM_DISTS_TO_USE]
print(paste("Using ", NUM_DISTS_TO_USE, " EMPIRICAL distances...", sep=""))
MAX_WINDOW_SIZE = 10
FILE_NAME = "theoretical_window_on_empirical"
phaseWindowResult = calcPhasingProbsForWindowDistances(distances, MAX_WINDOW_SIZE, meanDepth, nReadsToPhase, L, Fm, Fs, FILE_NAME)
phaseProbsPositionWindow = phaseWindowResult$phaseProbsPositionWindow
WINDOW_SIZES = phaseWindowResult$WINDOW_SIZES
phaseProbsWindow = colMeans(phaseProbsPositionWindow)
scatter(WINDOW_SIZES, phaseProbsWindow, FILE_NAME, xlab="Window size", ylab="Mean theoretical phasing rate on empirical distances", main=params, type="b")


@ -1,181 +0,0 @@
NUM_chr1_HET_SITES = as.integer(system("grep -c 'chr1:' ~fromer/storage/phase.NA12878/COMPLETE_LIST.het_sites.interval_list", intern=TRUE))
NUM_chr1_PHASEABLE_HET_SITES = NUM_chr1_HET_SITES - 1 # since can't phase the first het site
#
#USE_EMPIRICAL_WINDOWS = c(10, 2)
#
USE_EMPIRICAL_WINDOWS = c(2)
TWO_COLORS = c("red", "darkgreen")
######################################################################
# Phasing as a function of SPECIFIC intra-het distances:
######################################################################
load("testSpecificDistances.RData")
MAX_DISTANCE = 10^3
PQ_PHASING_THRESH = 10.0
distances = list()
phaseRateDistances = list()
distancesLeg = vector()
for (nextIndex in 1:length(USE_EMPIRICAL_WINDOWS)) {
n = USE_EMPIRICAL_WINDOWS[nextIndex]
n_locDistancePQReadsWindow <- scan(paste("~fromer/storage/phase.NA12878/phase_all_chr.n_", n, ".NA12878", ".locus_distance_PQ_numReads_windowSize.txt", sep=""), what=list(loci="", distance=0, PQ=0, reads=0, window=0))
n_distance <- n_locDistancePQReadsWindow$distance
n_PQ <- n_locDistancePQReadsWindow$PQ
distanceVector = sort(unique(n_distance))
distanceVector = distanceVector[which(distanceVector <= MAX_DISTANCE)]
numDists = length(distanceVector)
phasedFractionVector = as.vector(matrix(data=-1, nrow=1, ncol=numDists))
print(paste("numDists= ", numDists, sep=""))
print(paste(distanceVector, collapse=", "))
for (i in 1:numDists) {
d = distanceVector[i]
print(paste("d= ", d, sep=""))
dInds = which(n_distance == d)
phasedFractionVector[i] = length(which(n_PQ[dInds] >= PQ_PHASING_THRESH)) / length(dInds)
}
distances[nextIndex] = list(distanceVector)
phaseRateDistances[nextIndex] = list(phasedFractionVector)
distancesLeg[nextIndex] = paste("HiSeq (window = ", n, ")", sep="")
}
nextIndex = nextIndex+1
distances[nextIndex] = list(DISTANCES)
phaseRateDistances[nextIndex] = list(pPhaseHetPairAtDistWithRead)
distancesLeg[nextIndex] = "Theoretical (window = 2)" # params
scatter(distances, phaseRateDistances, "specific_distances.theoretical_empirical", xlab="Intra-het distance", ylab="Phaseability", leg=distancesLeg, legPos="topright", width=14, height=7, type="b", col=TWO_COLORS)
######################################################################
# Phasing as a function of depth:
######################################################################
load("testDepths.RData")
depths = list()
phaseRateDepths = list()
depthsLeg = vector()
for (nextIndex in 1:length(USE_EMPIRICAL_WINDOWS)) {
n = USE_EMPIRICAL_WINDOWS[nextIndex]
RGdocPhasedConsistentSwitch = scan(paste("~fromer/storage/downsampled_phasing.NA12878.HiSeq/RG.DoC_phased_consistent_switch.chr1.n_", n, ".txt", sep=""), what=list(RGdoc=0, phased=0, consistentPhased=0, switch=0.0))
depths[nextIndex] = list(RGdocPhasedConsistentSwitch$RGdoc)
phaseRateDepths[nextIndex] = list(RGdocPhasedConsistentSwitch$phased / NUM_chr1_PHASEABLE_HET_SITES)
depthsLeg[nextIndex] = paste("Down-sampled HiSeq (window = ", n, ")", sep="")
}
nextIndex = nextIndex+1
useLength = which(READ_LENGTHS == 101)
depths[nextIndex] = depthsX[useLength]
phaseRateDepths[nextIndex] = depthsY[useLength]
depthsLeg[nextIndex] = "Theoretical (window = 2)" # params
scatter(depths, phaseRateDepths, "depths.theoretical_empirical", xlab="Mean depth", ylab="Phaseability", leg=depthsLeg, legPos="topleft", width=14, height=7, type="b", col=TWO_COLORS)
######################################################################
# Distribution of intra-het distances:
######################################################################
load("intraHetDistancesDistrib.RData")
empiricalIntraHetDistances = read.table("~fromer/storage/phase.NA12878/COMPLETE_LIST.het_distances.txt")$V1
empiricalIntraHetDistances[which(empiricalIntraHetDistances >= MAX_DIST)] = MAX_DIST
empiricalIntraHetDistancesHist = hist(empiricalIntraHetDistances, breaks=DISTANCES, plot=FALSE)
empiricalIntraHetDistancesCumulativeFrequencies = cumsum(empiricalIntraHetDistancesHist$counts) / length(empiricalIntraHetDistances)
scatter(list(empiricalIntraHetDistancesHist$mids, DISTANCES), list(empiricalIntraHetDistancesCumulativeFrequencies, freqAtLteDist), "intraHetDistancesDistrib.theoretical_empirical", xlab="Intra-het distance", ylab="Cumulative Frequency", log="x", leg=c("NA12878 HiSeq", "Theoretical"), legPos="topleft", type="b", col=TWO_COLORS)
######################################################################
# Phasing as a function of MEAN intra-het distance:
######################################################################
load("testIntraHetDistances.RData")
hetDistances = list()
phaseRateHetDistances = list()
hetDistancesLeg = vector()
for (nextIndex in 1:length(USE_EMPIRICAL_WINDOWS)) {
n = USE_EMPIRICAL_WINDOWS[nextIndex]
meanHetDistNumSitesPhasedConsistentSwitch = scan(paste("~fromer/storage/remove_het_sites.NA12878.HiSeq/meanHetDist_numSites_phased_consistent_switch.chr1.n_", n, ".txt", sep=""), what=list(meanHetDist=0.0, numSites=0, phased=0, consistentPhased=0, switch=0.0))
hetDistances[nextIndex] = list(meanHetDistNumSitesPhasedConsistentSwitch$meanHetDist)
phaseRateHetDistances[nextIndex] = list(meanHetDistNumSitesPhasedConsistentSwitch$phased)
hetDistancesLeg[nextIndex] = paste("Removed hets from HiSeq (window = ", n, ")", sep="")
}
nextIndex = nextIndex+1
hetDistances[nextIndex] = list(MEAN_INTRA_HET_DISTANCES)
phaseRateHetDistances[nextIndex] = list(pPhaseTheta)
hetDistancesLeg[nextIndex] = "Theoretical (window = 2)" # params
scatter(hetDistances, phaseRateHetDistances, "intraHetDistances.theoretical_empirical", xlab="Mean intra-het distance", ylab="Phaseability", leg=hetDistancesLeg, legPos="topright", type="b", col=TWO_COLORS)
scatter(hetDistances, phaseRateHetDistances, "intraHetDistances.log.theoretical_empirical", xlab="Mean intra-het distance", ylab="Phaseability", leg=hetDistancesLeg, legPos="topright", type="b", col=TWO_COLORS, log="y", xlim=c(1, 20000))
######################################################################
# Phasing as a function of window size:
######################################################################
load("theoretical_window_on_empirical.RData")
windows = list()
phaseRateWindows = list()
windowsLeg = vector()
NUM_HET_SITES = as.integer(system("cat ~fromer/storage/phase.NA12878/COMPLETE_LIST.het_sites.interval_list | wc -l", intern=TRUE))
NUM_CHR = as.integer(system("cat ~fromer/storage/phase.NA12878/COMPLETE_LIST.het_sites.interval_list | cut -f1 -d':' | sort | uniq | wc -l", intern=TRUE))
NUM_PHASEABLE_HET_SITES = NUM_HET_SITES - NUM_CHR # since can't phase the first het site of each chromosome
windowPhasedConsistent = scan(paste("~fromer/storage/phase.NA12878/window_phased_consistent.txt", sep=""), what=list(window=0, phased=0, consistentPhased=0))
windows[1] = list(windowPhasedConsistent$window)
phaseRateWindows[1] = list(windowPhasedConsistent$phased / NUM_PHASEABLE_HET_SITES)
windowsLeg[1] = paste("HiSeq", sep="")
windows[2] = list(WINDOW_SIZES)
phaseRateWindows[2] = list(colMeans(na.omit(phaseProbsPositionWindow)))
windowsLeg[2] = "Theoretical" # params
scatter(windows, phaseRateWindows, "windows.theoretical_empirical", xlab="Window size", ylab="Phaseability", leg=windowsLeg, legPos="topleft", width=14, height=7, type="b", col=TWO_COLORS)
# Use numerical integration over theoretical distances distribution:
load("testWindows.RData")
doneInds = which(pPhaseWindow != -1)
windows[2] = list(WINDOW_SIZES[doneInds])
phaseRateWindows[2] = list(pPhaseWindow[doneInds])
windowsLeg[2] = "Theoretical" # params
scatter(windows, phaseRateWindows, "theoretical_distances.windows.theoretical_empirical", xlab="Window size", ylab="Phaseability", leg=windowsLeg, legPos="topleft", width=14, height=7, type="b", col=TWO_COLORS)
# Use theoretical sampling of distances:
load("theoretical_window.RData")
windows[2] = list(WINDOW_SIZES)
phaseRateWindows[2] = list(colMeans(na.omit(phaseProbsPositionWindow)))
windowsLeg[2] = "Theoretical" # params
scatter(windows, phaseRateWindows, "sampled_theoretical_distances.windows.theoretical_empirical", xlab="Window size", ylab="Phaseability", leg=windowsLeg, legPos="topleft", width=14, height=7, type="b", col=TWO_COLORS)


@ -1,190 +0,0 @@
#!/bin/env Rscript
args <- commandArgs(TRUE)
verbose = TRUE
input = args[1]
annotationName = args[2]
minBinCutoff = as.numeric(args[3])
medianNumVariants = args[4]
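# Example invocation (a sketch; the file and annotation names are hypothetical,
# and this script's own file name is not shown here):
#   Rscript thisScript.R annotations.tbl QD 50 true
# i.e. input table, annotation name, minimum bin size, and whether to place the
# percentile lines by cumulative variant counts ("true") rather than by weighted means.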
c <- read.table(input, header=T)
all = c[c$numVariants>minBinCutoff & c$category=="all",]
novel = c[c$numVariants>minBinCutoff & c$category=="novel",]
dbsnp = c[c$numVariants>minBinCutoff & c$category=="dbsnp",]
truth = c[c$numVariants>minBinCutoff & c$category=="truth",]
#
# Calculate min, max, medians
#
d = c[c$numVariants>minBinCutoff,]
ymin = min(d$titv)
ymax = max(d$titv)
xmin = min(d$value)
xmax = max(d$value)
m = weighted.mean(all$value,all$numVariants/sum(all$numVariants))
ma = all[all$value > m,]
mb = all[all$value < m,]
m75 = weighted.mean(ma$value,ma$numVariants/sum(ma$numVariants))
m25 = weighted.mean(mb$value,mb$numVariants/sum(mb$numVariants))
if(medianNumVariants == "true") {
vc = cumsum( all$numVariants/sum(all$numVariants) )
m10 = all$value[ max(which(vc<=0.10)) ]
m25 = all$value[ max(which(vc<=0.25)) ]
m = all$value[ max(which(vc<=0.5)) ]
m75 = all$value[ min(which(vc>=0.75)) ]
m90 = all$value[ min(which(vc>=0.90)) ]
}
#
# Plot TiTv ratio as a function of the annotation
#
outfile = paste(input, ".TiTv.pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
plot(all$value,all$titv,xlab=annotationName,ylab="Ti/Tv Ratio",pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14);
axis(1,axTicks(1), format(axTicks(1), scientific=F))
abline(v=m,lty=2,col="red")
abline(v=m75,lty=3)
abline(v=m25,lty=3)
text(m, ymin, "50", col="red", cex=0.6);
text(m75, ymin, "75", col="black", cex=0.6);
text(m25, ymin, "25", col="black", cex=0.6);
if(medianNumVariants == "true") {
abline(v=m90,lty=3)
abline(v=m10,lty=3)
text(m10, ymin, "10", col="black", cex=0.6);
text(m90, ymin, "90", col="black", cex=0.6);
}
points(novel$value,novel$titv,col="green",pch=20)
points(dbsnp$value,dbsnp$titv,col="blue",pch=20)
if( sum(all$truePositive==0) != length(all$truePositive) ) {
points(truth$value,truth$titv,col="magenta",pch=20)
legend("topleft", c("all","novel","dbsnp","truth"),col=c("black","green","blue","magenta"),pch=c(20,20,20,20))
} else {
legend("topleft", c("all","novel","dbsnp"),col=c("black","green","blue"),pch=c(20,20,20))
}
dev.off()
#
# Plot TiTv ratio as a function of the annotation, log scale on the x-axis
#
outfile = paste(input, ".TiTv_log.pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
plot(all$value,all$titv,xlab=annotationName,log="x",ylab="Ti/Tv Ratio",pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14);
axis(1,axTicks(1), format(axTicks(1), scientific=F))
abline(v=m,lty=2,col="red")
abline(v=m75,lty=3)
abline(v=m25,lty=3)
text(m, ymin, "50", col="red", cex=0.6);
text(m75, ymin, "75", col="black", cex=0.6);
text(m25, ymin, "25", col="black", cex=0.6);
if(medianNumVariants == "true") {
abline(v=m90,lty=3)
abline(v=m10,lty=3)
text(m10, ymin, "10", col="black", cex=0.6);
text(m90, ymin, "90", col="black", cex=0.6);
}
points(novel$value,novel$titv,col="green",pch=20)
points(dbsnp$value,dbsnp$titv,col="blue",pch=20)
if( sum(all$truePositive==0) != length(all$truePositive) ) {
points(truth$value,truth$titv,col="magenta",pch=20)
legend("topleft", c("all","novel","dbsnp","truth"),col=c("black","green","blue","magenta"),pch=c(20,20,20,20))
} else {
legend("topleft", c("all","novel","dbsnp"),col=c("black","green","blue"),pch=c(20,20,20))
}
dev.off()
#
# Plot dbsnp and true positive rate as a function of the annotation
#
ymin = min(all$dbsnp)
ymax = max(all$dbsnp)
outfile = paste(input, ".truthRate.pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
yLabel = "DBsnp Rate"
if( sum(all$truePositive==0) != length(all$truePositive) ) {
t = all[all$truePositive>0,]
yLabel = "DBsnp/True Positive Rate"
ymin = min(min(all$dbsnp),min(t$truePositive))
ymax = max(max(all$dbsnp),max(t$truePositive))
}
plot(all$value,all$dbsnp,xlab=annotationName,ylab=yLabel,pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14);
axis(1,axTicks(1), format(axTicks(1), scientific=F))
abline(v=m,lty=2,col="red")
abline(v=m75,lty=3)
abline(v=m25,lty=3)
text(m, ymin, "50", col="red", cex=0.6);
text(m75, ymin, "75", col="black", cex=0.6);
text(m25, ymin, "25", col="black", cex=0.6);
if(medianNumVariants == "true") {
abline(v=m90,lty=3)
abline(v=m10,lty=3)
text(m10, ymin, "10", col="black", cex=0.6);
text(m90, ymin, "90", col="black", cex=0.6);
}
if( sum(all$truePositive==0) != length(all$truePositive) ) {
points(t$value,t$truePositive,col="magenta",pch=20);
legend("topleft", c("dbsnp","truth"),col=c("black","magenta"),pch=c(20,20))
}
dev.off()
#
# Plot dbsnp and true positive rate as a function of the annotation, log scale on the x-axis
#
outfile = paste(input, ".truthRate_log.pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
yLabel = "DBsnp Rate"
if( sum(all$truePositive==0) != length(all$truePositive) ) {
yLabel = "DBsnp/Truth Rate"
}
plot(all$value,all$dbsnp,xlab=annotationName,log="x",ylab=yLabel,ylim=c(ymin,ymax),pch=20,xaxt="n",ps=14);
axis(1,axTicks(1), format(axTicks(1), scientific=F))
abline(v=m,lty=2,col="red")
abline(v=m75,lty=3)
abline(v=m25,lty=3)
text(m, ymin, "50", col="red", cex=0.6);
text(m75, ymin, "75", col="black", cex=0.6);
text(m25, ymin, "25", col="black", cex=0.6);
if(medianNumVariants == "true") {
abline(v=m90,lty=3)
abline(v=m10,lty=3)
text(m10, ymin, "10", col="black", cex=0.6);
text(m90, ymin, "90", col="black", cex=0.6);
}
if( sum(all$truePositive==0) != length(all$truePositive) ) {
points(t$value,t$truePositive,col="magenta",pch=20);
legend("topleft", c("dbsnp","truth"),col=c("black","magenta"),pch=c(20,20))
}
dev.off()
#
# Plot histogram of the annotation's value
#
outfile = paste(input, ".Histogram.pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
plot(all$value,all$numVariants,xlab=annotationName,ylab="Num variants in bin",type="h",xaxt="n",ps=14,lwd=4);
axis(1,axTicks(1), format(axTicks(1), scientific=F))
dev.off()
#
# Plot histogram of the annotation's value, log scale on x-axis
#
outfile = paste(input, ".Histogram_log.pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
plot(all$value,all$numVariants,xlab=annotationName,log="x",ylab="Num variants in bin",type="h",xaxt="n",ps=14,lwd=4);
axis(1,axTicks(1), format(axTicks(1), scientific=F))
dev.off()


@ -1,19 +0,0 @@
#!/bin/env Rscript
args <- commandArgs(TRUE)
verbose = TRUE
input = args[1]
annotationName = args[2]
data = read.table(input,sep=",",head=T)
outfile = paste(input, ".ClusterReport.pdf", sep="")
pdf(outfile, height=7, width=8)
maxP = max(data$knownDist, data$novelDist)
plot(data$annotationValue, data$knownDist, ylim=c(0,maxP),type="b",col="orange",lwd=2,xlab=annotationName,ylab="fraction of SNPs")
points(data$annotationValue, data$novelDist, type="b",col="blue",lwd=2)
legend('topright', c('knowns','novels'),lwd=2,col=c("orange","blue"))
dev.off()


@ -1,67 +0,0 @@
args = commandArgs(TRUE);
RUNME = F # set to T to actually run generateOneReport(d) at the bottom of this script
onCMDLine = ! is.na(args[1])
DATA_FILE = args[1]
DESCRIPTION = args[2]
#OUTPUT_PDF = paste(DATA_FILE, ".pdf", sep="")
MAX_POINTS = 100000
if ( onCMDLine ) {
print(paste("Reading data from", DATA_FILE))
d = read.table(DATA_FILE, header=T)
}
#if ( onCMDLine ) pdf(OUTPUT_PDF)
vec.margin <- function(x) {
l = length(x)
d = x[-1] - x[1:(l-1)]
c(x[1], d[1:(l-1)])
}
everyNth <- function(x, n) {
l = dim(x)[1]
m = ceiling(l / n)
print(m)
keep = 1:l %% m == 0
x[keep,]
}
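# Worked examples (toy inputs, for illustration only):
#   vec.margin(c(1, 3, 6)) returns c(1, 2, 3), i.e. the per-interval increments.
#   everyNth(x, n) keeps roughly n evenly spaced rows of the data frame x, so
#   everyNth(d, MAX_POINTS) below caps the number of plotted points.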
l = length(d$units.processed)
d$units.processed.margin = vec.margin(d$units.processed)
#prev = 0
#for ( i in 1:l ) {
# cur = d$units.processed[i]
# d[i,]$units.processed.margin = cur - prev
# prev = cur
#}
generateOneReport <- function(d) {
qs = quantile(d$processing.speed, probs = c(0.01, 0.5, 0.99))
# unit processing time
if ( onCMDLine ) png(paste(DATA_FILE, ".speed.png", sep=""), width=1080, height=1080)
dpoints = everyNth(d, MAX_POINTS)
plot(dpoints$elapsed.time, dpoints$processing.speed, main=DESCRIPTION, xlab="Elapsed time (sec)", ylab="Processing speed (seconds per 1M units)", ylim=c(qs[1], qs[3]), type="b", col="cornflowerblue", lwd=2)
abline(h=qs[2], lty=2)
if ( onCMDLine ) dev.off()
# instantaneous processing speed
if ( onCMDLine ) png(paste(DATA_FILE, ".marginal.png", sep=""), width=1080, height=1080)
running_median_window = 101
rm = runmed(d$units.processed.margin, running_median_window)
POINT_COL = "#0000AA99"
plot(dpoints$elapsed.time, dpoints$units.processed.margin, main=DESCRIPTION, xlab="Elapsed time (sec)", ylab="Units processed in last timing interval", type="p", cex = 0.75, col=POINT_COL)
lines(d$elapsed.time, rm, lwd=3, col="red")
legend("topleft", c("Observations", "101-elt running median"), fill=c(POINT_COL, "red"))
if ( onCMDLine ) dev.off()
}
if ( RUNME ) {
generateOneReport(d)
}


@ -1,36 +0,0 @@
#!/bin/env Rscript
args <- commandArgs(TRUE)
verbose = TRUE
input = args[1]
targetTITV = as.numeric(args[2])
# -----------------------------------------------------------------------------------------------
# optimization curve
# -----------------------------------------------------------------------------------------------
data = read.table(input,sep=",",head=T)
maxVars = max(data$numKnown, data$numNovel)
maxTITV = max(data$knownTITV[is.finite(data$knownTITV) & data$numKnown>2000], data$novelTITV[is.finite(data$novelTITV) & data$numNovel > 2000], targetTITV)
maxTITV = min(maxTITV, targetTITV + 1)
minTITV = min(data$knownTITV[length(data$knownTITV)], data$novelTITV[length(data$novelTITV)], targetTITV)
maxPCut = max(data$pCut[data$numKnown>0 | data$numNovel>0])
outfile = paste(input, ".optimizationCurve.pdf", sep="")
pdf(outfile, height=7, width=8)
par(mar=c(4,4,1,4),cex=1.3)
plot(data$pCut, data$knownTITV, axes=F,xlab="Keep variants with QUAL >= X",ylab="",ylim=c(minTITV,maxTITV),xlim=c(0,maxPCut),col="Blue",pch=20)
points(data$pCut, data$novelTITV, col="DarkBlue", pch=20)
abline(h=targetTITV,lty=3,col="Blue")
axis(side=2,col="DarkBlue")
axis(side=1)
mtext("Ti/Tv Ratio", side=2, line=2, col="blue",cex=1.4)
legend("left", c("Known Ti/Tv","Novel Ti/Tv"), col=c("Blue","DarkBlue"), pch=c(20,20),cex=0.7)
par(new=T)
plot(data$pCut, data$numKnown, axes=F,xlab="",ylab="",ylim=c(0,maxVars),xlim=c(0,maxPCut),col="Green",pch=20)
points(data$pCut, data$numNovel,col="DarkGreen",pch=20)
axis(side=4,col="DarkGreen")
mtext("Number of Variants", side=4, line=2, col="DarkGreen",cex=1.4)
legend("topright", c("Known","Novel"), col=c("Green","DarkGreen"), pch=c(20,20),cex=0.7)
dev.off()


@ -1,87 +0,0 @@
#!/bin/env Rscript
args <- commandArgs(TRUE)
verbose = TRUE
tranchesFile = args[1]
targetTITV = as.numeric(args[2])
targetSensitivity = as.numeric(args[3])
suppressLegend = ! is.na(args[4])
# -----------------------------------------------------------------------------------------------
# Useful general routines
# -----------------------------------------------------------------------------------------------
MIN_FP_RATE = 0.001 # 1 / 1000 is min error rate
titvFPEst <- function(titvExpected, titvObserved) {
max(min(1 - (titvObserved - 0.5) / (titvExpected - 0.5), 1), MIN_FP_RATE)
}
titvFPEstV <- function(titvExpected, titvs) {
sapply(titvs, function(x) titvFPEst(titvExpected, x))
}
nTPFP <- function(nVariants, FDR) {
return(list(TP = nVariants * (1 - FDR/100), FP = nVariants * (FDR / 100)))
}
leftShift <- function(x, leftValue = 0) {
r = rep(leftValue, length(x))
for ( i in 1:(length(x)-1) ) {
#print(list(i=i))
r[i] = x[i+1]
}
r
}
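# Worked example (hypothetical numbers): with an expected Ti/Tv of 2.1 and an
# observed novel Ti/Tv of 1.5, titvFPEst(2.1, 1.5) = 1 - (1.5 - 0.5)/(2.1 - 0.5) = 0.375,
# i.e. ~37.5% of those calls are estimated to be false positives (random substitutions
# have a Ti/Tv of ~0.5). nTPFP(1000, 37.5) then splits 1000 variants into 625 TPs and 375 FPs.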
# -----------------------------------------------------------------------------------------------
# Tranches plot
# -----------------------------------------------------------------------------------------------
data2 = read.table(tranchesFile,sep=",",head=T)
data2 = data2[order(data2$novelTiTv, decreasing=F),]
#data2 = data2[order(data2$FDRtranche, decreasing=T),]
cols = c("cornflowerblue", "cornflowerblue", "darkorange", "darkorange")
density=c(20, -1, -1, 20)
outfile = paste(tranchesFile, ".pdf", sep="")
pdf(outfile, height=5, width=8)
par(mar = c(5, 5, 4, 2) + 0.1)
novelTiTv = c(data2$novelTITV,data2$novelTiTv) # pick up the column under either spelling (novelTITV / novelTiTv); c() drops the NULL one
alpha = 1 - titvFPEstV(targetTITV, novelTiTv)
#print(alpha)
numGood = round(alpha * data2$numNovel);
#numGood = round(data2$numNovel * (1-data2$targetTruthSensitivity/100))
numBad = data2$numNovel - numGood;
numPrevGood = leftShift(numGood, 0)
numNewGood = numGood - numPrevGood
numPrevBad = leftShift(numBad, 0)
numNewBad = numBad - numPrevBad
d=matrix(c(numPrevGood,numNewGood, numNewBad, numPrevBad),4,byrow=TRUE)
#print(d)
barplot(d/1000,horiz=TRUE,col=cols,space=0.2,xlab="Number of Novel Variants (1000s)", density=density, cex.axis=1.25, cex.lab=1.25) # , xlim=c(250000,350000))
#abline(v= d[2,dim(d)[2]], lty=2)
#abline(v= d[1,3], lty=2)
if ( ! suppressLegend )
legend(3, length(data2$targetTruthSensitivity)/3 +1, c('Cumulative TPs', 'Tranche-specific TPs', 'Tranche-specific FPs', 'Cumulative FPs'), fill=cols, density=density, bg='white', cex=1.25)
mtext("Ti/Tv",2,line=2.25,at=length(data2$targetTruthSensitivity)*1.2,las=1, cex=1)
mtext("truth",2,line=0,at=length(data2$targetTruthSensitivity)*1.2,las=1, cex=1)
axis(2,line=-1,at=0.7+(0:(length(data2$targetTruthSensitivity)-1))*1.2,tick=FALSE,labels=data2$targetTruthSensitivity, las=1, cex.axis=1.0)
axis(2,line=1,at=0.7+(0:(length(data2$targetTruthSensitivity)-1))*1.2,tick=FALSE,labels=round(novelTiTv,3), las=1, cex.axis=1.0)
# plot sensitivity vs. specificity
sensitivity = data2$truthSensitivity
if ( ! is.null(sensitivity) ) {
#specificity = titvFPEstV(targetTITV, novelTiTv)
specificity = novelTiTv
plot(sensitivity, specificity, type="b", col="cornflowerblue", xlab="Tranche truth sensitivity", ylab="Specificity (Novel Ti/Tv ratio)")
abline(h=targetTITV, lty=2)
abline(v=targetSensitivity, lty=2)
#text(max(sensitivity), targetTITV-0.05, labels="Expected novel Ti/Tv", pos=2)
}
dev.off()


@ -1,108 +0,0 @@
#!/bin/env Rscript
args <- commandArgs(TRUE)
verbose = TRUE
input = args[1]
covariateName = args[2]
outfile = paste(input, ".indelQual_v_", covariateName, ".pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
c <- read.table(input, header=T)
c <- c[sort.list(c[,1]),]
#
# Plot qual as a function of the covariate
#
d.good <- c[c$nBases >= 1000,]
d.1000 <- c[c$nBases < 1000,]
rmseGood = sqrt( sum(as.numeric((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases)) / sum(as.numeric(d.good$nBases)) ) # prevent integer overflow with as.numeric, ugh
rmseAll = sqrt( sum(as.numeric((c$Qempirical-c$Qreported)^2 * c$nBases)) / sum(as.numeric(c$nBases)) )
theTitle = paste("RMSE_good =", round(rmseGood,digits=3), ", RMSE_all =", round(rmseAll,digits=3))
if( length(d.good$nBases) == length(c$nBases) ) {
theTitle = paste("RMSE =", round(rmseAll,digits=3))
}
# Here "residualError" actually holds the raw empirical quality: the subtraction
# of Qreported and the +/-10 clamping are deliberately commented out below.
d.good$residualError = d.good$Qempirical #-d.good$Qreported
#d.good$residualError[which(d.good$residualError > 10)] = 10
#d.good$residualError[which(d.good$residualError < -10)] = -10
d.1000$residualError = d.1000$Qempirical #-d.1000$Qreported
#d.1000$residualError[which(d.1000$residualError > 10)] = 10
#d.1000$residualError[which(d.1000$residualError < -10)] = -10
c$residualError = c$Qempirical
#c$residualError[which(c$residualError > 10)] = 10
#c$residualError[which(c$residualError < -10)] = -10
pointType = "p"
if( length(c$Covariate) <= 20 ) {
pointType = "o"
}
if( is.numeric(c$Covariate) ) {
plot(d.good$Covariate, d.good$residualError, type=pointType, main=theTitle, ylab="Empirical Indel Quality", xlab=covariateName, col="blue", pch=20, ylim=c(0, 50), xlim=c(min(c$Covariate),max(c$Covariate)))
points(d.1000$Covariate, d.1000$residualError, type=pointType, col="cornflowerblue", pch=20)
} else { # Dinuc (and other non-numeric covariates) are handled differently to make their plots look nice
plot(c$Covariate, c$residualError, type="l", main=theTitle, ylab="Empirical Indel Quality", xlab=covariateName, col="blue", ylim=c(0, 50))
points(d.1000$Covariate, d.1000$residualError, type="l", col="cornflowerblue")
}
dev.off()
#
# Plot mean quality versus the covariate
#
outfile = paste(input, ".reported_qual_v_", covariateName, ".pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
pointType = "p"
if( length(c$Covariate) <= 20 ) {
pointType = "o"
}
theTitle = paste("Quality By", covariateName);
if( is.numeric(c$Covariate) ) {
plot(d.good$Covariate, d.good$Qreported, type=pointType, main=theTitle, ylab="Mean Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(0, 40), xlim=c(min(c$Covariate),max(c$Covariate)))
points(d.1000$Covariate, d.1000$Qreported, type=pointType, col="cornflowerblue", pch=20)
} else { # Dinuc (and other non-numeric covariates) are handled differently to make their plots look nice
plot(c$Covariate, c$Qreported, type="l", main=theTitle, ylab="Mean Reported Quality", xlab=covariateName, col="blue", ylim=c(0, 40))
points(d.1000$Covariate, d.1000$Qreported, type="l", col="cornflowerblue")
}
dev.off()
#
# Plot histogram of the covariate
#
e = d.good
f = d.1000
outfile = paste(input, ".", covariateName,"_hist.pdf", sep="")
pdf(outfile, height=7, width=7)
hst=subset(data.frame(e$Covariate, e$nBases), e.nBases != 0)
hst2=subset(data.frame(f$Covariate, f$nBases), f.nBases != 0)
lwdSize=2
if( length(c$Covariate) <= 20 ) {
lwdSize=7
} else if( length(c$Covariate) <= 70 ) {
lwdSize=4
}
if( is.numeric(c$Covariate) ) {
if( length(hst$e.Covariate) == 0 ) {
plot(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=lwdSize, col="cornflowerblue", main=paste(covariateName,"histogram"), ylim=c(0, max(hst2$f.nBases)), xlab=covariateName, ylab="Count",yaxt="n",xlim=c(min(c$Covariate),max(c$Covariate)))
} else {
plot(hst$e.Covariate, hst$e.nBases, type="h", lwd=lwdSize, main=paste(covariateName,"histogram"), xlab=covariateName, ylim=c(0, max(hst$e.nBases)),ylab="Number of Bases",yaxt="n",xlim=c(min(c$Covariate),max(c$Covariate)))
points(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=lwdSize, col="cornflowerblue")
}
axis(2,axTicks(2), format(axTicks(2), scientific=F))
} else { # Dinuc (and other non-numeric covariates) are handled differently to make their plots look nice
hst=subset(data.frame(c$Covariate, c$nBases), c.nBases != 0)
plot(1:length(hst$c.Covariate), hst$c.nBases, type="h", lwd=lwdSize, main=paste(covariateName,"histogram"), ylim=c(0, max(hst$c.nBases)),xlab=covariateName, ylab="Number of Bases",yaxt="n",xaxt="n")
if( length(hst$c.Covariate) > 9 ) {
axis(1, at=seq(1,length(hst$c.Covariate),2), labels = hst$c.Covariate[seq(1,length(hst$c.Covariate),2)])
} else {
axis(1, at=seq(1,length(hst$c.Covariate),1), labels = hst$c.Covariate)
}
axis(2,axTicks(2), format(axTicks(2), scientific=F))
}
dev.off()


@ -1,108 +0,0 @@
#!/bin/env Rscript
args <- commandArgs(TRUE)
verbose = TRUE
input = args[1]
covariateName = args[2]
outfile = paste(input, ".qual_diff_v_", covariateName, ".pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
c <- read.table(input, header=T)
c <- c[sort.list(c[,1]),]
#
# Plot residual error as a function of the covariate
#
d.good <- c[c$nBases >= 1000,]
d.1000 <- c[c$nBases < 1000,]
rmseGood = sqrt( sum(as.numeric((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases)) / sum(as.numeric(d.good$nBases)) ) # prevent integer overflow with as.numeric, ugh
rmseAll = sqrt( sum(as.numeric((c$Qempirical-c$Qreported)^2 * c$nBases)) / sum(as.numeric(c$nBases)) )
theTitle = paste("RMSE_good =", round(rmseGood,digits=3), ", RMSE_all =", round(rmseAll,digits=3))
if( length(d.good$nBases) == length(c$nBases) ) {
theTitle = paste("RMSE =", round(rmseAll,digits=3))
}
# Don't let residual error go off the edge of the plot
d.good$residualError = d.good$Qempirical-d.good$Qreported
d.good$residualError[which(d.good$residualError > 10)] = 10
d.good$residualError[which(d.good$residualError < -10)] = -10
d.1000$residualError = d.1000$Qempirical-d.1000$Qreported
d.1000$residualError[which(d.1000$residualError > 10)] = 10
d.1000$residualError[which(d.1000$residualError < -10)] = -10
c$residualError = c$Qempirical-c$Qreported
c$residualError[which(c$residualError > 10)] = 10
c$residualError[which(c$residualError < -10)] = -10
pointType = "p"
if( length(c$Covariate) <= 20 ) {
pointType = "o"
}
if( is.numeric(c$Covariate) ) {
plot(d.good$Covariate, d.good$residualError, type=pointType, main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(-10, 10), xlim=c(min(c$Covariate),max(c$Covariate)))
points(d.1000$Covariate, d.1000$residualError, type=pointType, col="cornflowerblue", pch=20)
} else { # Dinuc (and other non-numeric covariates) are handled differently to make their plots look nice
plot(c$Covariate, c$residualError, type="l", main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", ylim=c(-10, 10))
points(d.1000$Covariate, d.1000$residualError, type="l", col="cornflowerblue")
}
dev.off()
#
# Plot mean quality versus the covariate
#
outfile = paste(input, ".reported_qual_v_", covariateName, ".pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
pointType = "p"
if( length(c$Covariate) <= 20 ) {
pointType = "o"
}
theTitle = paste("Quality By", covariateName);
if( is.numeric(c$Covariate) ) {
plot(d.good$Covariate, d.good$Qreported, type=pointType, main=theTitle, ylab="Mean Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(0, 40), xlim=c(min(c$Covariate),max(c$Covariate)))
points(d.1000$Covariate, d.1000$Qreported, type=pointType, col="cornflowerblue", pch=20)
} else { # Dinuc (and other non-numeric covariates) are handled differently to make their plots look nice
plot(c$Covariate, c$Qreported, type="l", main=theTitle, ylab="Mean Reported Quality", xlab=covariateName, col="blue", ylim=c(0, 40))
points(d.1000$Covariate, d.1000$Qreported, type="l", col="cornflowerblue")
}
dev.off()
#
# Plot histogram of the covariate
#
e = d.good
f = d.1000
outfile = paste(input, ".", covariateName,"_hist.pdf", sep="")
pdf(outfile, height=7, width=7)
hst=subset(data.frame(e$Covariate, e$nBases), e.nBases != 0)
hst2=subset(data.frame(f$Covariate, f$nBases), f.nBases != 0)
lwdSize=2
if( length(c$Covariate) <= 20 ) {
lwdSize=7
} else if( length(c$Covariate) <= 70 ) {
lwdSize=4
}
if( is.numeric(c$Covariate) ) {
if( length(hst$e.Covariate) == 0 ) {
plot(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=lwdSize, col="cornflowerblue", main=paste(covariateName,"histogram"), ylim=c(0, max(hst2$f.nBases)), xlab=covariateName, ylab="Count",yaxt="n",xlim=c(min(c$Covariate),max(c$Covariate)))
} else {
plot(hst$e.Covariate, hst$e.nBases, type="h", lwd=lwdSize, main=paste(covariateName,"histogram"), xlab=covariateName, ylim=c(0, max(hst$e.nBases)),ylab="Number of Bases",yaxt="n",xlim=c(min(c$Covariate),max(c$Covariate)))
points(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=lwdSize, col="cornflowerblue")
}
axis(2,axTicks(2), format(axTicks(2), scientific=F))
} else { # Dinuc (and other non-numeric covariates) are handled differently to make their plots look nice
hst=subset(data.frame(c$Covariate, c$nBases), c.nBases != 0)
plot(1:length(hst$c.Covariate), hst$c.nBases, type="h", lwd=lwdSize, main=paste(covariateName,"histogram"), ylim=c(0, max(hst$c.nBases)),xlab=covariateName, ylab="Number of Bases",yaxt="n",xaxt="n")
if( length(hst$c.Covariate) > 9 ) {
axis(1, at=seq(1,length(hst$c.Covariate),2), labels = hst$c.Covariate[seq(1,length(hst$c.Covariate),2)])
} else {
axis(1, at=seq(1,length(hst$c.Covariate),1), labels = hst$c.Covariate)
}
axis(2,axTicks(2), format(axTicks(2), scientific=F))
}
dev.off()
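# Example invocation (a sketch; file and covariate names are hypothetical, and
# this script's own file name is not shown here):
#   Rscript thisScript.R recal_data.csv Cycle
# i.e. the recalibration table to read and the covariate column to plot.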


@ -1,70 +0,0 @@
#!/bin/env Rscript
args <- commandArgs(TRUE)
input = args[1]
Qcutoff = as.numeric(args[2])
maxQ = as.numeric(args[3])
maxHist = as.numeric(args[4])
t=read.table(input, header=T)
#
# Plot of reported quality versus empirical quality
#
outfile = paste(input, ".quality_emp_v_stated.pdf", sep="")
pdf(outfile, height=7, width=7)
d.good <- t[t$nBases >= 10000 & t$Qreported >= Qcutoff,]
d.1000 <- t[t$nBases < 1000 & t$Qreported >= Qcutoff,]
d.10000 <- t[t$nBases < 10000 & t$nBases >= 1000 & t$Qreported >= Qcutoff,]
f <- t[t$Qreported < Qcutoff,]
e <- rbind(d.good, d.1000, d.10000)
rmseGood = sqrt( sum(as.numeric((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases)) / sum(as.numeric(d.good$nBases)) ) # prevent integer overflow with as.numeric, ugh
rmseAll = sqrt( sum(as.numeric((e$Qempirical-e$Qreported)^2 * e$nBases)) / sum(as.numeric(e$nBases)) )
theTitle = paste("RMSE_good =", round(rmseGood,digits=3), ", RMSE_all =", round(rmseAll,digits=3))
if( length(t$nBases) - length(f$nBases) == length(d.good$nBases) ) {
theTitle = paste("RMSE =", round(rmseAll,digits=3));
}
plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", main=theTitle, xlim=c(0,maxQ), ylim=c(0,maxQ), pch=16, xlab="Reported quality score", ylab="Empirical quality score")
points(d.1000$Qreported, d.1000$Qempirical, type="p", col="lightblue", pch=16)
points(d.10000$Qreported, d.10000$Qempirical, type="p", col="cornflowerblue", pch=16)
points(f$Qreported, f$Qempirical, type="p", col="maroon1", pch=16)
abline(0,1, lty=2)
dev.off()
#
# Plot Q empirical histogram
#
outfile = paste(input, ".quality_emp_hist.pdf", sep="")
pdf(outfile, height=7, width=7)
hst=subset(data.frame(e$Qempirical, e$nBases), e.nBases != 0)
hst2=subset(data.frame(f$Qempirical, f$nBases), f.nBases != 0)
percentBases=hst$e.nBases / sum(as.numeric(hst$e.nBases))
entropy = -sum(log2(percentBases)*percentBases)
yMax = max(hst$e.nBases)
if(maxHist != 0) {
yMax = maxHist
}
plot(hst$e.Qempirical, hst$e.nBases, type="h", lwd=4, xlim=c(0,maxQ), ylim=c(0,yMax), main=paste("Empirical quality score histogram, entropy = ",round(entropy,digits=3)), xlab="Empirical quality score", ylab="Number of Bases",yaxt="n")
points(hst2$f.Qempirical, hst2$f.nBases, type="h", lwd=4, col="maroon1")
axis(2,axTicks(2), format(axTicks(2), scientific=F))
dev.off()
#
# Plot Q reported histogram
#
outfile = paste(input, ".quality_rep_hist.pdf", sep="")
pdf(outfile, height=7, width=7)
hst=subset(data.frame(e$Qreported, e$nBases), e.nBases != 0)
hst2=subset(data.frame(f$Qreported, f$nBases), f.nBases != 0)
yMax = max(hst$e.nBases)
if(maxHist != 0) {
yMax = maxHist
}
plot(hst$e.Qreported, hst$e.nBases, type="h", lwd=4, xlim=c(0,maxQ), ylim=c(0,yMax), main=paste("Reported quality score histogram, entropy = ",round(entropy,digits=3)), xlab="Reported quality score", ylab="Number of Bases",yaxt="n")
points(hst2$f.Qreported, hst2$f.nBases, type="h", lwd=4, col="maroon1")
axis(2,axTicks(2), format(axTicks(2), scientific=F))
dev.off()
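# Example invocation (a sketch; the file name is hypothetical, and this script's
# own file name is not shown here):
#   Rscript thisScript.R recal_table.csv 5 50 0
# i.e. input table, quality cutoff, maximum quality on the axes, and a fixed
# histogram y-maximum (0 lets the script pick it from the data).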


@ -1,21 +0,0 @@
#!/bin/env Rscript
args <- commandArgs(TRUE)
verbose = TRUE
input = args[1]
data = read.table(input,sep=",",head=T)
numCurves = (length(data) - 1)/3
maxSpec = max(data[,(1:numCurves)*3])
outfile = paste(input, ".variantROCCurve.pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.3)
plot(data$specificity1,data$sensitivity1, type="n", xlim=c(0,maxSpec),ylim=c(0,1),xlab="1 - Specificity",ylab="Sensitivity")
for(iii in 1:numCurves) {
points(data[,iii*3],data[,(iii-1)*3+2],lwd=3,type="l",col=iii)
}
legend("bottomright", names(data)[(0:(numCurves-1))*3+1], col=1:numCurves,lwd=3)
dev.off()


@ -1,97 +0,0 @@
#!/bin/env Rscript
args <- commandArgs(TRUE)
fileToRead <- args[1]
functionToRun <- args[2]
functionSpecificArgs <- args[3]
## load the function to run
if ( functionToRun == "PlotInterleavedRows" ) {
### PLOT INTERLEAVED ROWS FUNCTION ###
# - expects a file of the form
#
# sample_a \t 0.8 \t 0.6 \t 0.5
# sample_a \t 0 \t 1 \t 3
# sample_b \t 0.5 \t 0.3 \t 0.1
# sample_b \t 1 \t 2 \t 4
#
# and an argument string
# x_label;y_label;plot_title;base_name_for_pdf
# - end of info -
### PLOT INTERLEAVED ROWS FUNCTION ###
PlotInterleavedRows <- function(inFile,args) {
arglist = unlist(strsplit(args,";"))
xlabel = arglist[1]
ylabel = arglist[2]
title = arglist[3]
outFileBase = arglist[4]
allPoints <- as.matrix(read.table(inFile))
# set up colors
colors = rainbow(ncol(allPoints)-1,s=0.8,v=0.8,start=0.0,end=0.9) # gamma= argument dropped: it is no longer accepted by rainbow() in current R
styles = c(rep(1,ncol(allPoints)-1))
evalPoints = matrix(nrow=nrow(allPoints)/2,ncol=ncol(allPoints))
funcVal = matrix(nrow=nrow(allPoints)/2,ncol=ncol(allPoints))
# convert to two matrices by de-interleaving and transposing
for ( i in 1:(nrow(allPoints)/2) ) {
evalPoints[i,] <- allPoints[2*i,]
funcVal[i,] <- allPoints[2*i-1,]
}
evalPoints <- t(evalPoints)
funcVal <- t(funcVal)
# plot and put legend on
pdf(paste(outFileBase,"_rplot",".pdf",sep=""))
matplot(evalPoints,funcVal,col=colors,lty=styles,"l",xlab=xlabel,ylab=ylabel)
legend("topright",funcVal[1,],lty=styles,col=colors)
title(main=title,outer=TRUE)
# save
dev.off()
}
PlotInterleavedRows(fileToRead,functionSpecificArgs)
}
if ( functionToRun == "PlotHeatmap" ) {
### PLOT HEATMAP FUNCTION ###
#
# Normally what is meant by "heatmap" is just an image() of the
# matrix; in accordance with that, THIS FUNCTION DOES NOT COMPUTE
# DENDROGRAMS THROUGH HEATMAP(), so rows and columns are not
# re-ordered, and dendrograms are not displayed.
#
# - expects a file of the form
#
# rentry1 \t rentry2 \t rentry3 \t ...
# colentry1 \t 0.7 \t 0.9 \t 0.4 \t ...
# colentry2 \t 0.8 \t 0.7 \t 0.6 \t ...
# ...
# Note that the rows and columns don't line up. R understands this
# and deals with it.
# Also expects an argument string:
# row_label;column_label;plot_title;base_name_for_pdf
# - end of info -
### PLOT HEATMAP FUNCTION ###
PlotHeatmap <- function(inFile,args) {
arglist = unlist(strsplit(args,split=";"))
row_label = arglist[1]
column_label = arglist[2]
data_rescale_factor <- as.numeric(arglist[3])
plot_title = arglist[4]
base_name_for_pdf = arglist[5]
image_matrix <- as.matrix(read.table(inFile))
## change default colors to include "cool" colors for lower end of spectrum
## e.g. red ~ near 1, yellow ~ near .75, green ~ near .5, teal ~ near .25
## blue ~ near 0
colors <- rev(rainbow(32,start=0,end=0.6,s=0.9,v=0.9)) # gamma= argument dropped: it is no longer accepted by rainbow() in current R
pdf(paste(base_name_for_pdf,"_rplot",".pdf",sep=""))
heatmap(image_matrix,Rowv=NA,Colv=NA,ylab=row_label,xlab=column_label,col=colors)
title(main=plot_title,outer=TRUE)
dev.off()
}
PlotHeatmap(fileToRead,functionSpecificArgs)
}


@ -1,61 +0,0 @@
MAX_AC = 10000
normHist <- function(d) { # unused second argument dropped
x = hist(d$true.ac, breaks=1:20000, plot=F)$counts[1:MAX_AC]
x / sum(x)
}
f <- function(d, acs) {
cols = rainbow(length(acs), alpha=0.75)
y = normHist(subset(d, small.ac == acs[1])) # use the argument d, not the global afs
x = 1:length(y) / max(d$true.an)
plot(x, y, type="l", col=cols[1], xlab="True MAF in full population", ylab="Frequency", lwd=3, log="x")
for (i in 2:length(acs)) {
points(x, normHist(subset(d, small.ac == acs[i])), type="l", col=cols[i], lwd=3)
}
legend("topright", legend=lapply(acs, function(x) paste("AC =", x)), fill=cols, title="Sub-population")
}
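# NOTE (assumption): `afs` is expected to exist before the call further below -- a
# data frame with columns true.ac, small.ac and true.an; it is not defined anywhere
# in this file.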
expected <- function(maxAN, N, eps, ac1scale = F) {
scale = 10
f <- function(ps, N) {
co = 2 * N / ( 1 - eps )
co * ((1 - ps)/(1-eps))^(2 * N - 1)
}
# these are the points that we'll actually show, but we need to do the calculation
# special for the AC = 1 given the equation actually fits an infinite population
# not a discrete population with max chromosomes
ps = 1:maxAN / maxAN
v = f(ps, N)
v = v / sum(v)
if ( ac1scale ) {
subps = seq(1, maxAN*scale) / (maxAN * scale)
#print(subps)
subv = f(subps, N)
#print(subv)
#print(v[1:10])
pBelowAC1 = sum(subv[1:scale] / sum(subv))
#print(list(pBelowAC1=pBelowAC1, v1=v[1]))
v[1] = v[1] + pBelowAC1
}
list(ps = ps, pr = v)
}
f(afs, c(1,2,3,5,10,50))
if ( F ) {
scale = 100
ex1 = expected(200000, 1000, 1e-8)
ex2 = expected(200000*scale, 1000, 1e-8)
i = 1:(200000*scale) %% scale == 1
plot(ex2$ps[i], cumsum(ex1$pr), type="l",lty=3,lwd=3, log="x", col="red")
points(ex2$ps[i], cumsum(ex2$pr)[i], type="l",lty=3,lwd=3, log="x")
}
ex = expected(200000, 1000, 1e-8, T)
points(ex$ps, ex$pr, type="l",lty=3,lwd=3)


@ -1,10 +0,0 @@
Package: gsalib
Type: Package
Title: Utility functions
Version: 1.0
Date: 2010-10-02
Author: Kiran Garimella
Maintainer: Kiran Garimella <kiran@broadinstitute.org>
Description: Utility functions for GATK NGS analyses
License: BSD
LazyLoad: yes


@ -1,12 +0,0 @@
gsa.error <- function(message) {
message("");
gsa.message("Error: **********");
gsa.message(sprintf("Error: %s", message));
gsa.message("Error: **********");
message("");
traceback();
message("");
stop(message, call. = FALSE);
}


@ -1,116 +0,0 @@
.gsa.getargs.usage <- function(argspec, doc) {
cargs = commandArgs();
usage = "Usage:";
fileIndex = grep("--file=", cargs);
if (length(fileIndex) > 0) {
progname = gsub("--file=", "", cargs[fileIndex[1]]);
usage = sprintf("Usage: Rscript %s [arguments]", progname);
if (!is.na(doc)) {
message(sprintf("%s: %s\n", progname, doc));
}
}
message(usage);
for (argname in names(argspec)) {
key = argname;
defaultValue = 0;
doc = "";
if (is.list(argspec[[argname]])) {
defaultValue = argspec[[argname]]$value;
doc = argspec[[argname]]$doc;
}
message(sprintf(" -%-10s\t[default: %s]\t%s", key, defaultValue, doc));
}
message("");
stop(call. = FALSE);
}
gsa.getargs <- function(argspec, doc = NA) {
argsenv = new.env();
for (argname in names(argspec)) {
value = 0;
if (is.list(argspec[[argname]])) {
value = argspec[[argname]]$value;
} else {
value = argspec[[argname]];
}
assign(argname, value, envir=argsenv);
}
if (interactive()) {
for (argname in names(argspec)) {
value = get(argname, envir=argsenv);
if (is.na(value) | is.null(value)) {
if (exists("cmdargs")) {
assign(argname, cmdargs[[argname]], envir=argsenv);
} else {
assign(argname, readline(sprintf("Please enter a value for '%s': ", argname)), envir=argsenv);
}
} else {
assign(argname, value, envir=argsenv);
}
}
} else {
cargs = commandArgs(TRUE);
if (length(cargs) == 0) {
.gsa.getargs.usage(argspec, doc);
}
for (i in 1:length(cargs)) {
if (length(grep("^-", cargs[i], ignore.case=TRUE)) > 0) {
key = gsub("-", "", cargs[i]);
value = cargs[i+1];
if (key == "h" | key == "help") {
.gsa.getargs.usage(argspec, doc);
}
if (length(grep("^[\\d\\.e\\+\\-]+$", value, perl=TRUE, ignore.case=TRUE)) > 0) {
value = as.numeric(value);
}
assign(key, value, envir=argsenv);
}
}
}
args = as.list(argsenv);
isMissingArgs = 0;
missingArgs = c();
for (arg in names(argspec)) {
if (is.na(args[[arg]]) | is.null(args[[arg]])) {
gsa.warn(sprintf("Value for required argument '-%s' was not specified", arg));
isMissingArgs = 1;
missingArgs = c(missingArgs, arg);
}
}
if (isMissingArgs) {
gsa.error(
paste(
"Missing required arguments: -",
paste(missingArgs, collapse=" -"),
". Specify -h or -help to this script for a list of available arguments.",
sep=""
)
);
}
args;
}
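# Example usage (a sketch with hypothetical argument and file names):
#   args = gsa.getargs(list(
#     file = list(value = NA, doc = "Input table"),
#     n = list(value = 100, doc = "Number of rows to keep")
#   ), doc = "Demo script");
#   print(args$file);
# invoked as: Rscript demo.R -file my.table -n 100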


@ -1,3 +0,0 @@
gsa.message <- function(message) {
message(sprintf("[gsalib] %s", message));
}


@ -1,50 +0,0 @@
gsa.plot.venn <-
function(a, b, c=0, a_and_b, a_and_c=0, b_and_c=0,
col=c("#FF6342", "#63C6DE", "#ADDE63"),
pos=c(0.20, 0.20, 0.80, 0.82),
debug=0
) {
library(png);
library(graphics);
# Set up properties
for (i in 1:length(col)) {
rgbcol = col2rgb(col[i]);
col[i] = sprintf("%02X%02X%02X", rgbcol[1], rgbcol[2], rgbcol[3]);
}
chco = paste(col[1], col[2], col[3], sep=",");
chd = paste(a, b, c, a_and_b, a_and_c, b_and_c, sep=",");
props = c(
'cht=v',
'chs=525x525',
'chds=0,10000000000',
paste('chco=', chco, sep=""),
paste('chd=t:', chd, sep="")
);
proplist = paste(props[1], props[2], props[3], props[4], props[5], sep='&');
# Get the venn diagram (as a temporary file)
filename = tempfile("venn");
cmd = paste("wget -O ", filename, " 'http://chart.apis.google.com/chart?", proplist, "' > /dev/null 2>&1", sep="");
if (debug == 1) {
print(cmd);
}
system(cmd);
# Render the temp png file into a plotting frame (read into 'img' so that the
# count 'a' used in the comparison below is not clobbered)
img = readPNG(filename);
plot(0, 0, type="n", xaxt="n", yaxt="n", bty="n", xlim=c(0, 1), ylim=c(0, 1), xlab="", ylab="");
if (c == 0 || a >= b) {
rasterImage(img, pos[1], pos[2], pos[3], pos[4]);
} else {
rasterImage(img, 0.37+pos[1], 0.37+pos[2], 0.37+pos[3], 0.37+pos[4], angle=180);
}
# Clean up!
unlink(filename);
}
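# Example usage (hypothetical counts): a two-set venn of 1000 and 800 calls
# that share 400:
#   gsa.plot.venn(1000, 800, a_and_b = 400);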


@ -1,83 +0,0 @@
.gsa.attemptToLoadFile <- function(filename) {
file = NA;
if (file.exists(filename) & file.info(filename)$size > 500) {
file = read.csv(filename, header=TRUE, comment.char="#");
}
file;
}
gsa.read.eval <-
function(evalRoot) {
fileAlleleCountStats = paste(evalRoot, ".AlleleCountStats.csv", sep="");
fileCompOverlap = paste(evalRoot, ".Comp_Overlap.csv", sep="");
fileCountVariants = paste(evalRoot, ".Count_Variants.csv", sep="");
fileGenotypeConcordance = paste(evalRoot, ".Genotype_Concordance.csv", sep="");
fileMetricsByAc = paste(evalRoot, ".MetricsByAc.csv", sep="");
fileMetricsBySample = paste(evalRoot, ".MetricsBySample.csv", sep="");
fileQuality_Metrics_by_allele_count = paste(evalRoot, ".Quality_Metrics_by_allele_count.csv", sep="");
fileQualityScoreHistogram = paste(evalRoot, ".QualityScoreHistogram.csv", sep="");
fileSampleStatistics = paste(evalRoot, ".Sample_Statistics.csv", sep="");
fileSampleSummaryStatistics = paste(evalRoot, ".Sample_Summary_Statistics.csv", sep="");
fileSimpleMetricsBySample = paste(evalRoot, ".SimpleMetricsBySample.csv", sep="");
fileTi_slash_Tv_Variant_Evaluator = paste(evalRoot, ".Ti_slash_Tv_Variant_Evaluator.csv", sep="");
fileTiTvStats = paste(evalRoot, ".TiTvStats.csv", sep="");
fileVariant_Quality_Score = paste(evalRoot, ".Variant_Quality_Score.csv", sep="");
eval = list(
AlleleCountStats = NA,
CompOverlap = NA,
CountVariants = NA,
GenotypeConcordance = NA,
MetricsByAc = NA,
MetricsBySample = NA,
Quality_Metrics_by_allele_count = NA,
QualityScoreHistogram = NA,
SampleStatistics = NA,
SampleSummaryStatistics = NA,
SimpleMetricsBySample = NA,
TiTv = NA,
TiTvStats = NA,
Variant_Quality_Score = NA,
CallsetNames = c(),
CallsetOnlyNames = c(),
CallsetFilteredNames = c()
);
eval$AlleleCountStats = .gsa.attemptToLoadFile(fileAlleleCountStats);
eval$CompOverlap = .gsa.attemptToLoadFile(fileCompOverlap);
eval$CountVariants = .gsa.attemptToLoadFile(fileCountVariants);
eval$GenotypeConcordance = .gsa.attemptToLoadFile(fileGenotypeConcordance);
eval$MetricsByAc = .gsa.attemptToLoadFile(fileMetricsByAc);
eval$MetricsBySample = .gsa.attemptToLoadFile(fileMetricsBySample);
eval$Quality_Metrics_by_allele_count = .gsa.attemptToLoadFile(fileQuality_Metrics_by_allele_count);
eval$QualityScoreHistogram = .gsa.attemptToLoadFile(fileQualityScoreHistogram);
eval$SampleStatistics = .gsa.attemptToLoadFile(fileSampleStatistics);
eval$SampleSummaryStatistics = .gsa.attemptToLoadFile(fileSampleSummaryStatistics);
eval$SimpleMetricsBySample = .gsa.attemptToLoadFile(fileSimpleMetricsBySample);
eval$TiTv = .gsa.attemptToLoadFile(fileTi_slash_Tv_Variant_Evaluator);
eval$TiTvStats = .gsa.attemptToLoadFile(fileTiTvStats);
eval$Variant_Quality_Score = .gsa.attemptToLoadFile(fileVariant_Quality_Score);
uniqueJexlExpressions = unique(eval$TiTv$jexl_expression);
eval$CallsetOnlyNames = as.vector(uniqueJexlExpressions[grep("FilteredIn|Intersection|none", uniqueJexlExpressions, invert=TRUE, ignore.case=TRUE)]);
eval$CallsetNames = as.vector(gsub("-only", "", eval$CallsetOnlyNames));
eval$CallsetFilteredNames = as.vector(c(
paste(gsub("^(\\w)", "In\\U\\1", eval$CallsetNames[1], perl=TRUE), "-Filtered", gsub("^(\\w)", "In\\U\\1", eval$CallsetNames[2], perl=TRUE), sep=""),
paste(gsub("^(\\w)", "In\\U\\1", eval$CallsetNames[2], perl=TRUE), "-Filtered", gsub("^(\\w)", "In\\U\\1", eval$CallsetNames[1], perl=TRUE), sep=""))
);
if (!(eval$CallsetFilteredNames[1] %in% unique(eval$TiTv$jexl_expression))) {
eval$CallsetFilteredNames[1] = paste("In", eval$CallsetNames[1], "-FilteredIn", eval$CallsetNames[2], sep="");
}
if (!(eval$CallsetFilteredNames[2] %in% unique(eval$TiTv$jexl_expression))) {
eval$CallsetFilteredNames[2] = paste("In", eval$CallsetNames[2], "-FilteredIn", eval$CallsetNames[1], sep="");
#eval$CallsetFilteredNames[2] = paste(gsub("^(\\w)", "In", eval$CallsetNames[2], perl=TRUE), "-Filtered", gsub("^(\\w)", "In", eval$CallsetNames[1], perl=TRUE), sep="");
}
eval;
}


@ -1,64 +0,0 @@
# Load a table into the specified environment, making sure that each new table gets a unique name (this allows one to cat several tables with the same name together and load them into R without each table overwriting the last).
.gsa.assignGATKTableToEnvironment <- function(tableName, tableHeader, tableRows, tableEnv) {
d = data.frame(tableRows, row.names=NULL, stringsAsFactors=FALSE);
colnames(d) = tableHeader;
for (i in 1:ncol(d)) {
v = suppressWarnings(as.numeric(d[,i]));
if (length(na.omit(as.numeric(v))) == length(d[,i])) {
d[,i] = v;
}
}
usedNames = ls(envir=tableEnv, pattern=tableName);
if (length(usedNames) > 0) {
tableName = paste(tableName, ".", length(usedNames), sep="");
}
assign(tableName, d, envir=tableEnv);
}
# Load all GATKReport tables from a file
gsa.read.gatkreport <- function(filename) {
con = file(filename, "r", blocking = TRUE);
lines = readLines(con);
close(con);
tableEnv = new.env();
tableName = NA;
tableHeader = c();
tableRows = c();
for (line in lines) {
if (length(grep("^##:GATKReport.v0.1[[:space:]]+", line, ignore.case=TRUE)) > 0) {
headerFields = unlist(strsplit(line, "[[:space:]]+"));
if (!is.na(tableName)) {
.gsa.assignGATKTableToEnvironment(tableName, tableHeader, tableRows, tableEnv);
}
tableName = headerFields[2];
tableHeader = c();
tableRows = c();
} else if (length(grep("^[[:space:]]*$", line)) > 0 | length(grep("^[[:space:]]*#", line)) > 0) {
# do nothing
} else if (!is.na(tableName)) {
row = unlist(strsplit(line, "[[:space:]]+"));
if (length(tableHeader) == 0) {
tableHeader = row;
} else {
tableRows = rbind(tableRows, row);
}
}
}
if (!is.na(tableName)) {
.gsa.assignGATKTableToEnvironment(tableName, tableHeader, tableRows, tableEnv);
}
gatkreport = as.list(tableEnv);
}
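# A minimal usage sketch (hypothetical file contents): write a one-table
# GATKReport to a temp file and load it back as a named data.frame.
# tmp = tempfile();
# writeLines(c("##:GATKReport.v0.1 MyTable : An example table",
#              "col1 col2",
#              "1 0.5",
#              "2 0.25"), tmp);
# report = gsa.read.gatkreport(tmp);
# report$MyTable$col2;  # c(0.5, 0.25)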

View File

@ -1,28 +0,0 @@
gsa.read.squidmetrics = function(project, bylane = FALSE) {
suppressMessages(library(ROracle));
drv = dbDriver("Oracle");
con = dbConnect(drv, "REPORTING/REPORTING@ora01:1521/SEQPROD");
if (bylane) {
statement = paste("SELECT * FROM ILLUMINA_PICARD_METRICS WHERE \"Project\" = '", project, "'", sep="");
print(statement);
rs = dbSendQuery(con, statement = statement);
d = fetch(rs, n=-1);
dbHasCompleted(rs);
dbClearResult(rs);
} else {
statement = paste("SELECT * FROM ILLUMINA_SAMPLE_STATUS_AGG WHERE \"Project\" = '", project, "'", sep="");
print(statement);
rs = dbSendQuery(con, statement = statement);
d = fetch(rs, n=-1);
dbHasCompleted(rs);
dbClearResult(rs);
}
dbDisconnect(con);
oraCloseDriver(drv);
subset(d, Project == project);
}
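# Usage sketch (Broad-internal network only; the project code is illustrative):
# bySample = gsa.read.squidmetrics("C315");
# byLane   = gsa.read.squidmetrics("C315", bylane = TRUE);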

View File

@ -1,3 +0,0 @@
gsa.warn <- function(message) {
gsa.message(sprintf("Warning: %s", message));
}

View File

@ -1,9 +0,0 @@
* Edit the help file skeletons in 'man', possibly combining help files
for multiple functions.
* Put any C/C++/Fortran code in 'src'.
* If you have compiled code, add a .First.lib() function in 'R' to load
the shared library.
* Run R CMD build to build the package tarball.
* Run R CMD check to check the package tarball.
Read "Writing R Extensions" for more information.

Binary file not shown.


View File

@ -1,49 +0,0 @@
\name{gsa.error}
\alias{gsa.error}
\title{
GSA error
}
\description{
Write an error message to standard out with the prefix '[gsalib] Error:', print a traceback, and exit.
}
\usage{
gsa.error(message)
}
%- maybe also 'usage' for other objects documented here.
\arguments{
\item{message}{
The error message to write.
}
}
\details{
%% ~~ If necessary, more details than the description above ~~
}
\value{
%% ~Describe the value returned
%% If it is a LIST, use
%% \item{comp1 }{Description of 'comp1'}
%% \item{comp2 }{Description of 'comp2'}
%% ...
}
\references{
%% ~put references to the literature/web site here ~
}
\author{
Kiran Garimella
}
\note{
%% ~~further notes~~
}
%% ~Make other sections like Warning with \section{Warning }{....} ~
\seealso{
%% ~~objects to See Also as \code{\link{help}}, ~~~
}
\examples{
gsa.error("This is a message");
}
% Add one or more standard keywords, see file 'KEYWORDS' in the
% R documentation directory.
\keyword{ ~kwd1 }
\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line

View File

@ -1,57 +0,0 @@
\name{gsa.getargs}
\alias{gsa.getargs}
\title{
Get script arguments
}
\description{
Get script arguments given a list object specifying arguments and documentation. Can be used in command-line or interactive mode. This is helpful when developing scripts in interactive mode that will eventually become command-line programs. If no arguments are specified or help is requested in command-line mode, the script will print out a usage statement with available arguments and exit.
}
\usage{
gsa.getargs(argspec, doc = NA)
}
\arguments{
\item{argspec}{
A list object. Each key is an argument name. The value is another list object with a 'value' and 'doc' keys. For example:
\preformatted{argspec = list(
arg1 = list(value=10, doc="Info for optional arg1"),
arg2 = list(value=NA, doc="Info for required arg2")
);
}
If the value provided is NA, the argument is considered required and must be specified when the script is invoked. For command-line mode, this means the argument must be specified on the command-line. In interactive mode, there are two ways of specifying these arguments. First, if a properly formatted list argument called 'cmdargs' is present in the current environment (i.e. the object returned by gsa.getargs() from a previous invocation), the value is taken from this object. Otherwise, the argument is prompted for.
}
\item{doc}{
An optional string succinctly documenting the purpose of the script.
}
}
\details{
Interactive scripts typically make use of hardcoded filepaths and parameter settings. This makes testing easy, but generalization to non-interactive mode more difficult. This utility provides a mechanism for writing scripts that work properly in both interactive and command-line modes.
To use this method, specify a list with key-value pairs representing the arguments as specified above. In command-line mode, if no arguments are specified or the user specifies '-h' or '-help' anywhere on the command string, a help message indicating available arguments, their default values, and some documentation about the argument are provided.
}
\value{
Returns a list with keys matching the argspec and values representing the specified arguments.
\item{arg1 }{Value for argument 1}
\item{arg2 }{Value for argument 2}
...etc.
}
\references{
%% ~put references to the literature/web site here ~
}
\author{
Kiran Garimella
}
\examples{
argspec = list(
file = list(value="/my/test.vcf", doc="VCF file"),
verbose = list(value=0, doc="If 1, set verbose mode"),
test2 = list(value=2.3e9, doc="Another argument that does stuff")
);
cmdargs = gsa.getargs(argspec, doc="My test program");
print(cmdargs$file); # will print '[1] "/my/test.vcf"'
}
\keyword{ ~kwd1 }

View File

@ -1,44 +0,0 @@
\name{gsa.message}
\alias{gsa.message}
\title{
GSA message
}
\description{
Write a message to standard out with the prefix '[gsalib]'.
}
\usage{
gsa.message(message)
}
\arguments{
\item{message}{
The message to write.
}
}
\details{
%% ~~ If necessary, more details than the description above ~~
}
\value{
%% ~Describe the value returned
%% If it is a LIST, use
%% \item{comp1 }{Description of 'comp1'}
%% \item{comp2 }{Description of 'comp2'}
%% ...
}
\references{
%% ~put references to the literature/web site here ~
}
\author{
Kiran Garimella
}
\note{
%% ~~further notes~~
}
\seealso{
%% ~~objects to See Also as \code{\link{help}}, ~~~
}
\examples{
## Write message to stdout
gsa.message("This is a message");
}
\keyword{ ~kwd1 }

View File

@ -1,75 +0,0 @@
\name{gsa.plot.venn}
\alias{gsa.plot.venn}
\title{
Plot a proportional venn diagram
}
\description{
Plot a proportional venn diagram (two or three-way venns allowed)
}
\usage{
gsa.plot.venn(a, b, c = 0, a_and_b, a_and_c = 0, b_and_c = 0, col = c("#FF6342", "#63C6DE", "#ADDE63"), pos = c(0.2, 0.2, 0.8, 0.82), debug = 0)
}
\arguments{
\item{a}{
size of 'a' circle
}
\item{b}{
size of 'b' circle
}
\item{c}{
size of 'c' circle
}
\item{a_and_b}{
size of a and b overlap
}
\item{a_and_c}{
size of a and c overlap
}
\item{b_and_c}{
size of b and c overlap
}
\item{col}{
vector of colors for each venn piece
}
\item{pos}{
vector of positional elements
}
\item{debug}{
if 1, set debug mode and print useful information
}
}
\details{
Plots a two-way or three-way proportional Venn diagram. Internally, this method uses the Google Chart API to generate the diagram, then renders it into the plot window where it can be annotated in interesting ways.
}
\value{
%% ~Describe the value returned
%% If it is a LIST, use
%% \item{comp1 }{Description of 'comp1'}
%% \item{comp2 }{Description of 'comp2'}
%% ...
}
\references{
}
\author{
Kiran Garimella
}
\note{
%% ~~further notes~~
}
%% ~Make other sections like Warning with \section{Warning }{....} ~
\seealso{
%% ~~objects to See Also as \code{\link{help}}, ~~~
}
\examples{
## Plot a two-way Venn diagram
gsa.plot.venn(1000, 750, 0, 400);
## Plot a three-way Venn diagram
gsa.plot.venn(1000, 750, 900, 400, 650, 500);
}
% Add one or more standard keywords, see file 'KEYWORDS' in the
% R documentation directory.
\keyword{ ~kwd1 }
\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line

View File

@ -1,111 +0,0 @@
\name{gsa.read.eval}
\alias{gsa.read.eval}
\title{
Read a VariantEval file
}
\description{
Read a VariantEval file that's output in R format.
}
\usage{
gsa.read.eval(evalRoot)
}
%- maybe also 'usage' for other objects documented here.
\arguments{
\item{evalRoot}{
The root path of the VariantEval output files; for example, for files named 'myeval.Count_Variants.csv' the root is 'myeval'.
}
}
\details{
%% ~~ If necessary, more details than the description above ~~
}
\value{
%% ~Describe the value returned
%% If it is a LIST, use
%% \item{comp1 }{Description of 'comp1'}
%% \item{comp2 }{Description of 'comp2'}
%% ...
}
\references{
%% ~put references to the literature/web site here ~
}
\author{
%% ~~who you are~~
}
\note{
%% ~~further notes~~
}
%% ~Make other sections like Warning with \section{Warning }{....} ~
\seealso{
%% ~~objects to See Also as \code{\link{help}}, ~~~
}
\examples{
##---- Should be DIRECTLY executable !! ----
##-- ==> Define data, use random,
##-- or do help(data=index) for the standard data sets.
## The function is currently defined as
function(evalRoot) {
fileAlleleCountStats = paste(evalRoot, ".AlleleCountStats.csv", sep="");
fileCompOverlap = paste(evalRoot, ".Comp_Overlap.csv", sep="");
fileCountVariants = paste(evalRoot, ".Count_Variants.csv", sep="");
fileGenotypeConcordance = paste(evalRoot, ".Genotype_Concordance.csv", sep="");
fileMetricsByAc = paste(evalRoot, ".MetricsByAc.csv", sep="");
fileMetricsBySample = paste(evalRoot, ".MetricsBySample.csv", sep="");
fileQuality_Metrics_by_allele_count = paste(evalRoot, ".Quality_Metrics_by_allele_count.csv", sep="");
fileQualityScoreHistogram = paste(evalRoot, ".QualityScoreHistogram.csv", sep="");
fileSampleStatistics = paste(evalRoot, ".Sample_Statistics.csv", sep="");
fileSampleSummaryStatistics = paste(evalRoot, ".Sample_Summary_Statistics.csv", sep="");
fileSimpleMetricsBySample = paste(evalRoot, ".SimpleMetricsBySample.csv", sep="");
fileTi_slash_Tv_Variant_Evaluator = paste(evalRoot, ".Ti_slash_Tv_Variant_Evaluator.csv", sep="");
fileTiTvStats = paste(evalRoot, ".TiTvStats.csv", sep="");
fileVariant_Quality_Score = paste(evalRoot, ".Variant_Quality_Score.csv", sep="");
eval = list(
AlleleCountStats = NA,
CompOverlap = NA,
CountVariants = NA,
GenotypeConcordance = NA,
MetricsByAc = NA,
MetricsBySample = NA,
Quality_Metrics_by_allele_count = NA,
QualityScoreHistogram = NA,
SampleStatistics = NA,
SampleSummaryStatistics = NA,
SimpleMetricsBySample = NA,
TiTv = NA,
TiTvStats = NA,
Variant_Quality_Score = NA,
CallsetNames = c(),
CallsetOnlyNames = c(),
CallsetFilteredNames = c()
);
eval$AlleleCountStats = .attemptToLoadFile(fileAlleleCountStats);
eval$CompOverlap = .attemptToLoadFile(fileCompOverlap);
eval$CountVariants = .attemptToLoadFile(fileCountVariants);
eval$GenotypeConcordance = .attemptToLoadFile(fileGenotypeConcordance);
eval$MetricsByAc = .attemptToLoadFile(fileMetricsByAc);
eval$MetricsBySample = .attemptToLoadFile(fileMetricsBySample);
eval$Quality_Metrics_by_allele_count = .attemptToLoadFile(fileQuality_Metrics_by_allele_count);
eval$QualityScoreHistogram = .attemptToLoadFile(fileQualityScoreHistogram);
eval$SampleStatistics = .attemptToLoadFile(fileSampleStatistics);
eval$SampleSummaryStatistics = .attemptToLoadFile(fileSampleSummaryStatistics);
eval$SimpleMetricsBySample = .attemptToLoadFile(fileSimpleMetricsBySample);
eval$TiTv = .attemptToLoadFile(fileTi_slash_Tv_Variant_Evaluator);
eval$TiTvStats = .attemptToLoadFile(fileTiTvStats);
eval$Variant_Quality_Score = .attemptToLoadFile(fileVariant_Quality_Score);
uniqueJexlExpressions = unique(eval$TiTv$jexl_expression);
eval$CallsetOnlyNames = as.vector(uniqueJexlExpressions[grep("FilteredIn|Intersection|none", uniqueJexlExpressions, invert=TRUE, ignore.case=TRUE)]);
eval$CallsetNames = as.vector(gsub("-only", "", eval$CallsetOnlyNames));
eval$CallsetFilteredNames = as.vector(c());
eval;
}
}
% Add one or more standard keywords, see file 'KEYWORDS' in the
% R documentation directory.
\keyword{ ~kwd1 }
\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line

View File

@ -1,55 +0,0 @@
\name{gsa.read.gatkreport}
\alias{gsa.read.gatkreport}
\title{
gsa.read.gatkreport
}
\description{
Reads a GATKReport file - a multi-table document - and loads each table as a separate data.frame object in a list.
}
\usage{
gsa.read.gatkreport(filename)
}
\arguments{
\item{filename}{
The path to the GATKReport file.
}
}
\details{
The GATKReport format replaces the multi-file output format used by many GATK tools and provides a single, consolidated file format. This format accommodates multiple tables and is still R-loadable - through this function.
The file format looks like this:
\preformatted{##:GATKReport.v0.1 TableName : The description of the table
col1 col2 col3
0 0.007451835696110506 25.474613284804366
1 0.002362777171937477 29.844949954504095
2 9.087604507451836E-4 32.87590975254731
3 5.452562704471102E-4 34.498999090081895
4 9.087604507451836E-4 35.14831665150137
}
}
\value{
Returns a list object, where each key is the TableName and the value is the data.frame object with the contents of the table. If multiple tables with the same name exist, each one after the first is given the name "TableName.1", "TableName.2", ..., "TableName.N", matching the unique-naming scheme used by the loader.
%% ~Describe the value returned
%% If it is a LIST, use
%% \item{comp1 }{Description of 'comp1'}
%% \item{comp2 }{Description of 'comp2'}
%% ...
}
\references{
%% ~put references to the literature/web site here ~
}
\author{
Kiran Garimella
}
\note{
%% ~~further notes~~
}
\seealso{
%% ~~objects to See Also as \code{\link{help}}, ~~~
}
\examples{
report = gsa.read.gatkreport("/path/to/my/output.gatkreport");
}
\keyword{ ~kwd1 }

View File

@ -1,48 +0,0 @@
\name{gsa.read.squidmetrics}
\alias{gsa.read.squidmetrics}
\title{
gsa.read.squidmetrics
}
\description{
Reads metrics for a specified SQUID project into a dataframe.
}
\usage{
gsa.read.squidmetrics("C315")
}
\arguments{
\item{project}{
The project for which metrics should be obtained.
}
\item{bylane}{
If TRUE, obtains per-lane metrics rather than the default per-sample metrics.
}
}
\details{
%% ~~ If necessary, more details than the description above ~~
}
\value{
%% ~Describe the value returned
%% If it is a LIST, use
%% \item{comp1 }{Description of 'comp1'}
%% \item{comp2 }{Description of 'comp2'}
%% ...
Returns a data frame with samples (or lanes) as the rows and the metrics as the columns.
}
\references{
%% ~put references to the literature/web site here ~
}
\author{
Kiran Garimella
}
\note{
This method will only work within the Broad Institute internal network.
}
\seealso{
%% ~~objects to See Also as \code{\link{help}}, ~~~
}
\examples{
## Obtain metrics for project C315.
d = gsa.read.squidmetrics("C315");
}
\keyword{ ~kwd1 }

View File

@ -1,46 +0,0 @@
\name{gsa.warn}
\alias{gsa.warn}
\title{
GSA warn
}
\description{
Write a warning message to standard out with the prefix '[gsalib] Warning:'.
}
\usage{
gsa.warn(message)
}
%- maybe also 'usage' for other objects documented here.
\arguments{
\item{message}{
The warning message to write.
}
}
\details{
%% ~~ If necessary, more details than the description above ~~
}
\value{
%% ~Describe the value returned
%% If it is a LIST, use
%% \item{comp1 }{Description of 'comp1'}
%% \item{comp2 }{Description of 'comp2'}
%% ...
}
\references{
%% ~put references to the literature/web site here ~
}
\author{
Kiran Garimella
}
\note{
%% ~~further notes~~
}
\seealso{
%% ~~objects to See Also as \code{\link{help}}, ~~~
}
\examples{
## Write message to stdout
gsa.warn("This is a warning message");
}
\keyword{ ~kwd1 }
\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line

View File

@ -1,68 +0,0 @@
\name{gsalib-package}
\alias{gsalib-package}
\alias{gsalib}
\docType{package}
\title{
GATK utility analysis functions
}
\description{
Utility functions for analyzing GATK-processed NGS data
}
\details{
This package contains functions for working with GATK-processed NGS data. These functions include a command-line parser that also allows a script to be used in interactive mode (good for developing scripts that will eventually be automated), a proportional Venn diagram generator, convenience methods for parsing VariantEval output, and more.
}
\author{
Genome Sequencing and Analysis Group
Medical and Population Genetics Program
Maintainer: Kiran Garimella
}
\references{
GSA wiki page: http://www.broadinstitute.org/gsa/wiki
GATK help forum: http://www.getsatisfaction.com/gsa
}
\examples{
## get script arguments in interactive and non-interactive mode
cmdargs = gsa.getargs( list(
requiredArg1 = list(
value = NA,
doc = "Documentation for requiredArg1"
),
optionalArg1 = list(
value = 3e9,
doc = "Documentation for optionalArg1"
)
) );
## plot a proportional Venn diagram
gsa.plot.venn(500, 250, 0, 100);
## read a GATKReport file
report = gsa.read.gatkreport("/path/to/my/output.gatkreport");
## emit a message
gsa.message("This is a message");
## emit a warning message
gsa.warn("This is a warning message");
## emit an error message
gsa.error("This is an error message");
## read the SQUID metrics for a given sequencing project (internal to the Broad only)
s = gsa.read.squidmetrics("C427");
## read command-line arguments
cmdargs = gsa.getargs(
list(
file = list(value="/my/test.vcf", doc="VCF file"),
verbose = list(value=0, doc="If 1, set verbose mode"),
test2 = list(value=2.3e9, doc="Another argument that does stuff")
),
doc="My test program"
);
}
\keyword{ package }

View File

@ -1,245 +0,0 @@
#Before executing this file, save the SQUID files as csv, then as tab-delimited files with only the column values as the header, and change the format of all cells to numbers. Assign the paths to these files to "samples" and "lanes" respectively.
args<-commandArgs(TRUE)
lanes<-args[1]
samples<-args[2]
sample_sets<-args[3]
eval<-args[4]
noveltitv<-args[5]
knowntitv<-args[6]
DOC<-args[7]
if(is.na(sample_sets)){
print("Please specify sample set for file naming and press enter.")
scan("stdin", what="character",n=1)->sample_sets
print("Thanks!")
}
if(is.na(lanes) == FALSE && is.na(samples)==FALSE){
#this makes a table & graphs using Picard data
read.delim(file=lanes, header= TRUE)->bylane;
read.delim(file=samples, header= TRUE)->bysample;
#Calc by lane metrics
attach(bylane);
callable.target<-HS_TARGET_TERRITORY[1];
singlelanes<-length(which(Lane.Type=="Single"));
pairedlanes<-length(which(Lane.Type=="Paired"));
mean.read.lane<-signif(mean(AL_TOTAL_READS, na.rm=TRUE));
sd.read.lane<-signif(sd(AL_TOTAL_READS, na.rm=TRUE));
mean.ub.lane<-signif(mean(HS_ON_TARGET_BASES, na.rm=TRUE));
sd.ub.lane<-signif(sd(HS_ON_TARGET_BASES, na.rm=TRUE));
mean.cov.lane<-round(mean(HS_MEAN_TARGET_COVERAGE, na.rm=TRUE));
sd.cov.lane<-round(sd(HS_MEAN_TARGET_COVERAGE, na.rm=TRUE));
mean.10x.lane<-round(mean(HS_PCT_TARGET_BASES_10X, na.rm=TRUE));
mean.20x.lane<-round(mean(HS_PCT_TARGET_BASES_20X, na.rm=TRUE));
mean.30x.lane<-round(mean(HS_PCT_TARGET_BASES_30X, na.rm=TRUE));
sd.10x.lane<-round(sd(HS_PCT_TARGET_BASES_10X, na.rm=TRUE));
sd.20x.lane<-round(sd(HS_PCT_TARGET_BASES_20X, na.rm=TRUE));
sd.30x.lane<-round(sd(HS_PCT_TARGET_BASES_30X, na.rm=TRUE));
names<-paste(Project, " ", External.ID, "-", Lane, sep="")
#makes a plot of the number of SNPS called per lane
library(graphics)
pdf(file=paste(sample_sets, "_SNPS.pdf", sep=""), width=0.2*length(SNP_TOTAL_SNPS), height=0.1*length(SNP_TOTAL_SNPS))
layout(matrix(c(1,1 , 2), 1, 3, byrow=FALSE), respect=TRUE)
plot(1:length(SNP_TOTAL_SNPS), main="SNPs Called in Each Lane", SNP_TOTAL_SNPS, xlab="", ylab="SNPs Called in Lane", xaxt="n", pch=16, col="blue")
axis(side=1, at=(1:length(SNP_TOTAL_SNPS)), labels=names, cex.axis=0.75, las=2)
boxplot(SNP_TOTAL_SNPS, main="SNPs Called in Lane", ylab="SNPs Called")
if(length(boxplot.stats(SNP_TOTAL_SNPS)$out)==0){
mtext("No outliers", side=1, line=4)
}else{
mtext(paste("Outlier SNP call counts in ", length(boxplot.stats(SNP_TOTAL_SNPS)$out), "lanes"), side=1, line=4)
}
dev.off()
#makes SNP plot in log scale
pdf(file=paste(sample_sets, "_SNPS_log.pdf", sep=""), width=0.2*length(SNP_TOTAL_SNPS), height=0.1*length(SNP_TOTAL_SNPS))
layout(matrix(c(1,1 , 2), 1, 3, byrow=FALSE), respect=TRUE)
plot(1:length(SNP_TOTAL_SNPS), log(SNP_TOTAL_SNPS), main="SNPs Called in Each Lane", xlab="", ylab="Log(SNPs Called in Lane)", xaxt="n", pch=16, col="blue")
par(ylog=TRUE)
axis(side=1, at=(1:length(SNP_TOTAL_SNPS)), labels=names, cex.axis=0.75, las=2)
boxplot(SNP_TOTAL_SNPS, main="SNPs Called in Lane", ylab="SNPs Called")
if(length(boxplot.stats(SNP_TOTAL_SNPS)$out)==0){
mtext("No outliers", side=1, line=4)
}else{
mtext(paste("Outlier SNP call counts in ", length(boxplot.stats(SNP_TOTAL_SNPS)$out), "lanes"), side=1, line=4)
}
dev.off()
#makes a plot of snp calls ordered by lane
pdf(file=paste(sample_sets, "_SNPS_lane.pdf", sep=""), width=0.2*length(SNP_TOTAL_SNPS), height=0.1*length(SNP_TOTAL_SNPS))
layout(matrix(c(1,1 , 2), 1, 3, byrow=FALSE), respect=TRUE)
plot(1:length(SNP_TOTAL_SNPS), SNP_TOTAL_SNPS[order(Lane)], main="SNPs Called in Each Lane", xlab="", ylab="SNPs Called in Lane", xaxt="n", pch=16, col="blue")
axis(side=1, at=(1:length(SNP_TOTAL_SNPS)), labels=names[order(Lane)], cex.axis=0.75, las=2)
boxplot(SNP_TOTAL_SNPS, main="SNPs Called in Lane", ylab="SNPs Called")
if(length(boxplot.stats(SNP_TOTAL_SNPS)$out)==0){
mtext("No outliers", side=1, line=4)
}else{
mtext(paste("Outlier SNP call counts in ", length(boxplot.stats(SNP_TOTAL_SNPS)$out), "lanes"), side=1, line=4)
}
dev.off()
#makes a plot of fingerprint calls and labels them good or bad
badsnps<-union(which(FP_CONFIDENT_MATCHING_SNPS<15), which(FP_CONFIDENT_CALLS<15)) #lanes where either fingerprint metric is low; the original repeated the MATCHING_SNPS condition twice, assumed to be a typo
colors<-c(rep("Blue", length(FP_CONFIDENT_CALLS)))
colors[badsnps]<-"Red"
pdf(file=paste(sample_sets, "_Fingerprints.pdf", sep=""), width=.2*length(FP_CONFIDENT_CALLS), height=.1*length(FP_CONFIDENT_CALLS))
par(mar=c(6, 4, 5, 4))
plot(1:length(FP_CONFIDENT_MATCHING_SNPS), FP_CONFIDENT_MATCHING_SNPS, pch=16, ylim=c(0,24), ylab="Fingerprint calls", xlab="", xaxt="n", col=colors, main="Fingerprint Calling and Matching")
points(1:length(FP_CONFIDENT_MATCHING_SNPS), FP_CONFIDENT_CALLS, col=colors)
axis(side=1, at=(1:length(FP_CONFIDENT_CALLS)), labels=names, cex.axis=0.75, las=2)
if(length(badsnps)>0){
legend("bottomright", legend=c("Confident calls at fingerprint sites by lane", "Confident matching calls at fingerprint sites by lane", "Confident calls in bad lanes", "Confident matching calls in bad lanes"), pch=c(1, 16, 1, 16), col=c("Blue", "Blue", "Red", "Red"))
mtext("Some problematic fingerprint sites", side=3)
}else{
legend("bottomright", legend=c("Confident calls at fingerprint sites by lane", "Confident matching calls at fingerprint sites by lane"), pch=c(1, 16), col="Blue")
}
dev.off()
detach(bylane)
#Calc by sample metrics
attach(bysample);
mean.lanes.samp<-signif(mean(X..Lanes.included.in.aggregation, na.rm = TRUE));
sd.lanes.samp<-signif(sd(X..Lanes.included.in.aggregation, na.rm=TRUE));
mean.mrl.samp<-signif(mean(Mean.Read.Length, na.rm=TRUE));
sd.mrl.samp<-signif(sd(Mean.Read.Length, na.rm=TRUE));
mean.read.samp<-signif(mean(Total.Reads, na.rm=TRUE));
sd.read.samp<-signif(sd(Total.Reads, na.rm=TRUE));
mean.ub.samp<-signif(mean(On.Target.Bases..HS., na.rm=TRUE));
sd.ub.samp<-signif(sd(On.Target.Bases..HS., na.rm=TRUE));
mean.cov.samp<-round(mean(Mean.Target.Coverage..HS., na.rm=TRUE));
sd.cov.samp<-round(sd(Mean.Target.Coverage..HS., na.rm=TRUE));
mean.10x.samp<-round(mean(PCT.Target.Bases.10x..HS., na.rm=TRUE));
mean.20x.samp<-round(mean(PCT.Target.Bases.20x..HS., na.rm=TRUE));
mean.30x.samp<-round(mean(PCT.Target.Bases.30x..HS., na.rm=TRUE));
sd.10x.samp<-round(sd(PCT.Target.Bases.10x..HS., na.rm=TRUE));
sd.20x.samp<-round(sd(PCT.Target.Bases.20x..HS., na.rm=TRUE));
sd.30x.samp<-round(sd(PCT.Target.Bases.30x..HS., na.rm=TRUE));
detach(bysample);
#print all of this stuff out in R.
print(paste("Callable Target: ", callable.target, " bases", sep=""), quote = FALSE);
print(paste("Used Lanes per Sample: ", mean.lanes.samp, " +/- ", sd.lanes.samp, sep=""), quote=FALSE);
print(paste("Parities: ", singlelanes, " single lanes, ", pairedlanes, " paired lanes", sep=""), quote=FALSE);
print(paste("Read Legnths: ", mean.mrl.samp, " +/- ", sd.mrl.samp, sep=""), quote = FALSE);
print(paste("Reads per lane: ", mean.read.lane, " +/- ", sd.read.lane, sep=""), quote = FALSE);
print(paste("Reads per sample: ", mean.read.samp, " +/- ", sd.read.samp, sep=""), quote = FALSE);
print(paste("Used bases per lane: ", mean.ub.lane, " +/- ", sd.ub.lane, sep=""), quote = FALSE);
print(paste("Used bases per sample: ", mean.ub.samp, " +/- ", sd.ub.samp, sep=""), quote = FALSE)
print(paste("Average target coverage per lane: ", mean.cov.lane, " +/- ", sd.cov.lane, sep=""), quote = FALSE);
print(paste("Average target coverage per sample: ", mean.cov.samp, " +/- ", sd.cov.samp, sep=""), quote = FALSE);
print(paste("% loci covered to 10x per lane: ", mean.10x.lane, "% +/- ", sd.10x.lane, "%", sep=""), quote = FALSE)
print(paste("% loci covered to 10x per sample: ", mean.10x.samp, " +/- ", sd.10x.samp, "%", sep=""), quote = FALSE)
print(paste("% loci covered to 20x per lane: ", mean.20x.lane, "% +/- ", sd.20x.lane, "%", sep=""), quote = FALSE)
print(paste("% loci covered to 20x per sample: ", mean.20x.samp, "% +/- ", sd.20x.samp, "%", sep=""), quote = FALSE)
print(paste("% loci covered to 30x per lane: ", mean.30x.lane, "% +/- ", sd.30x.lane, "%", sep=""), quote = FALSE)
print(paste("% loci covered to 30x per sample: ", mean.30x.samp, "% +/- ", sd.30x.samp, "%", sep=""), quote = FALSE)
}else{
print("Lane and Sample metrics file paths not provided")
}
#Makes Error Rate percycle graph
if(is.na(eval)==FALSE){
errtable<-read.delim(eval, header=TRUE)
errpercycle<-errtable[2:ncol(errtable)]
pdf(paste(sample_sets, "_errorrate_per_cycle.pdf", sep=""), width=6, height=5)
crazies<-which(errpercycle[75,]>0.3) #this can be changed to any kind of filter for particular lanes
colors<-rainbow(ncol(errpercycle), s=0.5, v=0.5)
colors[crazies]<-rainbow(length(crazies))
weights<-rep(1, ncol(errpercycle))
weights[crazies]<-2
matplot(errpercycle, type="l", lty="solid", col=colors, lwd=weights, main="Error Rate per Cycle", ylab="Error Rate", xlab="Cycle", ylim=c(0, 0.7))
if(length(crazies)>0){
legend("topleft", title="Unusual Lanes", legend=colnames(errpercycle)[crazies], lty="solid", lwd=2, col=colors[crazies], xjust=0.5)
}else{
legend("topleft", legend="No unusual lanes.", bty="n")
}
dev.off()
}else{
print("Error Rate Per Cycle file paths not provided")
}
#Makes TI/TV known v novel graph
if(is.na(noveltitv)==FALSE && is.na(knowntitv) == FALSE){
pdf(paste(sample_sets, "_TiTv.pdf", sep=""), width=6, height=5)
read.table(file=noveltitv, header=FALSE)->novels
read.table(file=knowntitv, header=FALSE)->knowns
plot(novels[,2], col="red", ylim=c(0, 3.5), main="Ti/Tv for Novel and Known SNP calls", ylab="Ti/Tv", xlab="", xaxt="n")
points(knowns[,2], col="blue")
axis(side=1, at=(1:length(novels[,2])), labels=novels[,1], cex.axis=1, las=2)
legend("bottomright", legend=c("Known Variants", "Novel Variants"), col=c("blue", "red"), pch=1, xjust=0.5)
mtext("Lower Ti/Tv ratios indicated more false positive SNP calls.", side=1)
dev.off()
}else{
print("Transition/transversion ratio file paths not provided")
}
#Make DOC graph
if(is.na(DOC)==FALSE){
pdf(paste(sample_sets, "_DOC.pdf", sep=""), width=6, height=5)
DOCtable<-read.delim(DOC, header=TRUE)
as.matrix(as.vector(DOCtable[,2:502]))->DOCdata
DOCdata<-matrix(DOCdata*100/sum(DOCdata[1,]), nrow=501, ncol=29, byrow=TRUE) #NB: hardcoded for 29 samples x 501 depth bins
colnames(DOCdata)<-DOCtable[,1]
oddies<-which(apply(DOCdata, 2, max)>10) #can be assigned any particular heuristic
ncolors<-rainbow(ncol(DOCdata), s=0.5, v=0.5)
ncolors[oddies]<-rainbow(length(oddies))
nweights<-rep(1, ncol(DOCdata))
nweights[oddies]<-2
matplot(DOCdata, type="l", main="Depth of Coverage by Sample", ylab="Percent bases covered to a given depth", xlab="log(Depth)", log="x", col=ncolors, lty="solid", lwd=nweights)
if(length(oddies)>0){
legend("topright", title="Unusual Cases", legend=colnames(DOCdata)[oddies], lty="solid", lwd=2, col=ncolors[oddies], xjust=0.5)
}else{
legend("topright", legend="No unusual cases.", bty="n")
}
dev.off()
}else{
print("Depth of Coverage filepath not provided")
}

View File

@ -1,138 +0,0 @@
titvFPEst <- function(titvExpected, titvObserved) { max(min(1 - (titvObserved - 0.5) / (titvExpected - 0.5), 1), 0.001) }
titvFPEstV <- function(titvExpected, titvs) {
sapply(titvs, function(x) titvFPEst(titvExpected, x))
}
calcHet <- function(nknown, knownTiTv, nnovel, novelTiTv, callable) {
TP <- nknown + (1-titvFPEst(knownTiTv, novelTiTv)) * nnovel
2 * TP / 3 / callable
}
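# Worked example (illustrative counts): 40000 known calls at Ti/Tv 2.1 plus
# 10000 novel calls at Ti/Tv 2.0 over 30Mb of callable target. titvFPEst gives
# a novel false-positive estimate of 1 - 1.5/1.6 = 0.0625, so TP = 49375 and
# the heterozygosity estimate is 2 * TP / 3 / callable ~ 1.1e-3:
# calcHet(40000, 2.1, 10000, 2.0, 30e6)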
marginalTiTv <- function( nx, titvx, ny, titvy ) {
tvx = nx / (titvx + 1)
tix = nx - tvx
tvy = ny / (titvy + 1)
tiy = ny - tvy
tiz = tix - tiy
tvz = tvx - tvy
return(tiz / tvz)
}
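# Worked example (illustrative counts): Ti/Tv of the calls unique to callset x
# (10000 calls, Ti/Tv 2.1) after removing subset y (6000 calls, Ti/Tv 2.3):
# tvx ~ 3226, tix ~ 6774; tvy ~ 1818, tiy ~ 4182; marginal Ti/Tv ~ 2592/1408 ~ 1.84
# marginalTiTv(10000, 2.1, 6000, 2.3)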
marginaldbSNPRate <- function( nx, dbx, ny, dby ) {
knownx = nx * dbx / 100
novelx = nx - knownx
knowny = ny * dby / 100
novely = ny - knowny
knownz = knownx - knowny
novelz = novelx - novely
return(knownz / ( knownz + novelz ) * 100)
}
numExpectedCalls <- function(L, theta, calledFractionOfRegion, nIndividuals, dbSNPRate) {
nCalls <- L * theta * calledFractionOfRegion * sum(1 / seq(1, 2 * nIndividuals))
return(list(nCalls = nCalls, nKnown = dbSNPRate * nCalls, nNovel = (1-dbSNPRate) * nCalls))
}
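# Worked example (illustrative parameters): expected segregating sites in a
# 30Mb fully-callable target at theta = 0.001 across 50 individuals; the
# harmonic sum over 2N = 100 chromosomes is ~5.19, so nCalls ~ 1.56e5, and a
# 90% dbSNP rate splits that into ~140k known and ~16k novel:
# numExpectedCalls(3e7, 0.001, 1.0, 50, 0.9)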
normalize <- function(x) {
x / sum(x)
}
normcumsum <- function(x) {
cumsum(normalize(x))
}
cumhist <- function(d, ...) {
plot(d[order(d)], type="b", col="orange", lwd=2, ...)
}
revcumsum <- function(x) {
return(rev(cumsum(rev(x))))
}
phred <- function(x) {
log10(max(x,10^(-9.9)))*-10
}
pOfB <- function(b, B, Q) {
#print(paste(b, B, Q))
p = 1 - 10^(-Q/10)
if ( b == B )
return(p)
else
return(1 - p)
}
pOfG <- function(bs, qs, G) {
a1 = G[1]
a2 = G[2]
log10p = 0
for ( i in 1:length(bs) ) {
b = bs[i]
q = qs[i]
p1 = pOfB(b, a1, q) / 2 + pOfB(b, a2, q) / 2
log10p = log10p + log10(p1)
}
return(log10p)
}
pOfGs <- function(nAs, nBs, Q) {
bs = c(rep("a", nAs), rep("t", nBs))
qs = rep(Q, nAs + nBs)
G1 = c("a", "a")
G2 = c("a", "t")
G3 = c("t", "t")
log10p1 = pOfG(bs, qs, G1)
log10p2 = pOfG(bs, qs, G2)
log10p3 = pOfG(bs, qs, G3)
Qsample = phred(1 - 10^log10p2 / sum(10^(c(log10p1, log10p2, log10p3))))
return(list(p1=log10p1, p2=log10p2, p3=log10p3, Qsample=Qsample))
}
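# Usage sketch: genotype log10-likelihoods and the phred-scaled sample quality
# for a pileup of 5 reference 'a' bases and 5 alternate 't' bases, all at Q20;
# the het genotype dominates, so Qsample is high:
# pOfGs(5, 5, 20)$Qsample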
QsampleExpected <- function(depth, Q) {
weightedAvg = 0
for ( d in 1:(depth*3) ) {
Qsample = 0
pOfD = dpois(d, depth)
for ( nBs in 0:d ) {
pOfnB = dbinom(nBs, d, 0.5)
nAs = d - nBs
Qsample = pOfGs(nAs, nBs, Q)$Qsample
#Qsample = 1
weightedAvg = weightedAvg + Qsample * pOfD * pOfnB
print(as.data.frame(list(d=d, nBs = nBs, pOfD=pOfD, pOfnB = pOfnB, Qsample=Qsample, weightedAvg = weightedAvg)))
}
}
return(weightedAvg)
}
plotQsamples <- function(depths, Qs, Qmax) {
cols = rainbow(length(Qs))
plot(depths, rep(Qmax, length(depths)), type="n", ylim=c(0,Qmax), xlab="Average sequencing coverage", ylab="Qsample", main = "Expected Qsample values, including depth and allele sampling")
for ( i in 1:length(Qs) ) {
Q = Qs[i]
y = as.numeric(lapply(depths, function(x) QsampleExpected(x, Q)))
points(depths, y, col=cols[i], type="b")
}
legend("topleft", paste("Q", Qs), fill=cols)
}
pCallHetGivenDepth <- function(depth, nallelesToCall) {
depths = 0:(2*depth)
pNoAllelesToCall = apply(as.matrix(depths),1,function(d) sum(dbinom(0:nallelesToCall,d,0.5)))
dpois(depths,depth)*(1-pNoAllelesToCall)
}
pCallHets <- function(depth, nallelesToCall) {
sum(pCallHetGivenDepth(depth,nallelesToCall))
}
pCallHetMultiSample <- function(depth, nallelesToCall, nsamples) {
1-(1-pCallHets(depth,nallelesToCall))^nsamples
}
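# Usage sketch: chance of calling a het site at mean 4x coverage when more
# than one alternate-allele read is required, for one sample and then for at
# least one sample in a cohort of 100:
# pCallHets(4, 1)
# pCallHetMultiSample(4, 1, 100)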

View File

@ -1,120 +0,0 @@
count_zeros = function(list) {
zeros = 0
for (x in list) {
if (x == 0.0) {
zeros = zeros + 1
}
}
zeros
}
load = function(max_rows) {
files = list.files(path=".", pattern="304NA.*")
#max_rows = -1
#FREESTANDING as a filter
#HIT_TWICE for ZEROS...
print ("Parsing file 1")
t = read.table(files[1],header=T, nrows = max_rows)
f = data.frame(loc=t$location,gc=t$gc,freestanding=t$freestanding)
ht = data.frame(1:nrow(f))
for (file in files) {
print (file)
t = read.table(file, header=T, nrows = max_rows)
norm_cov = t$normalized_coverage
#names(norm_cov) = c("norm_cov.1")
f=cbind (f, norm_cov)
ht=cbind (ht, t$hit_twice)
}
wgs = read.table("/seq/dirseq/analysis/agilent/rt-pcr/perfdata//OV-0751-WGS.baits.coverage.txt", header=T, nrows = max_rows)
f=cbind (f, wgs_norm_cov = wgs$normalized_coverage)
f=cbind(f,ht)
# Compute normalized variance
print("Calculating variance")
var = apply(f[4:10], 1, var)
print("Calculating std. dev.")
sd = apply(f[4:10], 1, sd)
print("Calculating mean")
mean = apply(f[4:10], 1, mean)
print("Binding normalized variance")
f=cbind (f, normvar=var/mean/mean)
print("Binding normalized std. dev.")
f=cbind (f, normsd=sd/mean)
print("Binding mean")
f=cbind (f, mean=mean)
print("Binding std. dev.")
f=cbind (f, sd=sd)
print("Binding variance")
f=cbind (f, var=var)
print("Calculating and binding number of zeros")
count_zeros = apply(f[4:10], 1, count_zeros)
num_not_hit_twice = apply(f[12:18], 1, count_zeros)
f=cbind(f, count_zeros, num_not_hit_twice)
print ("Parsing sequences file")
seqs = read.table("whole_exome_agilent_designed_120.design.1line.sorted2",header=T,nrows=max_rows)
f=cbind (f, seqs)
#of = f[order(f$normvar),]
}
write_splits = function(f) {
set.seed(0987123409)
# Low variance
nz = f[f$count_zeros < 1 & f$freestanding==1,] # Take reads with no zeros
d = write_split(nz, "Low_GC_Norm_Coverage", 0.0, 0.35, 0.8, 1.2, 0.0, 0.3, 0.0)
d = rbind(d,write_split(nz, "Mid_GC_Norm_Coverage", 0.45, 0.55, 0.8, 1.2, 0.0, 0.1, 0.0))
d = rbind(d,write_split(nz, "High_GC_Norm_Coverage", 0.63, 1.0, 0.8, 1.2, 0.0, 0.3, 0.0))
d = rbind(d,write_split(nz, "Low_GC_Undercovered", 0.0, 0.35, 0.2, 0.3, 0.0, 0.3, 0.0))
d = rbind(d,write_split(nz, "Mid_GC_Undercovered", 0.45, 0.55, 0.2, 0.3, 0.0, 0.3, 0.0))
d = rbind(d,write_split(nz, "High_GC_Undercovored", 0.63, 1.0, 0.2, 0.3, 0.0, 0.3, 0.0))
az = f[f$count_zeros == 7 & f$freestanding==1,] # Take reads with all zeros
d = rbind(d,write_split(az, "Low_GC_No_Coverage", 0.0, 0.35, 0.0, 0.1, -1.0, -1.0, 0.1))
d = rbind(d,write_split(az, "Mid_GC_No_Coverage", 0.45, 0.55, 0.0, 0.1, -1.0, -1.0, 0.1))
d = rbind(d,write_split(az, "High_GC_No_Coverage", 0.63, 1.0, 0.0, 0.1, -1.0, -1.0, 0.01))
# High variance
d = rbind(d,write_split(nz, "Mid_GC_Norm_Coverage_High_Variation", 0.45, 0.55, 0.8, 1.2, 0.355, 1000.0))
d
}
write_split = function(data, label, gc_low, gc_high, cov_low, cov_high, normsd_low, normsd_high, wgs_cov_low = 0.0) {
if (normsd_high < 0.0) {
# We have no coverage samples
s = data[data$gc >= gc_low & data$gc <= gc_high & data$mean >= cov_low & data$mean <= cov_high & data$wgs_norm_cov >= wgs_cov_low,]
#s = s[order(runif(nrow(s))),] # Randomize rows
s = s[order(s$wgs_norm_cov, decreasing = T),] # order according to norm SD
}else{
# We have low or normal coverage samples, so take those with tightest norm SDs
s = data[data$gc >= gc_low & data$gc <= gc_high & data$mean >= cov_low & data$mean <= cov_high & data$normsd >= normsd_low & data$normsd <= normsd_high ,]
s = s[order(s$normsd),] # order according to norm SD
}
# & data$mean < 1.1 & data$mean > 0.9,]
# & data$mean >= cov_low & data$mean <= cov_high
#print(s)
print(nrow(s))
s = s[1:50, ] #-c(3,11,12:18,19,23:25)]
s = cbind(class=rep(label,50), s)
s
}
#f=load()
#nz=f[f$count_zeros < 1,]
#print(summary(nz))
create_500 = function() {
f = load(-1)
s = write_splits(f)
write.csv(s, "500_exome_baits_for_nanostring.csv")
}

View File

@ -1,27 +0,0 @@
plot1 <- function(d, name) {
d = subset(d, dataset == name)
subd = data.frame(parallel.type=d$parallel.type, nWaysParallel=d$nWaysParallel, end.to.end.time=d$end.to.end.time,per.1M.sites = d$per.1M.sites, job.run.time = d$job.run.time)
nways = unique(subd$nWaysParallel)
m = max(subset(subd, nWaysParallel == min(nways))$end.to.end.time)
nNW = subset(subd, end.to.end.time == m)$nWaysParallel[1]
timeAt1 = m * nNW
my.runtime = subset(subd, end.to.end.time == m)$job.run.time[1] * nNW
my.pms = subset(subd, end.to.end.time == m)$per.1M.sites[1]
theo = data.frame(parallel.type="theoretic", end.to.end.time=timeAt1/nways, nWaysParallel=nways, per.1M.sites = my.pms, job.run.time = my.runtime / nways)
subd = rbind(subd, theo)
print(summary(subd))
print(xyplot(log10(end.to.end.time) + per.1M.sites + log10(job.run.time) ~ log2(nWaysParallel), data=subd[order(subd$nWaysParallel),], group=parallel.type, type="b", outer=T, scale=list(relation="free"), auto.key=T, lwd=c(2,2,1), main=name))
return(subd)
}
myData <- read.table("results.new.dat", header=T)
require("lattice")
for (name in unique(myData$dataset))
plot1(myData, name)

View File

@ -1,121 +0,0 @@
import sys
from optparse import OptionParser
from itertools import *
import random
import re
import datetime
# a simple script that parses GATK Queue log files and emits a whitespace-delimited
# table of per-run timing statistics for downstream plotting:
# start/end times, end-to-end minutes, per-1M-sites rate, and total job run time
def main():
global OPTIONS
usage = "usage: %prog [options] outputFile"
parser = OptionParser(usage=usage)
(OPTIONS, args) = parser.parse_args()
if len(args) == 0:
parser.error("Requires at least one argument")
print 'file dataset parallel.type nWaysParallel start.time end.time end.to.end.time per.1M.sites job.run.time'
typere = '.*/(.*).ptype_(\w+).nways_(\d+).*'
for file in args:
startTime, endTime, perMSites, runtime = None, None, None, None
for line in open(file):
match = re.match(typere, line)
if match != None: dataset, parallelType, nWays = match.groups()
startTime = captureStartTime(line, startTime)
perMSites = capturePerMSites(line, perMSites)
endTime = captureEndTime(line, endTime)
runtime = captureRuntime(line, runtime)
print file, dataset, parallelType, nWays, formatTime(startTime), formatTime(endTime), endToEnd(endTime, startTime), perMSites, runtime
def endToEnd(endTime, startTime):
if endTime < startTime:
endTime = endTime + datetime.timedelta(1)
#print 'endToEnd', endTime, startTime
return total_minutes(endTime - startTime)
def formatTime(t):
return datetime.datetime.strftime(t, formatString)
def total_minutes(td):
return td.days * 24 * 60 + td.seconds / 60.0
def captureLine(line, regex, func, prevValue):
match = regex.match(line)
if match != None:
if func != None:
val = func(line)
else:
val = match.group(1)
else:
val = None
#print 'Matching', line, regex, match, prevValue, val
return val
formatString = "%H:%M:%S"
def captureStartTime(line, prev):
# todo - needs to find the earliest time
#INFO 11:03:50,202 HelpFormatter - The Genome Analysis Toolkit (GATK) v<unknown>, Compiled <unknown>
regex = re.compile("INFO\W*(\d+:\d+:\d+).*The Genome Analysis Toolkit.*")
return selectTime(captureLine(line, regex, None, prev), prev, earlier = True)
def selectTime(newTimeString, oldTime, earlier = False):
def select():
if newTimeString == None:
return oldTime
else:
newTime = datetime.datetime.strptime(newTimeString, formatString)
if oldTime == None:
return newTime
elif earlier:
if newTime < oldTime:
return newTime
else:
return oldTime
else:
if newTime > oldTime:
return newTime
else:
return oldTime
r = select()
#if not earlier: print 'selectTime', oldTime, newTimeString, r
return r
def captureEndTime(line, prev):
# todo - needs to find the latest time
regex = re.compile("INFO\W*(\d+:\d+:\d+).*GATKRunReport - Aggregating data for run report.*")
return selectTime(captureLine(line, regex, None, prev), prev, earlier=False)
unitsToMinutes = {
'm' : 1.0,
'h' : 60,
's' : 1.0/60,
'd' : 60 * 24
}
def capturePerMSites(line, prev):
return captureDoneLine(line, prev, 8, 10)
def captureRuntime(line, prev):
return captureDoneLine(line, prev, 6, 8)
def captureDoneLine(line, prev, s, e):
# INFO 11:04:11,541 TraversalEngine - chr1:3769010 1.32e+05 20.0 s 2.5 m 1.5% 21.9 m 21.5 m
regex = re.compile("INFO .*TraversalEngine -.*done*")
val = captureLine(line, regex, lambda x: x.split()[s:e], None)
if val == None:
return prev
else:
x, u = val
return float(x) * unitsToMinutes[u]
if __name__ == "__main__":
main()
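# Usage sketch (the script file name here is hypothetical): run over the Queue
# logs selected by toTime.txt and redirect to the table read by the plotting
# script, e.g.
#   python parseDistributedLogs.py `cat toTime.txt` > results.new.dat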

View File

@ -1,200 +0,0 @@
import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.extensions.samtools.SamtoolsIndexFunction
import org.broadinstitute.sting.queue.QScript
import org.apache.commons.io.FilenameUtils;
class DistributedGATKPerformance extends QScript {
qscript =>
@Argument(shortName="gatk", doc="gatk jar file", required=true)
var gatkJarFile: File = _
@Argument(shortName="outputDir", doc="output directory", required=false)
var outputDir: String = ""
@Argument(shortName="dataset", doc="selects the datasets to run. If not provided, all datasets will be used", required=false)
var datasets: List[String] = Nil
@Argument(shortName="waysParallel", doc="selects the datasets to run. If not provided, all datasets will be used", required=false)
var waysParallelArg: List[Int] = Nil
@Argument(shortName="long", doc="runs long calculations", required=false)
var long: Boolean = false
@Argument(shortName="test", doc="runs long calculations", required=false)
var test: Boolean = false
@Argument(shortName="limitTo30Min", doc="runs long calculations", required=false)
var limitTo30Min: Boolean = false
@Argument(shortName="huge", doc="runs long calculations", required=false)
var huge: Int = -1
@Argument(shortName="justDist", doc="runs long calculations", required=false)
var justDist: Boolean = false
@Argument(shortName="justSG", doc="runs long calculations", required=false)
var justSG: Boolean = false
@Argument(shortName="trackerDir", doc="root directory for distributed tracker files", required=false)
var trackerDir: String = "" // "/humgen/gsa-scr1/depristo/tmp/"
trait UNIVERSAL_GATK_ARGS extends CommandLineGATK { logging_level = "DEBUG"; jarFile = gatkJarFile; memoryLimit = 2; }
class Target(
val baseName: String,
val reference: File,
val dbsnpFile: String,
val hapmapFile: String,
val maskFile: String,
val bamList: File,
val goldStandard_VCF: File,
val intervals: String,
val titvTarget: Double,
val isLowpass: Boolean,
val useBAQ: Boolean) {
val name = qscript.outputDir + baseName
val clusterFile = new File(name + ".clusters")
def rawVCF(part: String) = new File(name + "." + part + ".raw.vcf")
val filteredVCF = new File(name + ".filtered.vcf")
val titvRecalibratedVCF = new File(name + ".titv.recalibrated.vcf")
val tsRecalibratedVCF = new File(name + ".ts.recalibrated.vcf")
val goldStandardName = qscript.outputDir + "goldStandard/" + baseName
val goldStandardClusterFile = new File(goldStandardName + ".clusters")
}
val hg18 = new File("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")
val b36 = new File("/humgen/1kg/reference/human_b36_both.fasta")
val b37 = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
val dbSNP_hg18 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_130_hg18.rod"
val dbSNP_b36 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_130_b36.rod"
val dbSNP_b37 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_132_b37.leftAligned.vcf"
val hapmap_hg18 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.hg18_fwd.vcf"
val hapmap_b36 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b36_fwd.vcf"
val hapmap_b37 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf"
val indelMask_b36 = "/humgen/1kg/processing/pipeline_test_bams/pilot1.dindel.mask.b36.bed"
val indelMask_b37 = "/humgen/1kg/processing/pipeline_test_bams/pilot1.dindel.mask.b37.bed"
// ToDos:
// reduce the scope of the datasets so the script is more nimble
// figure out how to give names to all the Queue-LSF logs (other than Q-1931@node1434-24.out) so that it is easier to find logs for certain steps
// create gold standard BAQ'd bam files, no reason to always do it on the fly
// Analysis to add at the end of the script:
// auto generation of the cluster plots
// spike in NA12878 to the exomes and to the lowpass, analysis of how much of her variants are being recovered compared to single sample exome or HiSeq calls
// produce Kiran's Venn plots based on comparison between new VCF and gold standard produced VCF
val lowPass: Boolean = true
val targetDataSets: Map[String, Target] = Map(
"HiSeq" -> new Target("NA12878.HiSeq", hg18, dbSNP_hg18, hapmap_hg18,
"/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/HiSeq.WGS.cleaned.indels.10.mask",
new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam"),
new File("/home/radon01/depristo/work/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/HiSeq.WGS.cleaned.ug.snpfiltered.indelfiltered.vcf"),
"/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/distributedGATK/whole_genome_chunked.hg18.intervals", 2.07, !lowPass, true),
"FIN" -> new Target("FIN", b37, dbSNP_b37, hapmap_b37, indelMask_b37,
new File("/humgen/1kg/processing/pipeline_test_bams/FIN.79sample.Nov2010.chr20.bam"),
new File("/humgen/gsa-hpprojects/dev/data/AugChr20Calls_v4_3state/ALL.august.v4.chr20.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED **
"/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/distributedGATK/whole_genome_chunked.chr20.hg19.intervals", 2.3, lowPass, true),
"WEx" -> new Target("NA12878.WEx", hg18, dbSNP_hg18, hapmap_hg18,
"/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/GA2.WEx.cleaned.indels.10.mask",
new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.WEx.cleaned.recal.bam"),
new File("/home/radon01/depristo/work/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/GA2.WEx.cleaned.ug.snpfiltered.indelfiltered.vcf"),
"/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.targets.interval_list", 2.6, !lowPass, true),
"TGPWExGdA" -> new Target("1000G.WEx.GdA", b37, dbSNP_b37, hapmap_b37, indelMask_b37,
new File("/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/distributedGATK/Barcoded_1000G_WEx_Reduced_Plate_1.20.cleaned.list"), // BUGBUG: reduce from 60 to 20 people
new File("/humgen/gsa-scr1/delangel/NewUG/calls/AugustRelease.filtered_Q50_QD5.0_SB0.0.allSamples.SNPs_hg19.WEx_UG_newUG_MQC.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED **
"/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 2.6, !lowPass, true),
"LowPassN60" -> new Target("lowpass.N60", b36, dbSNP_b36, hapmap_b36, indelMask_b36,
new File("/humgen/1kg/analysis/bamsForDataProcessingPapers/lowpass_b36/lowpass.chr20.cleaned.matefixed.bam"), // the bam list to call from
new File("/home/radon01/depristo/work/oneOffProjects/VQSRCutByNRS/lowpass.N60.chr20.filtered.vcf"), // the gold standard VCF file to run through the VQSR
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.b36.intervals", 2.3, lowPass,true), // chunked interval list to use with Queue's scatter/gather functionality
"LowPassAugust" -> new Target("ALL.august.v4", b37, dbSNP_b37, hapmap_b37, indelMask_b37, // BUGBUG: kill this, it is too large
new File("/humgen/1kg/processing/allPopulations_chr20_august_release.cleaned.merged.bams/ALL.cleaned.merged.list"),
new File("/humgen/gsa-hpprojects/dev/data/AugChr20Calls_v4_3state/ALL.august.v4.chr20.filtered.vcf"),
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals", 2.3, lowPass, true),
"LowPassEUR363Nov" -> new Target("EUR.nov2010", b37, dbSNP_b37, hapmap_b37, indelMask_b37,
new File("/humgen/1kg/processing/pipeline_test_bams/EUR.363sample.Nov2010.chr20.bam"),
new File("/humgen/gsa-hpprojects/dev/data/AugChr20Calls_v4_3state/ALL.august.v4.chr20.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED **
"/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/distributedGATK/whole_genome_chunked.chr20.hg19.intervals", 2.3, lowPass,false),
"WExTrio" -> new Target("NA12878Trio.WEx", b37, dbSNP_b37, hapmap_b37, indelMask_b37,
new File("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WEx.bwa.cleaned.recal.bams.list"),
new File("/humgen/gsa-scr1/delangel/NewUG/calls/AugustRelease.filtered_Q50_QD5.0_SB0.0.allSamples.SNPs_hg19.WEx_UG_newUG_MQC.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED **
"/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 2.6, !lowPass, true)
)
def getTargetInterval(target: Target): List[String] = target.name match {
case "NA12878.HiSeq" => List("chr1")
case "FIN" => List("20")
case "ALL.august.v4" => List("20")
case "EUR.nov2010" => List("20")
case _ => List(target.intervals)
}
def script = {
// Selects the datasets in the -dataset argument and adds them to targets.
var targets: List[Target] = List()
if (!datasets.isEmpty)
for (ds <- datasets)
targets ::= targetDataSets(ds) // Could check if ds was misspelled, but this way an exception will be thrown, maybe it's better this way?
else // If -dataset is not specified, all datasets are used.
for (targetDS <- targetDataSets.valuesIterator) // for Scala 2.7 or older, use targetDataSets.values
targets ::= targetDS
val nWays = if ( test ) List(32) else { if ( long ) List(1,2,4,8) else if ( huge != -1 ) List(huge) else List(16,32,64,128) }
//val nWays = List(2)
for (target <- targets) {
for ( scatterP <- if ( test ) List(false) else if ( justSG ) List(true) else if ( justDist ) List(false) else List(true, false) )
for (nWaysParallel <- nWays ) {
val aname = "ptype_%s.nways_%d".format(if ( scatterP ) "sg" else "dist", nWaysParallel)
def addUG(ug: UnifiedGenotyper) = {
if ( ! long )
ug.jobLimitSeconds = 60 * 60 * 4
if ( limitTo30Min )
ug.jobLimitSeconds = 60 * 30
add(ug);
}
// add scatter/gather or distributed parallelism
if ( scatterP ) {
var ug: UnifiedGenotyper = new UnifiedGenotyper(target, aname)
ug.scatterCount = nWaysParallel
ug.intervalsString ++= List(target.intervals)
addUG(ug)
} else {
for ( part <- 1 to nWaysParallel) {
var ug: UnifiedGenotyper = new UnifiedGenotyper(target, aname + ".part" + part)
ug.intervalsString ++= getTargetInterval(target)
ug.processingTracker = new File(trackerDir + target.name + "." + aname + ".distributed.txt")
ug.processingTrackerID = part
if ( part == 1 )
ug.performanceLog = new File("%s.%s.pf.log".format(target.name, aname))
ug.processingTrackerStatusFile = new File("%s.%s.%d.ptstatus.log".format(target.name, aname, part))
addUG(ug)
}
}
}
}
}
// 1.) Call SNPs with UG
class UnifiedGenotyper(t: Target, aname: String) extends org.broadinstitute.sting.queue.extensions.gatk.UnifiedGenotyper with UNIVERSAL_GATK_ARGS {
this.reference_sequence = t.reference
this.dcov = if ( t.isLowpass ) { 50 } else { 250 }
this.stand_call_conf = if ( t.isLowpass ) { 4.0 } else { 30.0 }
this.stand_emit_conf = if ( t.isLowpass ) { 4.0 } else { 30.0 }
this.input_file :+= t.bamList
this.out = t.rawVCF(aname)
this.baq = if (t.useBAQ) {org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.RECALCULATE} else {org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.OFF}
this.analysisName = t.name + "_UG." + aname
if (t.dbsnpFile.endsWith(".rod"))
this.DBSNP = new File(t.dbsnpFile)
else if (t.dbsnpFile.endsWith(".vcf"))
this.rodBind :+= RodBind("dbsnp", "VCF", t.dbsnpFile)
}
}

View File

@ -1,3 +0,0 @@
d <- read.table("../GATK/trunk/timer.dat", header=T)
require("lattice")
print(xyplot(elapsed.time + delta ~ cycle | name, data=d, scales=list(relation="free"), auto.key=T, type="b", outer=T))

View File

@ -1 +0,0 @@
grep -l -e "ptype_sg" -e "part1\." short/Q-*.out long/Q-*.out > toTime.txt
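# The grep above collects the Queue logs worth timing: every scatter/gather
# run ("ptype_sg") plus part 1 of each distributed run, listed in toTime.txt.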

View File

@ -1 +0,0 @@
echo "63025520" | awk '{ for(i = 0; i < $1; i += 100000) {print "20:" i+1 "-" (i+100000 < $1 ? i+100000 : $1)}}' > whole_genome_chunked.chr20.hg19.intervals

View File

@ -1,34 +0,0 @@
JOB_START_RATE = 0.1 # chance of starting is 0.1
WORK_UNITS = 100
WORK_RATE = 1
N_TICKS = 300
ticks <- 1:N_TICKS
# the probability that a job starts at exactly tick i
pThreadStartAtTick <- function(i) {
dexp(i, JOB_START_RATE)
}
jobDoneByI <- function(i) {
return(sapply(i - ticks, function(x) max(x, 0)) * WORK_RATE)
#return(pCompleteAtI(i, pStarts, ticks))
}
pThreadDoneByI <- function(i) {
pStarts <- pThreadStartAtTick(ticks)
workDoneByThreadStartingAtI <- jobDoneByI(i)
fracDone <- workDoneByThreadStartingAtI / WORK_UNITS
doneAtI <- fracDone >= 1
return(sum(pStarts * doneAtI))
}
pThreadsDoneByI <- function(i, nThreads) {
pDone <- rep(0, N_TICKS)
for ( thread in 1:nThreads )
pDone <- pPrevThreadsNotDoneAtI(pDone, i) + pThreadDoneByI(i) #NB: pPrevThreadsNotDoneAtI is not defined in this file
}
#plot(ticks, workDoneByI(100))
plot(ticks, sapply(ticks, function(i) pThreadDoneByI(i)))

View File

@ -1,11 +0,0 @@
#!/bin/tcsh
setenv CMD "java -Djava.io.tmpdir=/broad/shptmp/depristo/tmp -jar /humgen/gsa-scr1/depristo/dev/GenomeAnalysisTKFromLaptop/trunk/dist/Queue.jar -statusTo depristo -S /humgen/gsa-scr1/depristo/dev/GenomeAnalysisTKFromLaptop/trunk/analysis/depristo/distributedGATK/distributedGATKPerformance.scala -bsub --gatkjarfile /humgen/gsa-scr1/depristo/dev/GenomeAnalysisTKFromLaptop/trunk/dist/GenomeAnalysisTK.jar -dataset HiSeq $argv[2-$#argv]"
if ( $1 == 1 ) then
pushd short; $CMD -jobQueue hour -run &
else if ( $1 == 2 ) then
pushd long; $CMD -jobQueue gsa -long -run &
else
$CMD
endif

View File

@ -1,40 +0,0 @@
require("lattice")
require("ggplot2")
require("splines")
ymax = xmax = 30
HAVE_RAW_DATA = F
if ( HAVE_RAW_DATA ) {
inputDataFile = "~/Dropbox/Analysis/genotypeAccuracy/NA12878.hm3.vcf.cgl.table"
#inputDataFile = "~/Dropbox/Analysis/genotypeAccuracy/cgl.table.gz"
r <- digestTable(inputDataFile)
d = r$d
eByComp = r$eByComp
countsByTech = addEmpiricalPofG(ddply(d, .(ref, alt, technology, pGGivenDType, pGGivenD), genotypeCounts))
print(qplot(pGGivenD, EmpiricalPofGQ, data=subset(countsByTech, technology=="HiSeq-paper" & pGGivenDType == "QofABGivenD"), facets = alt ~ ref, color=alt, geom=c("point"), group=alt, xlim=c(0,xmax), ylim=c(0,ymax))
+ geom_abline(slope=1, linetype=2))
# + geom_smooth(se=T, size=1.5, aes(weight=Sum)))
} else {
eByComp = read.table("~/Dropbox/GSA members/Analysis/genotypeAccuracy/NA12878.hm3.vcf.cgl.table.eByComp.tsv", header=T)
}
#print(subset(countsByTech, pGGivenD > 18 & pGGivenD < 22 & pGGivenDType == "QofABGivenD"))
#print(subset(eByComp, EmpiricalPofGQ < Inf))
goodEByComp = subset(eByComp, Sum > 10 & EmpiricalPofGQ < Inf)
print(qplot(pGGivenD, EmpiricalPofGQ, data=goodEByComp, size=log10(Sum), facets = pGGivenDType ~ technology, color=pGGivenDType, geom=c("point", "smooth"), group=pGGivenDType, xlim=c(0,xmax), ylim=c(0,ymax)) + geom_abline(slope=1, linetype=2))
print(qplot(pGGivenD, EmpiricalPofGQ, data=goodEByComp, facets = pGGivenDType ~ technology, color=rg, geom=c("blank"), group=rg, xlim=c(0,xmax), ylim=c(0,ymax))
+ geom_abline(slope=1, linetype=2)
+ geom_smooth(se=F, aes(weight=Sum)))
print(qplot(pGGivenD, pGGivenD - EmpiricalPofGQ, data=goodEByComp, facets = pGGivenDType ~ technology, color=rg, geom=c("blank"), group=rg, xlim=c(0,xmax), ylim=c(-10,10))
+ geom_abline(slope=0, linetype=2)
+ geom_smooth(se=F, method=lm, formula = y ~ ns(x,1), aes(weight=Sum)))
# By tech
print(qplot(pGGivenD, EmpiricalPofGQ, data=goodEByComp, facets = pGGivenDType ~ ., color=technology, geom=c("blank"), group=technology, xlim=c(0,xmax), ylim=c(0,ymax))
+ geom_abline(slope=1, linetype=2)
+ geom_smooth(se=T, size=1.5, aes(weight=Sum)))

View File

@ -1,62 +0,0 @@
#!/bin/env Rscript
require("ggplot2")
args <- commandArgs(TRUE)
verbose = TRUE
inputDataFile = args[1]
onCmdLine = ! is.na(inputDataFile)
addEmpiricalPofG <- function(d) {
r = c()
#
# TODO -- this is a really naive estimate of the accuracy, as it assumes the comp
# track is perfect. In reality the chip is at best Q30 accurate (replicate samples have
# level than this level of concordance). At low incoming confidence, we can effectively
# ignore this term but when the incoming Q is near or above Q30 this approximation clearly
# breaks down.
#
for ( i in 1:dim(d)[1] ) {
row = d[i,]
if ( row$pGGivenDType == "QofAAGivenD" ) v = row$HOM_REF
if ( row$pGGivenDType == "QofABGivenD" ) v = row$HET
if ( row$pGGivenDType == "QofBBGivenD" ) v = row$HOM_VAR
r = c(r, v / row$Sum)
}
#print(length(r))
d$EmpiricalPofG = r
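# Phred-scale the empirical accuracy: e.g. if 990 of 1000 comp-confirmed het sites agree,
# r = 0.99 and EmpiricalPofGQ = -10*log10(1 - 0.99) = Q20. r == 1 yields Inf, which the
# downstream plots filter out.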
d$EmpiricalPofGQ = round(-10*log10(1-r))
return(d)
}
genotypeCounts <- function(x) {
type = unique(x$variable)[1]
t = addmargins(table(x$comp))
return(t)
}
digestTable <- function(inputDataFile) {
d = subset(read.table(inputDataFile, header=T), rg != "ALL")
d$technology <- factor(1, levels=c("HiSeq-paper", "GA2-1000G", "HiSeq-recent"))
d$technology[grepl("ERR.*", d$rg)] <- "GA2-1000G"
d$technology[grepl("20.*", d$rg)] <- "HiSeq-paper"
d$technology[grepl("B00EG.*", d$rg)] <- "HiSeq-recent"
print(summary(d$technology))
eByComp = addEmpiricalPofG(ddply(d, .(rg, technology, pGGivenDType, pGGivenD), genotypeCounts))
return(list(d=d, eByComp = eByComp))
#countsByTech = addEmpiricalPofG(ddply(d, .(technology, pGGivenDType, pGGivenD), genotypeCounts))
}
writeMyTable <- function(t, name) {
write.table(t,file=paste(inputDataFile, ".", name, ".tsv", sep=""))
}
if ( onCmdLine ) {
r <- digestTable(inputDataFile)
writeMyTable(r$eByComp, "eByComp")
}

View File

@ -1,12 +0,0 @@
{
"Statement": [
{
"Sid": "Stmt1296439478068",
"Action": [
"s3:PutObject"
],
"Effect": "Allow",
"Resource": "arn:aws:s3:::GATK_Run_Reports/*"
}
]
}

View File

@ -1,2 +0,0 @@
AKIAJXU7VIHBPDW4TDSQ
uQLTduhK6Gy8mbOycpoZIxr8ZoVj1SQaglTWjpYA

View File

@ -1,8 +0,0 @@
{
"Statement":[{
"Effect":"Allow",
"Action":"*",
"Resource":"*"
}
]
}

View File

@ -1,12 +0,0 @@
{
"Statement": [
{
"Sid": "Stmt1296439478068",
"Action": [
"s3:PutObject"
],
"Effect": "Allow",
"Resource": "arn:aws:s3:::IGV_crowdsourcing/*"
}
]
}

View File

@ -1,2 +0,0 @@
AKIAIM64MSUYNQ2465HQ
D+l3HfPQFWia9HF8rKh/fJ5+yNYsltWUpj0C7L0Z

View File

@ -1,45 +0,0 @@
#!/bin/tcsh
# download CLI tools
# http://aws.amazon.com/developertools/AWS-Identity-and-Access-Management/4143
setenv JAVA_HOME /usr/
setenv AWS_IAM_HOME ~/Downloads/IAMCli-1.1.0
setenv PATH $AWS_IAM_HOME/bin:$PATH
setenv AWS_CREDENTIAL_FILE /Users/depristo/Desktop/broadLocal/GATK/trunk/account-key
setenv CREATE_GROUPS false
setenv CREATE_IGV_USER false
setenv UPDATE_USER_KEYS false
setenv UPDATE_USER_POLICY true
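# Each toggle above selects which of the IAM operations below actually run;
# leave a toggle false to skip the corresponding section.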
# Create the administrators group:
# we aren't actually using this, in fact
if ( $CREATE_GROUPS == true ) then
iam-groupcreate -g Admins
iam-grouplistbypath
iam-groupuploadpolicy -g Admins -p AdminsGroupPolicy -f GroupPolicy.txt
iam-grouplistpolicies -g Admins
endif
# Create the IGV user -- set CREATE_IGV_USER to true if the IGV user needs to be created from scratch,
# then update the secret key
if ( $CREATE_IGV_USER == true ) then
iam-usercreate -u IGV -k -v > IGV_cred.txt
endif
# the user access and secret keys are in the IGV source file IGVRunReport.java
# and must be updated to be the most current ones
if ( $UPDATE_USER_KEYS == true ) then
iam-userdelkey -u IGV -k $1 # $1 -> current access key
iam-useraddkey -u IGV > IGV_cred.txt
cat IGV_cred.txt
endif
echo "IGV user policies"
if ( $UPDATE_USER_POLICY == true ) then
echo "Deleting policy"
iam-userdelpolicy -u IGV -p IGVRunReportUploading
iam-useruploadpolicy -u IGV -p IGVRunReportUploading -f IGVPolicy.txt
endif
iam-userlistpolicies -u IGV -v

View File

@ -1,45 +0,0 @@
#!/bin/tcsh
# download CLI tools
# http://aws.amazon.com/developertools/AWS-Identity-and-Access-Management/4143
setenv JAVA_HOME /usr/
setenv AWS_IAM_HOME ~/Downloads/IAMCli-1.1.0
setenv PATH $AWS_IAM_HOME/bin:$PATH
setenv AWS_CREDENTIAL_FILE /Users/depristo/Desktop/broadLocal/GATK/trunk/account-key
setenv CREATE_GROUPS false
setenv CREATE_GATK_USER false
setenv UPDATE_USER_KEYS false
setenv UPDATE_USER_POLICY true
# Create the administrators group:
# we aren't actually using this, in fact
if ( $CREATE_GROUPS == true ) then
iam-groupcreate -g Admins
iam-grouplistbypath
iam-groupuploadpolicy -g Admins -p AdminsGroupPolicy -f GroupPolicy.txt
iam-grouplistpolicies -g Admins
endif
# Create the GATK user -- set CREATE_GATK_USER to true if the GATK user needs to be created from scratch,
# then update the secret key
if ( $CREATE_GATK_USER == true ) then
iam-usercreate -u GATK -k -v > GATK_cred.txt
endif
# the user access and secret keys are in the GATK source file GATKRunReport.java
# and must be updated to be the most current ones
if ( $UPDATE_USER_KEYS == true ) then
iam-userdelkey -u GATK -k $1 # $1 -> current access key
iam-useraddkey -u GATK > GATK_cred.txt
cat GATK_cred.txt
endif
echo "GATK user policies"
if ( $UPDATE_USER_POLICY == true ) then
echo "Deleting policy"
iam-userdelpolicy -u GATK -p GATKRunReportUploading
iam-useruploadpolicy -u GATK -p GATKRunReportUploading -f GATKPolicy.txt
endif
iam-userlistpolicies -u GATK -v

View File

@ -1,41 +0,0 @@
#!/broad/tools/apps/R-2.6.0/bin/Rscript
args <- commandArgs(TRUE)
input = args[1]
t=read.table(input, header=T)
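# Input table must contain columns Qreported, Qempirical, nMismatches and nBases.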
#t=read.csv(input)
#par(mfrow=c(2,1), cex=1.2)
#outfile = paste(input, ".quality_emp_v_stated.png", sep="")
#png(outfile, height=7, width=7, units="in", res=72) # height=1000, width=446)
outfile = paste(input, ".quality_emp_v_stated.pdf", sep="")
pdf(outfile, height=7, width=7)
d.good <- t[t$nMismatches >= 1000,]
d.100 <- t[t$nMismatches < 100,]
d.1000 <- t[t$nMismatches < 1000 & t$nMismatches >= 100,]
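# Color-code points by how well-supported each bin is: >=1000 mismatch observations
# are plotted in blue, 100-999 in cornflowerblue, <100 in lightblue.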
plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", xlim=c(0,63), ylim=c(0,63), pch=16, xlab="Reported quality score", ylab="Empirical quality score", main="Reported vs. empirical quality scores")
points(d.100$Qreported, d.100$Qempirical, type="p", col="lightblue", pch=16)
points(d.1000$Qreported, d.1000$Qempirical, type="p", col="cornflowerblue", pch=16)
abline(0,1, lty=2)
dev.off()
#outfile = paste(input, ".quality_emp_hist.png", sep="")
#png(outfile, height=7, width=7, units="in", res=72) # height=1000, width=446)
outfile = paste(input, ".quality_emp_hist.pdf", sep="")
pdf(outfile, height=7, width=7)
hst=subset(data.frame(t$Qempirical, t$nBases), t.nBases != 0)
plot(hst$t.Qempirical, hst$t.nBases, type="h", lwd=3, xlim=c(0,63), main="Empirical quality score histogram", xlab="Empirical quality score", ylab="Count", yaxt="n")
axis(2,axTicks(2), format(axTicks(2), scientific=F))
dev.off()
#
# Plot Q reported histogram
#
outfile = paste(input, ".quality_rep_hist.pdf", sep="")
pdf(outfile, height=7, width=7)
hst=subset(data.frame(t$Qreported, t$nBases), t.nBases != 0)
plot(hst$t.Qreported, hst$t.nBases, type="h", lwd=3, xlim=c(0,63), main="Reported quality score histogram", xlab="Qreported quality score", ylab="Count", yaxt="n")
axis(2,axTicks(2), format(axTicks(2), scientific=F))
dev.off()

View File

@ -1,20 +0,0 @@
#!/broad/tools/apps/R-2.6.0/bin/Rscript
args <- commandArgs(TRUE)
verbose = TRUE
input = args[1]
#X11(width=7, height=14)
#outfile = paste(input, ".qual_diff_v_cycle.png", sep="")
#png(outfile, height=7, width=7, units="in", res=72) #height=1000, width=680)
outfile = paste(input, ".qual_diff_v_cycle.pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
c <- read.table(input, header=T)
d.good <- c[c$nMismatches >= 100,]
d.100 <- c[c$nMismatches < 100,]
plot(d.good$Cycle, d.good$Qempirical_Qreported, type="l", ylab="Empirical - Reported Quality", xlab="Cycle", col="blue", ylim=c(-10, 10))
points(d.100$Cycle, d.100$Qempirical_Qreported, type="p", col="lightblue", pch=3)
#points(d.1000$Cycle, d.1000$Qempirical_Qreported, type="p", col="cornflowerblue", pch=16)
dev.off()

View File

@ -1,16 +0,0 @@
#!/broad/tools/apps/R-2.6.0/bin/Rscript
args <- commandArgs(TRUE)
verbose = TRUE
input = args[1]
#outfile = paste(input, ".qual_diff_v_dinuc.png", sep="")
#png(outfile, height=7, width=7, units="in", res=72) #height=1000, width=680)
outfile = paste(input, ".qual_diff_v_dinuc.pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
#in_dinuc = paste(input, ".quality_difference_v_dinucleotide.csv", sep="")
#d <- read.csv(input)
d <- read.table(input, header=T)
plot(d$Dinuc, d$Qempirical_Qreported, type="l", ylab="Empirical - Reported Quality", xlab="Dinucleotide", ylim=c(-10,10))
dev.off()

View File

@ -1,273 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers.varianteval;
import org.broad.tribble.util.variantcontext.Genotype;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker;
import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantEvaluator;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.report.tags.Analysis;
import org.broadinstitute.sting.utils.report.tags.DataPoint;
import org.broadinstitute.sting.utils.report.utils.TableType;
import java.util.Arrays;
import java.util.Collection;
import java.util.Set;
/**
 * @author chartl
 * @since Nov 22, 2010
 */
@Analysis(name = "ACTransitionMatrix", description = "Number of additional genotypes from each new sample; random permutations")
public class ACTransitionTable extends VariantEvaluator {
private final int NUM_PERMUTATIONS = 50;
private final double LOW_GQ_PCT = 0.95;
private final double LOW_GQ_THRSH = 30.0;
private boolean initialized = false;
private long skipped = 0L;
@DataPoint(name="Het transitions",description="AC[s] = AC[s-1]+1 and AC[s] = AC[s-1]+2 transitions")
TransitionTable transitions = null;
@DataPoint(name="Private permutations",description="Marginal increase in number of sites per sample")
PermutationCounts privatePermutations;
@DataPoint(name="AC2 Permutations",description="Marginal increase in number of AC=2 sites, per sample")
PermutationCounts doubletonPermutations;
@DataPoint(name="AC3 Permutations",description="Marginal increase in number of tripleton sites, per sample")
PermutationCounts tripletonPermutations;
String[][] permutations;
public boolean enabled() {
return true;
}
public int getComparisonOrder() {
return 2;
}
public String getName() {
return "ACTransitionTable";
}
public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( eval != null && ! initialized ) {
//this.veWalker.getLogger().warn("Initializing...");
initialize(eval);
initialized = true;
}
if ( isGood(eval) ) {
if ( comp != null && ! comp.isFiltered() ) {
return null;
}
int order_offset = 0;
for ( String[] ordering : permutations ) {
int sample_offset = 0;
int variant_ac = 0;
for ( String sample : ordering ) {
if ( eval.getGenotype(sample).isHet() ) {
variant_ac++;
transitions.hetTransitionCounts[order_offset][variant_ac-1][sample_offset]++;
} else if ( eval.getGenotype(sample).isHomVar() ) {
variant_ac += 2;
transitions.homTransitionCounts[order_offset][variant_ac-1][sample_offset]++;
} else {
// todo -- note, unclear how to treat no calls. Is the hom in het,ref,ref,nocall,hom sample 4 or 5?
// todo -- do we want to tabulate P[sample i is not variant | some variant]? This is just combinatorics so i left it out
if ( variant_ac > 0 ) {
transitions.stationaryCounts[order_offset][variant_ac-1][sample_offset]++;
}
}
sample_offset ++;
}
order_offset++;
}
} else {
skipped++;
}
return null;
}
private boolean isGood(VariantContext vc) {
if ( vc == null || vc.isFiltered() || (vc.getHetCount() + vc.getHomVarCount() == 0) ) { // todo -- should be is variant, but need to ensure no alt alleles at ref sites
return false;
} else {
Collection<Genotype> gtypes = vc.getGenotypes().values();
int ngood = 0;
for ( Genotype g : gtypes) {
if ( g.isCalled() && g.getPhredScaledQual() >= LOW_GQ_THRSH ) {
ngood ++;
}
}
return ( (0.0+ngood)/(0.0+gtypes.size()) >= LOW_GQ_PCT );
}
}
public ACTransitionTable(VariantEvalWalker parent) {
//super(parent);
}
public void initialize(VariantContext vc) {
Set<String> permuteSamples = vc.getSampleNames();
permutations = new String[NUM_PERMUTATIONS][permuteSamples.size()];
//veWalker.getLogger().warn(String.format("Num samples: %d",permuteSamples.size()));
int offset = 0;
for ( String s : permuteSamples ) {
permutations[0][offset] = s;
offset ++;
}
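// The remaining orderings are random permutations of the first, generated with an
// in-place Fisher-Yates shuffle of the identity ordering.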
for ( int p = 1; p < NUM_PERMUTATIONS ; p++ ) {
permutations[p] = permutations[0].clone();
for ( int o = 0; o < permutations[p].length; o ++ ) {
int r = (int) Math.floor(Math.random()*(o+1));
String swap = permutations[p][r];
permutations[p][r] = permutations[p][o];
permutations[p][o] = swap;
}
}
transitions = new TransitionTable();
transitions.hetTransitionCounts = new int[NUM_PERMUTATIONS][permuteSamples.size()*2][permuteSamples.size()];
transitions.homTransitionCounts = new int[NUM_PERMUTATIONS][permuteSamples.size()*2][permuteSamples.size()];
transitions.stationaryCounts = new int[NUM_PERMUTATIONS][permuteSamples.size()*2][permuteSamples.size()];
privatePermutations = new PermutationCounts(1,transitions);
doubletonPermutations = new PermutationCounts(2,transitions);
tripletonPermutations = new PermutationCounts(3,transitions);
}
public void finalizeEvaluation() { // note: data points are null when this is called (wtf?)
//veWalker.getLogger().info(String.format("Skipped: %d",skipped));
}
class TransitionTable implements TableType {
int[][][] hetTransitionCounts;
int[][][] homTransitionCounts;
int[][][] stationaryCounts;
String[][] countAverages;
String[] rowKeys = null;
String[] colKeys = null;
public Object[] getRowKeys() {
if ( rowKeys == null ) {
rowKeys = new String[3*hetTransitionCounts[0].length];
for ( int i = 0; i < hetTransitionCounts[0].length; i ++ ) {
rowKeys[i] = String.format("%s%d%s","AC_",i,"_(het)");
}
for ( int i = 0; i < hetTransitionCounts[0].length; i ++ ) {
rowKeys[hetTransitionCounts[0].length+i] = String.format("%s%d%s","AC_",i,"_(hom)");
}
for ( int i = 0; i < hetTransitionCounts[0].length; i ++ ) {
rowKeys[2*hetTransitionCounts[0].length+i] = String.format("%s%d%s","AC_",i,"_(ref)");
}
}
return rowKeys;
}
public String getCell(int x, int y) {
if ( countAverages == null ) {
countAverages = new String[hetTransitionCounts[0].length*3][hetTransitionCounts[0][0].length];
for ( int sam = 0; sam < hetTransitionCounts[0][0].length; sam ++) {
for ( int idx = 0 ; idx < hetTransitionCounts[0].length; idx ++ ) {
int totalTimesAtACSample = 0;
int totalStationary = 0;
int totalAC1Shift = 0;
int totalAC2Shift = 0;
for ( int p = 0; p < hetTransitionCounts.length; p++ ) {
totalStationary += stationaryCounts[p][idx][sam];
totalAC2Shift += (idx+2 >= hetTransitionCounts[0][0].length) ? 0 : homTransitionCounts[p][idx+2][sam];
totalAC1Shift += (idx+1 >= hetTransitionCounts[0][0].length) ? 0 : hetTransitionCounts[p][idx+1][sam];
}
totalTimesAtACSample = totalStationary+totalAC1Shift+totalAC2Shift;
countAverages[idx][sam] = formatProp(totalAC1Shift,totalTimesAtACSample);
countAverages[hetTransitionCounts[0].length+idx][sam] = formatProp(totalAC2Shift,totalTimesAtACSample);
countAverages[hetTransitionCounts[0].length*2+idx][sam] = formatProp(totalStationary,totalTimesAtACSample);
}
}
}
return countAverages[x][y] == null ? "0.00" : countAverages[x][y];
}
private String formatProp(int num, int denom) {
return (denom != 0) ? String.format("%.4f", ((double) num)/denom) : "0.0";
}
public String getName() { return "AC Transition Tables"; }
public Object[] getColumnKeys() {
if ( colKeys == null ) {
colKeys = new String[hetTransitionCounts[0][0].length];
for ( int ac = 0; ac < hetTransitionCounts[0][0].length; ac ++ ) {
colKeys[ac] = String.format("Sample_%d",ac);
}
}
return colKeys;
}
}
class PermutationCounts implements TableType {
int acToExtract;
TransitionTable table;
String[] rowNames;
String[] colNames;
public PermutationCounts(int ac, TransitionTable tTable) {
acToExtract = ac;
table = tTable;
}
public String[] getRowKeys() {
//System.out.printf("%s%n",table);
if ( rowNames == null ) {
rowNames = new String[table.stationaryCounts.length];
for ( int p = 0 ; p < rowNames.length; p ++ ) {
rowNames[p] = String.format("Perm%d",p+1);
}
}
return rowNames;
}
public String[] getColumnKeys() {
if ( colNames == null ) {
colNames = new String[table.stationaryCounts[0][0].length];
for ( int s = 0 ; s < colNames.length; s ++ ) {
colNames[s] = String.format("Sample%d",s+1);
}
}
return colNames;
}
public Integer getCell(int x, int y) {
return table.hetTransitionCounts[x][acToExtract-1][y] +
( (acToExtract > table.homTransitionCounts[0][0].length) ? 0 : table.homTransitionCounts[x][acToExtract-1][y]);
}
public String getName() {
return String.format("PermutationCountsAC%d",acToExtract);
}
public void init() {
getRowKeys();
getColumnKeys();
getCell(1,1);
}
}
}

View File

@ -1,85 +0,0 @@
package org.broadinstitute.sting.playground.gatk.walkers.diagnostics;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.RodGenotypeChipAsGFF;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.utils.BaseUtils;
/**
* Takes a BAM file and a Hapmap-chip file (via the -hc argument) and creates a table of reference allele
* percentage and alternate allele percentage for het, homvar, and other genotypes.
*/
public class AlleleBalanceInspector extends LocusWalker<Integer, Integer> {
private int item = 1;
public void initialize() {
out.printf("item\tlocus\tref\tgenotype\tstate\tdepth\trefdepth\taltdepth\trefpct\taltpct%n");
}
public boolean filter(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
RodGenotypeChipAsGFF hc = tracker.lookup("child",RodGenotypeChipAsGFF.class);
return hc != null && hc.getCalledGenotype().isVariant(ref.getBase());
}
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
RodGenotypeChipAsGFF hc = tracker.lookup("child",RodGenotypeChipAsGFF.class);
String state;
if (hc.getCalledGenotype().isHet()) {
state = "het";
} else if (hc.getCalledGenotype().isHom()) {
state = "homvar";
} else {
state = "other";
}
int refIndex = ref.getBaseIndex();
int altIndex = -1;
for (char base : hc.getCalledGenotype().getBases().toCharArray()) {
int baseIndex = BaseUtils.simpleBaseToBaseIndex(base);
if (baseIndex != refIndex) {
altIndex = baseIndex;
}
}
int[] baseCounts = context.getPileup().getBaseCounts();
double sum = (double) (baseCounts[refIndex] + baseCounts[altIndex]);
double refPct = ((double) baseCounts[refIndex])/sum;
double altPct = ((double) baseCounts[altIndex])/sum;
out.printf("%d\t%s\t%c\t%s\t%s\t%d\t%d\t%d\t%f\t%f%n",
item++,
context.getLocation(),
ref.getBase(),
hc.getCalledGenotype().getBases(),
state,
context.getPileup().getReads().size(),
baseCounts[refIndex],
baseCounts[altIndex], refPct, altPct);
return null;
}
/**
* Provide an initial value for reduce computations.
*
* @return Initial value of reduce.
*/
public Integer reduceInit() {
return null; // nothing to accumulate; results are printed directly in map()
}
/**
* Reduces a single map with the accumulator provided as the ReduceType.
*
* @param value result of the map.
* @param sum accumulator for the reduce.
* @return accumulator with result of the map taken into account.
*/
public Integer reduce(Integer value, Integer sum) {
return null; // nothing to accumulate
}
}

View File

@ -1,212 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers.varianteval;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.VCFConstants;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker;
import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantEvaluator;
import org.broadinstitute.sting.gatk.walkers.varianteval.tags.Analysis;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.report.tags.DataPoint;
import org.broadinstitute.sting.utils.report.utils.TableType;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
*
*/
@Analysis(name = "Allele Frequency Comparison", description = "Compare allele frequency and counts between eval and comp")
public class AlleleFrequencyComparison extends VariantEvaluator {
private static int MAX_AC_COUNT = 100; // todo -- command line argument?
@DataPoint(description="Counts of eval frequency versus comp frequency")
AFTable afTable = new AFTable();
@DataPoint(description="Counts of eval AC versus comp AC")
ACTable acTable = new ACTable(MAX_AC_COUNT);
public boolean enabled() { return true; }
public int getComparisonOrder() { return 2; }
public String getName() { return "Allele Frequency Comparison"; }
public AlleleFrequencyComparison(VariantEvalWalker parent) {
//super(parent);
}
//public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context, VariantEvalWalker.EvaluationContext group) {
public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( ! (isValidVC(eval) && isValidVC(comp)) ) {
return null;
} else {
// todo -- this is a godawful hack. The "right way" isn't working, so do it the unsafe way for now. Note that
// todo -- this precludes getting the AC/AF values from the info field because some may not be there...
/*if ( missingField(eval) ) {
recalculateCounts(eval);
}
if ( missingField(comp) ) {
recalculateCounts(comp);
}*/
HashMap<String,Object> evalCounts = new HashMap<String,Object>(2);
HashMap<String,Object> compCounts = new HashMap<String,Object>(2);
VariantContextUtils.calculateChromosomeCounts(eval,evalCounts,false);
VariantContextUtils.calculateChromosomeCounts(comp,compCounts,false);
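// calculateChromosomeCounts fills the maps with per-alt-allele AC/AF lists; since
// isValidVC guarantees a single alternate allele, only element 0 is consulted below.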
afTable.update(((List<Double>)evalCounts.get("AF")).get(0),((List<Double>)compCounts.get("AF")).get(0));
acTable.update(((List<Integer>)evalCounts.get("AC")).get(0),((List<Integer>)compCounts.get("AC")).get(0));
}
return null; // there is nothing interesting
}
private static boolean missingField(final VariantContext vc) {
return ! ( vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) && vc.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) );
}
private void recalculateCounts(VariantContext vc) {
Map<String,Object> attributes = new HashMap<String,Object>();
VariantContextUtils.calculateChromosomeCounts(vc,attributes,false);
vc = VariantContext.modifyAttributes(vc,attributes);
//getLogger().debug(String.format("%s %s | %s %s",attributes.get("AC"),attributes.get("AF"),vc.getAttribute("AC"),vc.getAttribute("AF")));
if ( attributes.size() == 2 && missingField(vc) ) {
throw new org.broadinstitute.sting.utils.exceptions.StingException("VariantContext should have had attributes modified but did not");
}
}
private static boolean isValidVC(final VariantContext vc) {
return (vc != null && !vc.isFiltered() && vc.getAlternateAlleles().size() == 1);
}
private static double getAF(VariantContext vc) {
Object af = vc.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY);
if ( af == null ) {
//throw new UserException("Variant context "+vc.getName()+" does not have allele frequency entry which is required for this walker");
// still none after being re-computed; this is 0.00
return 0.00;
} else if ( List.class.isAssignableFrom(af.getClass())) {
return ( (List<Double>) af ).get(0);
} else if ( String.class.isAssignableFrom(af.getClass())) {
// two possibilities
String s = (String) af;
try {
if ( s.startsWith("[") ) {
return Double.parseDouble(s.replace("[","").replace("]",""));
} else {
return Double.parseDouble(s);
}
} catch (NumberFormatException e) {
throw new UserException("Allele frequency field may be improperly formatted, found AF="+s,e);
}
} else if ( Double.class.isAssignableFrom(vc.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY).getClass())) {
return (Double) af;
} else {
throw new UserException(String.format("Class of Allele Frequency does not appear to be formated, had AF=%s, of class %s",af.toString(),af.getClass()));
}
}
private static int getAC(VariantContext vc) {
Object ac = vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY);
if ( ac == null ) {
// still none after being re computed; this is 0
return 0;
} else if ( List.class.isAssignableFrom(ac.getClass())) {
return ( (List<Integer>) ac ).get(0);
} else if ( String.class.isAssignableFrom(ac.getClass())) {
// two possibilities
String s = (String) ac;
try {
if ( s.startsWith("[") ) {
return Integer.parseInt(s.replace("[","").replace("]",""));
} else {
return Integer.parseInt(s);
}
} catch (NumberFormatException e) {
throw new UserException(String.format("Allele count field may be improperly formatted, found AC=%s for record %s:%d",ac,vc.getChr(),vc.getStart()),e);
}
} else if ( Integer.class.isAssignableFrom(ac.getClass())) {
return (Integer) ac;
} else {
throw new UserException(String.format("Class of Allele Frequency does not appear to be formated, had AF=%s, of class %s",ac.toString(),ac.getClass()));
}
}
}
class AFTable implements TableType {
protected int[][] afCounts = new int[101][101];
public Object[] getRowKeys() {
String[] afKeys = new String[101];
for ( int f = 0; f < 101; f ++ ) {
afKeys[f] = String.format("%.2f",(f+0.0)/100.0);
}
return afKeys;
}
public Object[] getColumnKeys() {
return getRowKeys(); // nice thing about symmetric tables
}
public Object getCell(int i, int j) {
return afCounts[i][j];
}
public String getName() {
return "Allele Frequency Concordance";
}
public void update(double eval, double comp) {
afCounts[af2index(eval)][af2index(comp)]++;
}
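// Map an allele frequency in [0,1] onto one of the 101 table bins, e.g. AF=0.537 -> bin 54.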
private int af2index(double d) {
return (int) Math.round(100*d);
}
}
class ACTable implements TableType {
protected int[][] acCounts;
protected int maxAC;
public ACTable(int acMaximum) {
maxAC = acMaximum;
acCounts = new int[acMaximum+1][acMaximum+1];
}
public Object[] getRowKeys() {
String[] acKeys = new String[maxAC+1];
for ( int i = 0 ; i <= maxAC ; i ++ ) {
acKeys[i] = String.format("%d",i);
}
return acKeys;
}
public Object[] getColumnKeys() {
return getRowKeys();
}
public Object getCell(int i, int j) {
return acCounts[i][j];
}
public String getName() {
return "Allele Counts Concordance";
}
public void update(int eval, int comp) {
eval = eval > maxAC ? maxAC : eval;
comp = comp > maxAC ? maxAC : comp;
acCounts[eval][comp]++;
}
}

View File

@ -1,219 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers.varianteval;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker;
import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantEvaluator;
import org.broadinstitute.sting.gatk.walkers.varianteval.tags.Analysis;
import org.broadinstitute.sting.gatk.walkers.varianteval.tags.DataPoint;
import org.broadinstitute.sting.utils.report.utils.TableType;
import org.broadinstitute.sting.utils.analysis.AminoAcid;
import org.broadinstitute.sting.utils.analysis.AminoAcidTable;
import org.broadinstitute.sting.utils.analysis.AminoAcidUtils;
import org.broadinstitute.sting.utils.exceptions.UserException;
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* @author chartl
* @since June 28, 2010
*/
@Analysis(name = "Amino Acid Transition", description = "Calculates the Transition Matrix for coding variants; entries are Total, Num. Ti, Num. Tv, Ratio")
public class AminoAcidTransition extends VariantEvaluator {
////////////////////////////////////////////////////////////
//// INTERNAL DATA POINT CLASSES
////////////////////////////////////////////////////////////
// a mapping from amino acid transition score histogram bin to Ti/Tv ratio
@DataPoint(description = "TiTv counts by amino acid change")
AminoAcidTiTvTable acidTable = null;
class TiTvCount {
public int ti;
public int tv;
public TiTvCount() {
ti = 0;
tv = 0;
}
public int getTotal() {
return ti + tv;
}
public double getRatio() {
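// The +1 in the denominator presumably guards against division by zero when no
// transversions were observed; note it biases the ratio slightly downward.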
return ( (double) ti )/(1.0+tv);
}
public String toString() {
return String.format("%d:%d:%d:%.2f",getTotal(),ti,tv,getRatio());
}
}
class AminoAcidTiTvTable implements TableType {
private TiTvCount[][] countsByAAChange;
public AminoAcidTiTvTable() {
countsByAAChange = new TiTvCount[AminoAcid.values().length][AminoAcid.values().length];
for ( int i = 0; i < AminoAcid.values().length; i ++ ) {
for ( int j = 0; j < AminoAcid.values().length; j++ ) {
countsByAAChange[i][j] = new TiTvCount();
}
}
}
public Object[] getRowKeys() {
return AminoAcidUtils.getAminoAcidCodes();
}
public Object[] getColumnKeys() {
return AminoAcidUtils.getAminoAcidCodes();
}
public TiTvCount getCell(int x, int y) {
return countsByAAChange[x][y];
}
public String getName() {
return "AminoAcidTransitionTable";
}
public void update(AminoAcid reference, AminoAcid alternate, boolean isTransition) {
TiTvCount counter = countsByAAChange[reference.ordinal()][alternate.ordinal()];
if ( isTransition ) {
counter.ti++;
} else {
counter.tv++;
}
}
}
////////////////////////////////////////////////////////////
//// CORE VARIANT EVALUATOR DATA AND METHODS
////////////////////////////////////////////////////////////
private String infoKey;
private String infoValueSplit;
private boolean useCodons;
private boolean enabled;
private AminoAcidTable lookup;
public AminoAcidTransition(VariantEvalWalker parent) {
//super(parent);
//enabled = parent.aminoAcidTransitionKey != null;
enabled = true;
if ( enabled ) {
getParsingInformation(parent);
lookup = new AminoAcidTable();
acidTable = new AminoAcidTiTvTable();
}
}
private void getParsingInformation(VariantEvalWalker parent) {
if ( enabled() ) {
// infoKey = parent.aminoAcidTransitionKey;
// infoValueSplit = parent.aminoAcidTransitionSplit;
// useCodons = parent.aatUseCodons;
infoKey = null;
infoValueSplit = null;
useCodons = false;
if ( infoKey == null ) {
throw new UserException.CommandLineException("No info-field key provided for amino acid tabulation. Please provide the appropriate key with -aatk.");
}
if ( infoValueSplit == null ) {
throw new UserException.CommandLineException("No split string provided for amino acid tabulation. Please provide the split string with -aats");
}
}
}
public String getName() {
return "AminoAcidTransitionTable";
}
public int getComparisonOrder() {
return 1; // we only need to see each eval track
}
public boolean enabled() {
return enabled;
}
public String toString() {
return getName();
}
public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
String interesting = null;
//if ( eval != null && eval.hasAttribute(infoKey) ) {
if ( enabled && eval != null && eval.hasAttribute(infoKey) ) {
String[] parsedNames = ( (String) eval.getAttribute(infoKey)).split(infoValueSplit);
String first = "none";
String second = "none";
try {
first = parsedNames [0];
second = parsedNames [1];
} catch (ArrayIndexOutOfBoundsException e) {
//getLogger().warn("Error parsing variant context with value "+eval.getAttribute(infoKey));
}
AminoAcid reference;
AminoAcid alternate;
if ( useCodons ) {
reference = lookup.getEukaryoticAA(first);
alternate = lookup.getEukaryoticAA(second);
} else {
reference = lookup.getAminoAcidByCode(first);
alternate = lookup.getAminoAcidByCode(second);
}
//veWalker.getLogger().info(String.format("%s\t%s\t%s\t%s",first,second,reference,alternate));
if ( reference == null ) {
interesting = "Unknown Reference Codon";
} else if ( alternate == null ) {
interesting = "Unknown Alternate Codon";
} else {
acidTable.update(reference,alternate, VariantContextUtils.isTransition(eval));
}
}
return interesting; // This module doesn't capture any interesting sites, so return null
}
//public void finalizeEvaluation() {
//
//}
}

View File

@ -1,518 +0,0 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.oneoffprojects.walkers;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.walkers.genotyper.*;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.io.File;
import java.util.*;
import java.io.PrintStream;
import java.io.FileNotFoundException;
import net.sf.samtools.SAMRecord;
/**
 * @author chartl
 * @since Oct 12, 2009
 */
@By(DataSource.REFERENCE)
@Reference(window=@Window(start=-3,stop=3))
public class BaseTransitionTableCalculatorJavaWalker extends LocusWalker<Set<BaseTransitionTable>,Set<BaseTransitionTable>> implements TreeReducible<Set<BaseTransitionTable>> {
@Output
PrintStream out;
@Argument(fullName="usePreviousBases", doc="Use previous bases of the reference as part of the calculation, uses the specified number, defaults to 0", required=false)
int nPreviousBases = 0;
@Argument(fullName="useSecondaryBase",doc="Use the secondary base of a read as part of the calculation", required=false)
boolean useSecondaryBase = false;
@Argument(fullName="confidentRefThreshold",doc="Set the lod score that defines confidence in ref, defaults to 4", required=false)
int confidentRefThreshold = 5;
@Argument(fullName="maxNumMismatches",doc="Set the maximum number of mismatches at a locus before choosing not to use it in calculation. Defaults to 1.", required=false)
int maxNumMismatches = 1;
@Argument(fullName="minMappingQuality", doc ="Set the alignment quality below which to ignore reads; defaults to 30", required = false)
int minMappingQuality = 30;
@Argument(fullName="minQualityScore", doc = "Set the base quality score below which to ignore bases in the pileup, defaults to 20", required = false)
int minQualityScore = 20;
@Argument(fullName="usePileupMismatches", doc = "Use the number of mismatches in the pileup as a condition for the table", required=false)
boolean usePileupMismatches = false;
@Argument(fullName="usePreviousReadBases", doc="Use previous bases of the read as part of the calculation. Will ignore reads if there aren't this many previous bases. Uses the specified number. Defaults to 0", required=false)
int nPreviousReadBases = 0;
@Argument(fullName="useReadGroup", doc="Use the group number of the read as a condition of the table.", required = false)
boolean useReadGroup = false;
@Argument(fullName="outputFile", shortName="of", doc="Output to this file rather than standard out. Must be used with -nt.", required = false)
String outFilePath = null;
@Argument(fullName="forcePreviousReadBasesToMatchRef", doc="Forces previous read bases to match the reference", required = false)
boolean readBasesMustMatchRef = false;
private UnifiedGenotyperEngine ug;
// private ReferenceContextWindow refWindow;
// private Set<BaseTransitionTable> conditionalTables;
private List<Boolean> usePreviousBases;
private List<GenomeLoc> previousBaseLoci;
public void initialize() {
if ( nPreviousBases > 3 || ( nPreviousReadBases > 3 && readBasesMustMatchRef ) ) {
throw new UserException.CommandLineException("You have opted to use a number of previous bases in excess of 3. In order to do this you must change the reference window size in the walker itself.");
}
UnifiedArgumentCollection uac = new UnifiedArgumentCollection();
uac.baseModel = BaseMismatchModel.THREE_STATE;
uac.ALL_BASES_MODE = true;
ug = new UnifiedGenotyperEngine(getToolkit(), uac);
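// The genotyper runs in all-bases mode solely to decide whether each site is
// confidently homozygous-reference (see baseIsConfidentRef below).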
// refWindow = new ReferenceContextWindow(nPreviousBases);
usePreviousBases = new ArrayList<Boolean>();
previousBaseLoci = new ArrayList<GenomeLoc>();
}
public Set<BaseTransitionTable> reduceInit() {
return new TreeSet<BaseTransitionTable>();
}
public Set<BaseTransitionTable> map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
ReadBackedPileup pileup = context.getBasePileup();
Set<BaseTransitionTable> newCounts = null;
//System.out.println(pileup.getBases());
if ( baseIsUsable(tracker, ref, pileup, context) ) {
//System.out.println("Pileup will be used");
if ( previousLociCanBeUsed(usePreviousBases,previousBaseLoci,context.getLocation()) ) {
for ( int r = 0; r < pileup.getReads().size(); r ++ ) {
if ( useRead ( pileup.getReads().get(r), pileup.getOffsets().get(r), ref ) ) {
newCounts = updateTables( newCounts, pileup.getReads().get(r), pileup.getOffsets().get(r), ref, pileup );
}
}
} else {
updatePreviousBases(usePreviousBases,true,previousBaseLoci,context.getLocation() );
}
} else {
updatePreviousBases( usePreviousBases,false,previousBaseLoci,context.getLocation() );
}
return newCounts;
}
public Set<BaseTransitionTable> reduce ( Set<BaseTransitionTable> map, Set<BaseTransitionTable> reduce ) {
if ( map != null && ! map.isEmpty() ) {
for ( BaseTransitionTable t : map ) {
boolean add = true;
for ( BaseTransitionTable r : reduce ) {
if ( r.conditionsMatch(t) ) {
r.incorporateTable(t);
add = false;
break;
}
}
if ( add ) {
reduce.add(t);
}
}
}
// System.out.println("Reduce: size of TransitionTable set is " + reduce.size() + " -- size of Map: " + (map != null ? map.size() : "null"));
return reduce;
}
public Set<BaseTransitionTable> treeReduce( Set<BaseTransitionTable> reduce1, Set<BaseTransitionTable> reduce2 ) {
// check to see if this is a truly tree-reducable calculation
if ( nPreviousBases >= 1 ) {
String errMsg = "Parallelization cannot be used with UsePreviousBases due to the fact that internal walker data specifies whether a previous reference base is usable or not.";
String errMsg2 = " This can cause cause concurrency issues and unpredictable behavior when used with parallelization. Either do not specify -nt, or try a the conjunction of ";
String errMsg3 = "--usePreviousReadBases and --forcePreviousReadBasesToMatchRef.";
throw new UserException.CommandLineException(errMsg+errMsg2+errMsg3);
}
return reduce(reduce1,reduce2);
}
public void onTraversalDone( Set<BaseTransitionTable> conditionalTables ) {
PrintStream output;
if ( outFilePath == null ) {
output = out;
} else {
try {
output = new PrintStream(outFilePath);
} catch ( FileNotFoundException e ) {
throw new UserException.CouldNotCreateOutputFile(new File(outFilePath), e);
}
}
output.print(createHeaderFromConditions());
for ( BaseTransitionTable t : conditionalTables )
t.print(output);
}
public void updatePreviousBases(List<Boolean> usage, boolean canUse, List<GenomeLoc> loci, GenomeLoc locus) {
// early return
if ( nPreviousBases < 1 ) {
return;
}
if ( usage.size() <= nPreviousBases ) {
usage.add(canUse);
loci.add(locus);
} else {
usage.remove(0);
usage.add(canUse);
loci.remove(0);
loci.add(locus);
}
}
public boolean previousLociCanBeUsed( List<Boolean> canUse, List<GenomeLoc> loci, GenomeLoc locus ) {
if ( nPreviousBases < 1 ) {
return true;
}
boolean use = true;
for ( boolean b : canUse ) {
use = use && b;
}
if ( use ) {
use = use && ( loci.get(0).distance(locus) == 1 ); // truly is PREVIOUS base
}
return use;
}
public Set<BaseTransitionTable> updateTables ( Set<BaseTransitionTable> tables, SAMRecord read, int offset, ReferenceContext ref, ReadBackedPileup pileup ) {
List<Comparable> readConditions = buildConditions(read,offset,ref, pileup);
// System.out.println("Updating table with pileup: "+pileup.getBases()+ ( read.getReadNegativeStrandFlag() ? "-" : "+" ) + " Quality: "+read.getBaseQualities()[offset] + " MapQ: "+read.getMappingQuality());
if ( tables == null ) {
tables = new TreeSet<BaseTransitionTable>();
}
boolean createNewTable = true;
for ( BaseTransitionTable t : tables ) {
if ( t.conditionsMatch(readConditions) ) {
updateTable(t,read,offset,ref);
createNewTable = false;
break;
}
}
if ( createNewTable ) {
BaseTransitionTable t = new BaseTransitionTable(readConditions);
updateTable(t,read,offset,ref);
tables.add(t);
}
return tables;
}
public void updateTable(BaseTransitionTable t, SAMRecord r, int o, ReferenceContext ref) {
// System.out.println("Update Table");
if ( r.getReadNegativeStrandFlag() ) {
t.update((byte)BaseUtils.simpleComplement((char) r.getReadBases()[o]), (byte)BaseUtils.simpleComplement(ref.getBaseAsChar()));
} else {
t.update(r.getReadBases()[o], ref.getBase());
}
}
public boolean useRead( SAMRecord read, int offset, ReferenceContext ref ) {
if ( Character.toUpperCase(read.getReadBases()[offset]) == Character.toUpperCase(ref.getBase()) ) {
return false;
} else if ( read.getMappingQuality() <= minMappingQuality ) {
return false;
} else if ( ! BaseUtils.isRegularBase( (char) read.getReadBases()[offset]) ) {
return false;
} else if ( read.getBaseQualities()[offset] <= minQualityScore ) {
return false;
} else if ( useSecondaryBase && read.getAttribute("SQ") == null ) {
return false;
} else if ( nPreviousBases >= 1 && previousReadBasesMismatchRef(read, offset, ref) ) {
return false;
} else if ( nPreviousReadBases >= 1 && readLacksPreviousBases(read,offset,nPreviousReadBases) ) {
return false;
} else if ( nPreviousReadBases >= 1 && readBasesMustMatchRef && previousReadBasesMismatchRef(read, offset, ref) ) {
return false;
} else {
return true;
}
}
public boolean previousReadBasesMismatchRef( SAMRecord read, int offset, ReferenceContext ref ) {
int c = read.getReadNegativeStrandFlag() ? 1 : -1;
if ( offset + nPreviousBases*c < 0 ) {
return true;
} else if ( offset + nPreviousBases*c >= read.getReadLength() ) {
return true;
}
for ( int prevBase = 1; prevBase <= nPreviousBases; prevBase ++ ) {
if ( Character.toUpperCase(read.getReadBases()[offset + prevBase*c]) != Character.toUpperCase(ref.getBases()[nPreviousBases+1+prevBase*c]) || ! BaseUtils.isRegularBase(ref.getBases()[nPreviousBases+1+prevBase*c])) {
return true;
}
}
return false;
}
public boolean readLacksPreviousBases( SAMRecord read, int offset, int prevBases ) {
if ( ! read.getReadNegativeStrandFlag() ) {
return offset - prevBases < 0;
} else {
return offset + prevBases + 1 >= read.getReadLength();
}
}
public List<Comparable> buildConditions( SAMRecord read, int offset, ReferenceContext ref, ReadBackedPileup pileup ) {
ArrayList<Comparable> conditions = new ArrayList<Comparable>();
if ( nPreviousBases > 0 ) {
conditions.add(buildRefString(ref,nPreviousBases, ! read.getReadNegativeStrandFlag()));
}
if ( useSecondaryBase ) {
conditions.add(getSecondaryBase(read,offset));
}
if ( nPreviousReadBases > 0 ) {
conditions.add(buildReadString(read, offset, nPreviousReadBases));
}
if ( usePileupMismatches ) {
conditions.add(countMismatches(ref.getBase(), pileup));
}
if ( useReadGroup ) {
conditions.add(read.getReadGroup().getReadGroupId());
}
return conditions;
}
public String buildRefString(ReferenceContext ref, int bases, boolean forwardRead) {
if ( forwardRead ) {
return ( new String(ref.getBases()) ).substring(0,nPreviousBases-1);
} else {
return BaseUtils.simpleReverseComplement( ( new String(ref.getBases()) ).substring(nPreviousBases+1) );
}
}
public String buildReadString( SAMRecord read, int offset, int nPreviousReadBases ) {
if ( ! read.getReadNegativeStrandFlag() ) {
return read.getReadString().substring(offset-nPreviousReadBases,offset);
} else {
return BaseUtils.simpleReverseComplement( read.getReadString().substring(offset+1,offset+nPreviousReadBases+1) );
}
}
public String createHeaderFromConditions() {
String header = "Observed_base\tTrue_base";
if ( nPreviousBases > 0) {
header = header+"\tPrevious_"+nPreviousBases+"_bases";
}
if ( useSecondaryBase ) {
header = header + "\tSecondary_base";
}
if ( nPreviousReadBases > 0 ) {
header = header + "\tPrevious_"+nPreviousReadBases+"_read_bases";
}
if ( usePileupMismatches ) {
header = header + "\tNumber_of_pileup_mismatches";
}
if ( useReadGroup ) {
header = header + "\tRead_group";
}
return String.format("%s\t%s%n",header,"Counts");
}
public int countMismatches(byte ref, ReadBackedPileup p) {
int refM = p.getBaseCounts()[BaseUtils.simpleBaseToBaseIndex(ref)];
return p.size()-refM;
}
public char getSecondaryBase ( SAMRecord read, int offset ) {
return BaseUtils.baseIndexToSimpleBaseAsChar(QualityUtils.compressedQualityToBaseIndex( ( (byte[]) read.getAttribute("SQ") )[offset] ) );
}
public boolean baseIsUsable ( RefMetaDataTracker tracker, ReferenceContext ref, ReadBackedPileup pileup, AlignmentContext context ) {
return pileupContainsNoNs(pileup) && baseIsConfidentRef(tracker,ref,context) && pileupBelowMismatchThreshold(ref,pileup);
}
public boolean pileupBelowMismatchThreshold( ReferenceContext ref, ReadBackedPileup pileup ) {
return countMismatches(ref.getBase(), pileup) <= maxNumMismatches;
}
public boolean pileupContainsNoNs(ReadBackedPileup pileup) {
for ( byte c : pileup.getBases() ) {
if ( c == 'N' ) {
return false;
}
}
return true;
}
public boolean baseIsConfidentRef( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
if ( !BaseUtils.isRegularBase(ref.getBase()) )
return false;
VariantCallContext calls = ug.calculateLikelihoodsAndGenotypes(tracker,ref,context);
if ( calls == null || calls.vc == null)
return false;
return ( calls.vc.getNSamples() > 0 && calls.vc.getGenotype(0).isHomRef() && calls.vc.getGenotype(0).getNegLog10PError() > confidentRefThreshold );
}
}
class BaseTransitionTable implements Comparable {
/*
* no direct manipulation of these objects ever
*/
private int[][] table;
private List<Comparable> conditions;
public BaseTransitionTable(List<Comparable> conditions) {
table = new int[BaseUtils.BASES.length][BaseUtils.BASES.length];
for ( int i = 0; i < BaseUtils.BASES.length; i ++ ) {
for ( int j = 0; j < BaseUtils.BASES.length; j ++ ) {
table[i][j]=0;
}
}
this.conditions = conditions;
}
public boolean conditionsMatch(Object obj) {
if ( obj == null ) {
return false;
} else if ( obj instanceof BaseTransitionTable ) {
return ((BaseTransitionTable) obj).conditionsMatch(conditions);
} else if ( ! (obj instanceof List) ) {
return false;
} else if ( this.numConditions() != ((List)obj).size() ){
return false;
} else {
boolean eq = true;
ListIterator thisIter = this.getConditionIterator();
ListIterator thatIter = ((List)obj).listIterator();
while ( thisIter.hasNext() ) {
eq = eq && thisIter.next().equals(thatIter.next());
}
return eq;
}
}
public int compareTo(Object obj) {
if ( ! ( obj instanceof BaseTransitionTable ) ) {
return -1;
} else {
BaseTransitionTable t = (BaseTransitionTable) obj;
if ( this.conditionsMatch(t.conditions) ) {
return 0;
} else {
if ( this.numConditions() == t.numConditions() ) {
ListIterator<Comparable> thisIter = this.conditions.listIterator();
ListIterator<Comparable> thatIter = t.conditions.listIterator();
int g = 0;
do {
g = thisIter.next().compareTo(thatIter.next());
} while ( g == 0 );
return g;
} else {
return (this.numConditions() > t.numConditions() ) ? 1 : -1;
}
}
}
}
public void print( PrintStream out ) {
StringBuilder s = new StringBuilder();
for ( byte observedBase : BaseUtils.BASES ) {
for ( byte refBase : BaseUtils.BASES ) {
s.append(String.format("%s\t%s",(char)observedBase,(char)refBase));
for ( Comparable c : conditions ) {
s.append(String.format("\t%s",c.toString()));
}
s.append(String.format("\t%d%n", table[BaseUtils.simpleBaseToBaseIndex(observedBase)][BaseUtils.simpleBaseToBaseIndex(refBase)]));
}
}
out.print(s.toString());
}
public void update(byte observedBase, byte refBase ) {
//if ( observedBase == refBase ) {
// throw new StingException("BaseTransitionTable received equal observed and reference bases, which should not happen.");
//}
// System.out.println("Table updating: Observed Base: "+observedBase+" Ref base: "+refBase);
table[BaseUtils.simpleBaseToBaseIndex(observedBase)][BaseUtils.simpleBaseToBaseIndex(refBase)]++;
}
public int numConditions() {
return conditions.size();
}
private Comparable getCondition(int offset) {
return conditions.get(offset);
}
private ListIterator getConditionIterator() {
return conditions.listIterator();
}
public void incorporateTable(BaseTransitionTable t) {
for ( int i = 0; i < BaseUtils.BASES.length; i ++ ) {
for ( int j = 0; j < BaseUtils.BASES.length; j ++ ) {
table[i][j] += t.observationsOf(i,j);
}
}
}
public int observationsOf( int observedBaseIndex, int referenceBaseIndex ) {
return table[observedBaseIndex][referenceBaseIndex];
}
}

View File

@ -1,345 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers;
import org.broadinstitute.sting.utils.genotype.Genotype;
import org.broadinstitute.sting.utils.genotype.Variation;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.Utils;
import java.util.*;
/**
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
* <p/>
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
public class ConcordanceTruthTable {
public static final int TRUE_POSITIVE = 0;
public static final int TRUE_NEGATIVE = 1;
public static final int FALSE_POSITIVE = 2;
public static final int FALSE_NEGATIVE = 3;
public static final int VARIANT = 1;
private static final String[] POOL_HEADERS = {"TP","TN","FP","FN"};
public static final int REF = 0;
public static final int VAR_HET = 1;
public static final int VAR_HOM = 2;
public static final int UNKNOWN = 3;
public static final int NO_CALL = 3; // synonym
private static final String[] TRUTH_NAMES = {"IS_REF", "IS_VAR_HET", "IS_VAR_HOM", "UNKNOWN"};
private static final String[] CALL_NAMES = {"CALLED_REF", "CALLED_VAR_HET", "CALLED_VAR_HOM", "NO_CALL"};
private String name = null;
private boolean singleSampleMode;
private int[][] table;
private int[] truth_totals;
private int[] calls_totals;
public ConcordanceTruthTable(String name) {
// there's a specific sample associated with this truth table
this.name = name;
singleSampleMode = true;
table = new int[4][4];
truth_totals = new int[4];
calls_totals = new int[4];
for (int i = 0; i < 4; i++) {
truth_totals[i] = 0;
calls_totals[i] = 0;
for (int j = 0; j < 4; j++)
table[i][j] = 0;
}
}
public ConcordanceTruthTable(int nSamples) {
// there's no specific sample associated with this truth table
singleSampleMode = false;
name = "pooled_concordance";
truth_totals = new int[4];
calls_totals = new int[4];
for (int i = 0; i < 4; i++) {
truth_totals[i] = 0;
calls_totals[i] = 0;
}
initializeFrequencyTable(nSamples);
}
private void initializeFrequencyTable( int numChips ) {
// System.out.println("Frequency Table for Pooled Concordance initialized with number of chips = "+numChips);
table = new int[numChips*2][4];
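// Rows index the true alternate-allele count in the pool (0 .. 2*numChips - 1);
// columns are the four outcome classes in POOL_HEADERS (TP/TN/FP/FN).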
for (int i = 0; i < 4; i++) {
for ( int freq = 0; freq < 2*numChips; freq ++ ) {
table[freq][i] = 0;
}
}
// System.out.println("Table Size: "+table.length+" by "+table[1].length);
}
public String addEntry(List<Pair<Genotype, Genotype>> chipEvals, Variation eval, char ref) {
String violation = null;
// if the table represents a single sample, then we can calculate genotype stats
if ( singleSampleMode ) {
for ( Pair<Genotype, Genotype> chipEval : chipEvals ) {
Genotype chipG = chipEval.first;
Genotype evalG = chipEval.second;
if (chipG == null && evalG == null)
continue;
int truthType = getGenotype(chipG, ref);
int callType = getGenotype(evalG, ref);
//System.out.printf("TEST: %d/%d %s vs. %s%n", truthIndex, callIndex, chip, eval);
if ( truthType == VARIANT && callType != VARIANT ) {
violation = String.format("False negative: ref=%c chip=%s call=%s", ref, chipG, evalG);
} else if ( truthType == REF && callType == VARIANT ) {
violation = String.format("False positive: chip=%s call=%s", chipG, evalG);
}
addGenotypeEntry(truthType, callType);
}
} else { // if we cannot associate tables with individuals, then we are working in a pooled context
// first we need to expand our tables to include frequency information
Pair<Integer, Pair<Integer,Integer> > poolVariant = getPooledAlleleFrequency(chipEvals, ref);
int truthType = poolVariant.getFirst(); // convenience method; now to interpret
int callType = getCallIndex(eval,ref);
int numTrueSupportingAlleles = poolVariant.getSecond().getFirst();
if ( numTrueSupportingAlleles > 0 && truthType == VARIANT && callType != VARIANT ) {
violation = String.format("False negative: %s with %d alt alleles", chipEvals.get(0).getFirst(), numTrueSupportingAlleles);
} else if ( truthType == REF && callType == VARIANT ) {
violation = String.format("False positive: %s at hom-ref site", eval);
}
addFrequencyEntry( truthType, callType, poolVariant.getSecond().getFirst() );
}
// TODO -- implement me for pooled mode with frequency stats
// TODO -- You'll want to use eval and the chips from chipEvals (these are the first members of the pair)
// TODO -- You'll also need to declare (and initialize) the relevant data arrays for the data
// TODO -- Indexes like TRUE_POSITIVE are defined above for you
return violation;
}
public Pair<Integer, Pair<Integer,Integer>> getPooledAlleleFrequency( List<Pair<Genotype,Genotype>> chips, char ref) {
// Note: this method explicitly relies on the assumption that tri-allelic sites do not
// really exist, and that if they do the site will be marked as such by an 'N' in the
// reference, so we will not get to this point.
int frequency = 0;
int nChips = 0;
if ( chips != null ) {
for ( Pair<Genotype,Genotype> chip : chips ) {
Genotype c = chip.getFirst();
if ( c != null ) {
nChips++;
if ( c.isVariant(ref) ) {
if ( c.isHet() ) {
frequency++;
} else { // c is hom
frequency += 2;
}
}
//System.out.printf(" Genotype %s at %c => %d%n", c, ref, frequency);
}
}
//System.out.printf("*** %d%n", frequency);
}
int truthType = nChips > 0 ? ( frequency > 0 ? VARIANT : REF ) : NO_CALL;
return new Pair<Integer, Pair<Integer,Integer> >(truthType, new Pair<Integer,Integer>(frequency,nChips));
}
private void addFrequencyEntry( int truthIndex, int callIndex, int numTrueSupportingAlleles ) {
//System.out.printf(" %s %s %d%n", CALL_NAMES[truthIndex], CALL_NAMES[callIndex], numTrueSupportingAlleles);
calls_totals[callIndex]++;
truth_totals[truthIndex]++;
if ( truthIndex == REF && ( callIndex == REF || callIndex == NO_CALL ) ) {
// true negative
table[numTrueSupportingAlleles][TRUE_NEGATIVE]++;
// sanity check - there should never be an entry in
// [*][TRUE_NEGATIVE] for * > 0
} else if ( truthIndex == REF && callIndex == VARIANT ) {
// false positive
table[numTrueSupportingAlleles][FALSE_POSITIVE]++;
} else if ( truthIndex == VARIANT && (callIndex == NO_CALL || callIndex == REF) ) {
// false negative
table[numTrueSupportingAlleles][FALSE_NEGATIVE]++;
} else if ( truthIndex == VARIANT && callIndex == VARIANT ) {
// true positive
table[numTrueSupportingAlleles][TRUE_POSITIVE]++;
} else {
// anything else (e.g., a NO_CALL truth or an otherwise anomalous site) is not tabulated
}
}
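// Worked example (illustrative): a truth of VARIANT with numTrueSupportingAlleles == 3
// and a call of REF increments table[3][FALSE_NEGATIVE]; the same truth with a call of
// VARIANT would increment table[3][TRUE_POSITIVE] instead.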
private static int getCallIndex(Variation eval, char ref) {
int index;
if ( eval == null ) {
index = NO_CALL;
} else if ( ! eval.isSNP() ) {
index = REF;
} else {
index = VARIANT;
}
return index;
}
private static int getGenotype(Genotype g, char ref) {
int type;
if ( g == null )
type = NO_CALL;
else if ( !g.isVariant(ref) )
type = REF;
else if ( g.isHet() )
type = VAR_HET;
else
type = VAR_HOM;
return type;
}
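// Worked example (illustrative): against ref 'A', a null genotype maps to NO_CALL,
// A/A to REF, A/C to VAR_HET, and C/C to VAR_HOM.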
private void addGenotypeEntry(int truthIndex, int callIndex) {
table[truthIndex][callIndex]++;
truth_totals[truthIndex]++;
calls_totals[callIndex]++;
}
public void addAllStats(List<String> s) {
if ( singleSampleMode )
addGenotypeStats(s);
else
addFrequencyStats(s);
}
private void addFrequencyStats(List<String> s) {
s.add(String.format("name %s",name));
s.add("TRUTH_ALLELE_COUNT\tTRUTH_ALLELE_FREQ\tTOTAL\t" + Utils.join(" ", POOL_HEADERS));
for ( int af = 0; af < table.length; af ++ ) {
int sum = 0;
String counts = "";
for ( int errorIndex = 0; errorIndex < 4; errorIndex ++ ) {
int count = table[af][errorIndex];
sum += count;
counts += String.format(" %6d", count);
}
s.add(String.format("%6d %.3f %6d%s", af, ((double)af)/ table.length, sum, counts));
}
}
private void addGenotypeStats(List<String> s) {
s.add(String.format("name %s", name));
s.add(String.format("TRUTH_STATE\tCALLED_REF\tCALLED_VAR_HET\tCALLED_VAR_HOM\tNO_CALL\t\tTOTALS\tTRUE_GENOTYPE_CONCORDANCE\tGENOTYPE_SENSITIVITY"));
for (int i = 0; i < 4; i++) {
StringBuffer sb = new StringBuffer();
sb.append(String.format("%15s ", TRUTH_NAMES[i]));
for (int j = 0; j < 4; j++)
sb.append(String.format("%9d ", table[i][j]));
sb.append(String.format("%9d ", truth_totals[i]));
if (i == VAR_HET || i == VAR_HOM) {
sb.append(String.format("\t%s\t\t", cellPercent(table[i][i], table[i][REF] + table[i][VAR_HET] + table[i][VAR_HOM])));
sb.append(String.format("%s", cellPercent(truth_totals[i] - table[i][NO_CALL], truth_totals[i])));
} else {
sb.append("\tN/A\t\t\tN/A");
}
s.add(sb.toString());
}
addCalledGenotypeConcordance(s);
addOverallStats(s);
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 4; j++) {
s.add(String.format("%s_%s_%s %d", TRUTH_NAMES[i], CALL_NAMES[j], "NO_SITES", table[i][j]));
s.add(String.format("%s_%s_%s %s", TRUTH_NAMES[i], CALL_NAMES[j], "PERCENT_OF_TRUTH", cellPercent(table[i][j], truth_totals[i])));
s.add(String.format("%s_%s_%s %s", TRUTH_NAMES[i], CALL_NAMES[j], "PERCENT_OF_CALLS", cellPercent(table[i][j], calls_totals[j])));
}
if (i == VAR_HET || i == VAR_HOM) {
s.add(String.format("%s_%s %s", TRUTH_NAMES[i], "TRUE_GENOTYPE_CONCORDANCE", cellPercent(table[i][i], table[i][REF] + table[i][VAR_HET] + table[i][VAR_HOM])));
s.add(String.format("%s_%s %s", TRUTH_NAMES[i], "GENOTYPE_SENSITIVITY", cellPercent(truth_totals[i] - table[i][NO_CALL], truth_totals[i])));
}
}
}
private void addCalledGenotypeConcordance(List<String> s) {
StringBuilder sb = new StringBuilder();
sb.append("CALLED_GENOTYPE_CONCORDANCE\t");
for (int i = 0; i < 4; i++) {
int nConcordantCallsI = table[i][i];
String value = "N/A";
if (i != UNKNOWN)
value = String.format("%s\t", cellPercent(nConcordantCallsI, calls_totals[i] - table[UNKNOWN][i]));
sb.append(value);
}
s.add(sb.toString());
}
// How many overall calls were made that aren't NO_CALLs?
private int getNCalled() {
int n = 0;
for (int i = 0; i < 4; i++)
for (int j = 0; j < 4; j++)
if (i != NO_CALL && j != NO_CALL) n += table[i][j];
return n;
}
private void addOverallStats(List<String> s) {
int nConcordantRefCalls = table[REF][REF];
int nConcordantHetCalls = table[VAR_HET][VAR_HET];
int nConcordantVarHomCalls = table[VAR_HOM][VAR_HOM];
int nVarCalls = table[VAR_HOM][VAR_HET] + table[VAR_HOM][VAR_HOM] + table[VAR_HET][VAR_HET] + table[VAR_HET][VAR_HOM];
int nConcordantVarCalls = nConcordantHetCalls + nConcordantVarHomCalls;
int nConcordantCalls = nConcordantRefCalls + nConcordantVarCalls;
int nTrueVar = truth_totals[VAR_HET] + truth_totals[VAR_HOM];
int nCalled = getNCalled();
s.add(String.format("VARIANT_SENSITIVITY %s", cellPercent(nVarCalls, nTrueVar)));
s.add(String.format("VARIANT_CONCORDANCE %s", cellPercent(nConcordantVarCalls, nVarCalls)));
s.add(String.format("OVERALL_CONCORDANCE %s", cellPercent(nConcordantCalls, nCalled)));
}
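// Worked example (illustrative): with 90 concordant ref calls, 8 concordant hets,
// 1 concordant hom-var, 1 het miscalled as hom-var, and truth totals of 9 hets plus
// 1 hom-var, nVarCalls = 10 and nTrueVar = 10, giving VARIANT_SENSITIVITY = 100.00%
// and VARIANT_CONCORDANCE = 90.00%.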
private static String cellPercent(int count, int total) {
StringBuilder sb = new StringBuilder();
total = Math.max(total, 1); // guard against division by zero (Math.max(total, 0) still allowed total == 0)
sb.append(String.format("%.2f", (100.0 * count) / total));
sb.append("%");
return sb.toString();
}
}

View File

@ -1,167 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import net.sf.samtools.SAMRecord;
import java.util.List;
/**
 * Reports contiguous regions whose non-MQ0 coverage exceeds a cutoff, merging nearby
 * intervals into larger ones before printing them with their length and peak coverage.
 *
 * User: asivache
 * Date: Dec 3, 2009
 */
@Requires({DataSource.READS, DataSource.REFERENCE})
public class DSBWalker extends LocusWalker<Integer,Integer> {
@Argument(fullName="coverage",shortName="C",doc="Regions with coverage above specified threshold will be reported",required=true)
int COV_CUTOFF = 0;
@Argument(fullName="minLength",shortName="ml",doc="Only regions longer than the specified value will be reported",required=false)
int MINLENGTH_CUTOFF = 0;
private int MERGE_DIST = 300; // merge intervals that are closer than this distance from one another
private long maxcov = 0;
private long maxz = 0;
private long mergedmaxcov = 0;
private long mergedmaxz = 0;
GenomeLoc mergedInterval = null;
GenomeLoc currentInterval = null;
private long nIntervals = 0;
private void emit(GenomeLoc l) {
if ( mergedInterval == null ) {
mergedInterval = l.clone();
mergedmaxcov = maxcov;
mergedmaxz = maxz;
return;
}
if ( mergedInterval.getContigIndex() != l.getContigIndex() ) {
long length = mergedInterval.getStop()-mergedInterval.getStart()+1;
if ( length >= MINLENGTH_CUTOFF ) {
out.println(mergedInterval+"\t"+length+"\t"+mergedmaxcov+"\t"+mergedmaxz); // eject old interval
nIntervals++;
}
mergedInterval = l.clone();
mergedmaxcov = maxcov;
mergedmaxz = maxz;
return;
}
// merged interval exists and new interval is on the same contig. Check if the new interval
// is close enough so we got to merge and keep waiting:
if ( l.getStart() - mergedInterval.getStop() < MERGE_DIST ) {
mergedInterval = GenomeLocParser.setStop(mergedInterval,l.getStop());
if ( maxcov > mergedmaxcov) mergedmaxcov = maxcov;
if ( maxz > mergedmaxz ) mergedmaxz = maxz;
return;
}
// nope, new interval is far enough. Print old one and keep current one.
long length = mergedInterval.getStop()-mergedInterval.getStart()+1;
if ( length >= MINLENGTH_CUTOFF ) {
out.println(mergedInterval+"\t"+length+"\t"+mergedmaxcov+"\t"+mergedmaxz); // eject old interval
nIntervals++;
}
mergedInterval = l.clone();
mergedmaxcov = maxcov;
mergedmaxz = maxz;
}
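// Worked example (illustrative): with MERGE_DIST = 300, a buffered interval chr1:1000-1500
// and a new interval starting at chr1:1700 are merged (1700 - 1500 < 300) into one interval
// ending at the new stop; a new interval starting at chr1:1900 would instead force the
// buffered interval to be printed and start a fresh buffer.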
public void onTraversalDone() {
if ( mergedInterval != null ) {
long length = mergedInterval.getStop()-mergedInterval.getStart()+1;
if ( length >= MINLENGTH_CUTOFF ) {
out.println(mergedInterval+"\t"+length+"\t"+mergedmaxcov+"\t"+mergedmaxz); // eject old interval
nIntervals++;
}
}
System.out.println(nIntervals+" intervals detected.");
}
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
ReadBackedPileup pileup = context.getPileup();
List<SAMRecord> reads = pileup.getReads();
int nZero = pileup.getNumberOfMappingQualityZeroReads();
int nonZCoverage = reads.size() - nZero;
if ( nonZCoverage >= COV_CUTOFF ) {
// if we were not inside an interval, start one:
if ( currentInterval == null ) {
maxcov = nonZCoverage;
maxz = nZero;
currentInterval = context.getLocation().clone();
// System.out.println("Setting current to "+currentInterval);
return 0;
}
// if we were inside an interval and we just jumped onto a new contig, get rid of the old interval
if ( currentInterval.compareContigs(context.getLocation()) != 0 ) {
// we just moved to a new contig
System.out.println("On contig "+context.getLocation().getContig());
emit(currentInterval);
maxcov = nonZCoverage;
maxz = nZero;
currentInterval = context.getLocation().clone();
return 0;
}
// we are on the same contig, we are within the interval, so we need to extend the current interval:
currentInterval = GenomeLocParser.setStop(currentInterval,context.getLocation().getStop()); // still within the interval, adjust stop
//System.out.println("Extending current to "+currentInterval +" ("+context.getLocation()+", "+context.getLocation().getStop()+")");
if ( nonZCoverage > maxcov ) maxcov = nonZCoverage; // adjust maxcov
if ( nZero > maxz ) maxz = nZero; // adjust maxz
} else {
// low coverage, if we were inside an interval, it stops now:
if ( currentInterval != null ) {
// System.out.println("Emitting current as "+currentInterval);
emit(currentInterval);
currentInterval = null;
maxcov = 0;
maxz = 0;
}
}
return 0;
}
/**
* Provide an initial value for reduce computations.
*
* @return Initial value of reduce.
*/
public Integer reduceInit() {
return 0;
}
/**
* Reduces a single map with the accumulator provided as the ReduceType.
*
* @param value result of the map.
* @param sum accumulator for the reduce.
* @return accumulator with result of the map taken into account.
*/
public Integer reduce(Integer value, Integer sum) {
return sum + value;
}
}

View File

@ -1,360 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.CircularArray;
import org.broadinstitute.sting.utils.PrimitivePair;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
import net.sf.samtools.SAMRecord;
import java.util.List;
import java.util.Set;
/**
 * Slides a fixed-size window across the genome, comparing signal coverage against a
 * background control and reporting windows whose signal/control enrichment exceeds a cutoff.
 *
 * User: asivache
 * Date: Dec 12, 2009
 */
@Requires({DataSource.READS, DataSource.REFERENCE})
public class DSBWalkerV2 extends LocusWalker<Integer,Integer> {
// @Argument(fullName="coverage",shortName="C",doc="Regions with coverage above specified threshold will be reported",required=true)
// int COV_CUTOFF = 0;
// @Argument(fullName="minLength",shortName="ml",doc="Only regions longer than the specified value will be reported",required=false)
// int MINLENGTH_CUTOFF = 0;
@Argument(fullName="windowSize",shortName="W",doc="Size of the sliding window",required=true)
int WINDOW_SIZE = 100;
@Argument(fullName="enrichmentCutoff",shortName="E",doc="Report windows with enrichment (signal/control) above this cutoff",required=true)
double ENRICHMENT_CUTOFF = 5.0;
@Argument(fullName="minSignal",shortName="ms",doc="Do not report windows with signal lower than this value "+
"(this cutoff is secondary to enrichmentCutoff and guards against windows where control signal is 0 or too low,"+
"so that control*enrichmentCutoff is too low to be convincing)",required=true)
int MIN_SIGNAL = 10;
private CircularArray<PrimitivePair.Int> signalWindow = null;
private CircularArray<PrimitivePair.Int> controlWindow = null;
private CircularArray<PrimitivePair.Int> signalStrandsWindow = null;
private CircularArray<PrimitivePair.Int> controlStrandsWindow = null;
private PrimitivePair.Long totalSignalCoverage = new PrimitivePair.Long();
private PrimitivePair.Long totalControlCoverage = new PrimitivePair.Long();
private PrimitivePair.Long totalSignalFwdStrands = new PrimitivePair.Long();
private PrimitivePair.Long totalControlFwdStrands = new PrimitivePair.Long();
private Set<String> signalReadGroups; // we are going to remember which read groups are stimulated tagged and which are unstimulated untagged in order to be able
private Set<String> controlReadGroups ; // to properly assign the reads coming from a merged stream
private long windowStart = -1;
private long windowStop = -1;
private int curContig = -1;
private String curContigName = "";
// the following variables are for buffering and merging windows :
private long regionStart = -1;
private long lastWindowStart = -1;
private PrimitivePair.Int maxSignalReads = new PrimitivePair.Int();
private PrimitivePair.Int minSignalReads = new PrimitivePair.Int();
private PrimitivePair.Int maxControlReads = new PrimitivePair.Int();
private PrimitivePair.Int minControlReads = new PrimitivePair.Int();
private double minEnrichmentUnique;
private double maxEnrichmentUnique;
private double minEnrichmentNonUnique;
private double maxEnrichmentNonUnique;
private double minEnrichmentTotal;
private double maxEnrichmentTotal;
private double minUniqueSignalStrandBalance = 0.0;
private double maxUniqueSignalStrandBalance = 0.0;
private double minNonUniqueSignalStrandBalance = 0.0;
private double maxNonUniqueSignalStrandBalance = 0.0;
private double minUniqueControlStrandBalance = 0.0;
private double maxUniqueControlStrandBalance = 0.0;
private double minNonUniqueControlStrandBalance = 0.0;
private double maxNonUniqueControlStrandBalance = 0.0;
@Override
public void initialize() {
int nSams = getToolkit().getArguments().samFiles.size();
if ( nSams != 2 ) {
throw new StingException("Two input bam files (signal and background control) must be specified");
}
List<Set<String>> readGroupSets = getToolkit().getMergedReadGroupsByReaders();
signalReadGroups = readGroupSets.get(0);
// System.out.println(signalReadGroups.size()+" read groups in signal");
controlReadGroups = readGroupSets.get(1);
// System.out.println(controlReadGroups.size()+" read groups in control");
signalWindow = new CircularArray<PrimitivePair.Int>(WINDOW_SIZE);
controlWindow = new CircularArray<PrimitivePair.Int>(WINDOW_SIZE);
signalStrandsWindow = new CircularArray<PrimitivePair.Int>(WINDOW_SIZE);
controlStrandsWindow = new CircularArray<PrimitivePair.Int>(WINDOW_SIZE);
}
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
ReadBackedPileup pileup = context.getPileup();
List<SAMRecord> reads = pileup.getReads();
// compute coverages at the current site:
PrimitivePair.Int signalCov = new PrimitivePair.Int();
PrimitivePair.Int controlCov = new PrimitivePair.Int();
PrimitivePair.Int signalFwdStrands = new PrimitivePair.Int();
PrimitivePair.Int controlFwdStrands = new PrimitivePair.Int();
for ( SAMRecord r : reads ) {
if ( signalReadGroups.contains( r.getReadGroup().getReadGroupId() ) ) {
if ( r.getMappingQuality() == 0 ) {
signalCov.second++;
if ( ! r.getReadNegativeStrandFlag() ) signalFwdStrands.second++;
}
else {
signalCov.first++;
if ( ! r.getReadNegativeStrandFlag() ) signalFwdStrands.first++;
}
} else {
if ( controlReadGroups.contains( r.getReadGroup().getReadGroupId() ) ) {
if ( r.getMappingQuality() == 0 ) {
controlCov.second++;
if ( ! r.getReadNegativeStrandFlag() ) controlFwdStrands.second++;
}
else {
controlCov.first++;
if ( ! r.getReadNegativeStrandFlag() ) controlFwdStrands.first++;
}
} else {
throw new StingException("Read "+r+" belongs to unknown read group ("+r.getReadGroup()+")");
}
}
}
GenomeLoc loc = context.getLocation();
// if ( curContig != 0 ) System.out.println(loc+" "+signalCov.first+" "+signalCov.second+" "+controlCov.first+" "+controlCov.second);
if ( loc.getContigIndex() != curContig || loc.getStart() >= windowStop+WINDOW_SIZE ) {
// we jumped to the next contig, or we are on the same contig but the current position is
// more than WINDOW_SIZE away from the current window's end (i.e. there's nothing to shift)
checkCurrentWindow(true);
if ( loc.getContigIndex() != curContig ) {
System.out.println("on contig "+loc.getContig());
}
curContig = loc.getContigIndex();
curContigName = loc.getContig();
// prevPos = loc.getStart();
windowStart = loc.getStart();
windowStop = windowStart + WINDOW_SIZE - 1;
signalWindow.clear();
controlWindow.clear();
totalSignalCoverage.assignFrom( signalCov );
totalControlCoverage.assignFrom( controlCov );
totalSignalFwdStrands.assignFrom( signalFwdStrands );
totalControlFwdStrands.assignFrom( controlFwdStrands );
signalWindow.set(0,signalCov);
controlWindow.set(0,controlCov);
signalStrandsWindow.set(0,signalFwdStrands);
controlStrandsWindow.set(0,controlFwdStrands);
return 1;
}
// offset of the current position w.r.t. the start of the window:
int offset = (int)(loc.getStart() - windowStart);
if ( offset >= WINDOW_SIZE ) {
// if we are here, the current position is outside of the current window, but not
// far enough so that we'd need to reinitialize the window from scratch (that was already checked above).
// Now we need to shift.
// We are receiving covered positions in order, so we are guaranteed that everything prior to
// the current position was already counted; if some elements of the windows are still nulls, it means
// there was no coverage there
int shift = offset - WINDOW_SIZE + 1;
// scroll the window(s) base by base until the current position is inside the window. At each step
// we will check if the window meets the requirements and should be printed out.
for ( int i = 0 ; i < shift ; i++ ) {
// we are going to shift; check if the window as it is now is worth printing
checkCurrentWindow(false);
// discard coverage from the first element of the window (this element is about to be shifted out of scope)
if ( signalWindow.get(0) != null ) totalSignalCoverage.subtract(signalWindow.get(0));
if ( signalStrandsWindow.get(0) != null ) totalSignalFwdStrands.subtract(signalStrandsWindow.get(0));
if ( controlWindow.get(0) != null ) totalControlCoverage.subtract(controlWindow.get(0));
if ( controlStrandsWindow.get(0) != null ) totalControlFwdStrands.subtract(controlStrandsWindow.get(0));
// advance window coordinates on the ref
windowStart++;
windowStop++;
// shift the data in the window(s):
signalWindow.shiftData(1);
controlWindow.shiftData(1);
signalStrandsWindow.shiftData(1);
controlStrandsWindow.shiftData(1);
offset--; // this is the new offset w.r.t. to the shifted window
}
}
// at this point, either the current position was inside the current window, or it was outside,
// but the window was already shifted
totalSignalCoverage.add(signalCov);
totalControlCoverage.add(controlCov);
totalSignalFwdStrands.add(signalFwdStrands);
totalControlFwdStrands.add(controlFwdStrands);
signalWindow.set(offset,signalCov);
controlWindow.set(offset,controlCov);
signalStrandsWindow.set(offset,signalFwdStrands);
controlStrandsWindow.set(offset,controlFwdStrands);
return 1;
}
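// Worked example (illustrative): with WINDOW_SIZE = 100 and windowStart = 1000, a site at
// position 1150 has offset 150, so shift = 150 - 100 + 1 = 51 single-base scrolls are
// performed; afterwards windowStart = 1051 and the site lands at offset 99, the last cell
// of the window.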
/**
* Provide an initial value for reduce computations.
*
* @return Initial value of reduce.
*/
public Integer reduceInit() {
return 0; //To change body of implemented methods use File | Settings | File Templates.
}
/**
* Reduces a single map with the accumulator provided as the ReduceType.
*
* @param value result of the map.
* @param sum accumulator for the reduce.
* @return accumulator with result of the map taken into account.
*/
public Integer reduce(Integer value, Integer sum) {
return sum+value; //To change body of implemented methods use File | Settings | File Templates.
}
@Override
public void onTraversalDone(Integer result) {
printRegion();
super.onTraversalDone(result);
}
/** Checks if the currently held window satisfies the conditions set up for significance, and invokes buffered printout if so.
* If the parameter is set to true, printout of previously held region is forced, and the buffer is reinitialized with
* the new window if it passes the cutoffs, or left empty.
*
*/
private void checkCurrentWindow(boolean force) {
if ( force ) printRegion();
if ( signalWindow.get(0) == null && controlWindow.get(0) == null ) return; // do not emit windows that start from empty cell; we will get them later
if ( totalControlCoverage.first * ENRICHMENT_CUTOFF / 36.0 < MIN_SIGNAL ) { // control coverage zero or too low
if ( totalSignalCoverage.first /28.0 > MIN_SIGNAL ) emitWindow(false); // require at least MIN_SIGNAL coverage for signal
return;
}
// if we have decent coverage in control, just check for required enrichment in the signal
if ( ((double)totalSignalCoverage.first/28.0) / (totalControlCoverage.first/36.0) > ENRICHMENT_CUTOFF ) emitWindow(false);
}
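// Worked example (illustrative; 28.0 and 36.0 appear to be hardcoded normalization
// constants, presumably the signal and control read lengths): totalSignalCoverage.first = 560
// normalizes to 20 and totalControlCoverage.first = 108 normalizes to 3, so the enrichment
// 20/3 = 6.67 exceeds ENRICHMENT_CUTOFF = 5.0 and the window is emitted.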
/** This is actually a delayed print command: it buffers the successive windows set for printout, merges the windows that
* are close enough and prints only when a train of close-by windows has ended and next window received is far enough
*/
private void emitWindow(boolean force) {
if ( regionStart == -1 ) {
resetBuffer();
return;
}
if ( force || windowStart > lastWindowStart + WINDOW_SIZE ) {
// new window is far enough from the region we were buffering: emit old region
printRegion();
resetBuffer();
return;
}
// current window is too close (overlapping) with a previous one: we need to merge
lastWindowStart = windowStart;
maxSignalReads.first = Math.max(maxSignalReads.first, (int)Math.round(totalSignalCoverage.first/28.0));
maxSignalReads.second = Math.max(maxSignalReads.second,(int)Math.round(totalSignalCoverage.second/28.0));
minSignalReads.first = Math.min(minSignalReads.first, (int)Math.round(totalSignalCoverage.first/28.0));
minSignalReads.second = Math.min(minSignalReads.second,(int)Math.round(totalSignalCoverage.second/28.0));
maxControlReads.first = Math.max(maxControlReads.first,(int)Math.round(totalControlCoverage.first/36.0));
maxControlReads.second = Math.max(maxControlReads.second,(int)Math.round(totalControlCoverage.second/36.0));
minControlReads.first = Math.min(minControlReads.first,(int)Math.round(totalControlCoverage.first/36.0));
minControlReads.second = Math.min(minControlReads.second,(int)Math.round(totalControlCoverage.second/36.0));
maxEnrichmentUnique = Math.max(maxEnrichmentUnique,((double)totalSignalCoverage.first/28.0)/(totalControlCoverage.first/36.0));
minEnrichmentUnique = Math.min(minEnrichmentUnique, ((double)totalSignalCoverage.first/28.0)/(totalControlCoverage.first/36.0));
maxEnrichmentNonUnique = Math.max(maxEnrichmentNonUnique,((double)totalSignalCoverage.second/28.0)/(totalControlCoverage.second/36.0));
minEnrichmentNonUnique = Math.min( minEnrichmentNonUnique, ((double)totalSignalCoverage.second/28.0)/(totalControlCoverage.second/36.0) );
maxEnrichmentTotal = Math.max( maxEnrichmentTotal, ((double)(totalSignalCoverage.first+totalSignalCoverage.second)/28.0)/
((totalControlCoverage.first+ totalControlCoverage.second)/36.0) );
minEnrichmentTotal = Math.min( minEnrichmentTotal, ((double)(totalSignalCoverage.first+totalSignalCoverage.second)/28.0)/
((totalControlCoverage.first+ totalControlCoverage.second)/36.0) );
maxUniqueSignalStrandBalance = Math.max(maxUniqueSignalStrandBalance,((double)totalSignalFwdStrands.first)/totalSignalCoverage.first);
minUniqueSignalStrandBalance = Math.min(minUniqueSignalStrandBalance,((double)totalSignalFwdStrands.first)/totalSignalCoverage.first);
maxNonUniqueSignalStrandBalance = Math.max(maxNonUniqueSignalStrandBalance,((double)totalSignalFwdStrands.second)/totalSignalCoverage.second);
minNonUniqueSignalStrandBalance = Math.min(minNonUniqueSignalStrandBalance,((double)totalSignalFwdStrands.second)/totalSignalCoverage.second);
maxUniqueControlStrandBalance = Math.max(maxUniqueControlStrandBalance,((double)totalControlFwdStrands.first)/totalControlCoverage.first);
minUniqueControlStrandBalance = Math.min(minUniqueControlStrandBalance,((double)totalControlFwdStrands.first)/totalControlCoverage.first);
maxNonUniqueControlStrandBalance = Math.max(maxNonUniqueControlStrandBalance,((double)totalControlFwdStrands.second)/totalControlCoverage.second);
minNonUniqueControlStrandBalance = Math.min(minNonUniqueControlStrandBalance,((double)totalControlFwdStrands.second)/totalControlCoverage.second);
}
private void resetBuffer() {
regionStart = windowStart;
lastWindowStart = windowStart;
maxSignalReads.first = (int)Math.round(totalSignalCoverage.first/28.0);
maxSignalReads.second = (int)Math.round(totalSignalCoverage.second/28.0);
minSignalReads.assignFrom(maxSignalReads);
maxControlReads.first = (int)Math.round(totalControlCoverage.first/36.0);
maxControlReads.second = (int)Math.round(totalControlCoverage.second/36.0);
minControlReads.assignFrom(maxControlReads);
minEnrichmentUnique = maxEnrichmentUnique = ((double)totalSignalCoverage.first/28.0)/(totalControlCoverage.first/36.0);
minEnrichmentNonUnique = maxEnrichmentNonUnique = ((double)totalSignalCoverage.second/28.0)/(totalControlCoverage.second/36.0);
minEnrichmentTotal = maxEnrichmentTotal = ((double)(totalSignalCoverage.first+totalSignalCoverage.second)/28.0)/
((totalControlCoverage.first+ totalControlCoverage.second)/36.0);
minUniqueSignalStrandBalance = maxUniqueSignalStrandBalance = ((double)totalSignalFwdStrands.first)/totalSignalCoverage.first;
minNonUniqueSignalStrandBalance = maxNonUniqueSignalStrandBalance = ((double)totalSignalFwdStrands.second)/totalSignalCoverage.second;
minUniqueControlStrandBalance = maxUniqueControlStrandBalance = ((double)totalControlFwdStrands.first)/totalControlCoverage.first;
minNonUniqueControlStrandBalance = maxNonUniqueControlStrandBalance = ((double)totalControlFwdStrands.second)/totalControlCoverage.second;
}
private void printRegion() {
if ( regionStart == -1 ) return;
out.print(curContigName+":"+regionStart+"-"+windowStop+"\t"+(windowStop-regionStart+1) +"\t"+
minSignalReads.first+"-"+maxSignalReads.first+"\t"+
minSignalReads.second+"-"+maxSignalReads.second+"\t"+
minControlReads.first+"-"+maxControlReads.first+"\t"+
minControlReads.second+"-"+maxControlReads.second+"\t");
out.printf("%.2f-%.2f\t",minEnrichmentUnique,maxEnrichmentUnique);
out.printf("%.2f-%.2f\t",minEnrichmentNonUnique,maxEnrichmentNonUnique);
out.printf("%.2f-%.2f\t",minEnrichmentTotal,maxEnrichmentTotal);
out.printf("%.2f-%.2f\t",minUniqueSignalStrandBalance,maxUniqueSignalStrandBalance);
out.printf("%.2f-%.2f\t",minNonUniqueSignalStrandBalance,maxNonUniqueSignalStrandBalance);
out.printf("%.2f-%.2f\t",minUniqueControlStrandBalance,maxUniqueControlStrandBalance);
out.printf("%.2f-%.2f",minNonUniqueControlStrandBalance,maxNonUniqueControlStrandBalance);
if ( minUniqueSignalStrandBalance > 0.75 || minUniqueSignalStrandBalance < 0.25 ) out.print("\tS_U_STRAND_FILTER");
out.println();
regionStart = -1; // to indicate that there is nothing left to print, the buffer is empty
}
}

View File

@ -1,244 +0,0 @@
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.oneoffprojects.walkers;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.walkers.DuplicateWalker;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.duplicates.DupUtils;
import org.broadinstitute.sting.utils.duplicates.DuplicateComp;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
class MismatchCounter {
long nObs = 0;
long nMismatches = 0;
public void inc(long incNObs, long incNMismatches) {
nObs += incNObs;
nMismatches += incNMismatches;
}
public void inc(boolean mismatchP) {
inc(1, mismatchP ? 1 : 0);
}
public double mismatchRate() {
return (double)nMismatches / nObs;
}
public byte empiricalQualScore() {
return QualityUtils.probToQual(1 - mismatchRate(), 0);
}
public String headerString() {
return "mismatchRate\tempiricalQ\tnObs\tnMismatches";
}
public String toString() {
return String.format("%.10f\t%d\t%d\t%6d", mismatchRate(), empiricalQualScore(), nObs, nMismatches);
}
}
class QualityTracker {
final private int MAX_QUAL_SCORE = 100;
MismatchCounter[][] mismatchesByQ = new MismatchCounter[MAX_QUAL_SCORE][MAX_QUAL_SCORE];
public QualityTracker() {
for ( int i = 0; i < MAX_QUAL_SCORE; i++ ) {
for ( int j = 0; j < MAX_QUAL_SCORE; j++ ) {
mismatchesByQ[i][j] = new MismatchCounter();
}
}
}
public void inc(int b1Qi, int b2Qi, boolean mismatchP, boolean orderDependent) {
int b1Q = orderDependent ? b1Qi : Math.max(b1Qi, b2Qi);
int b2Q = orderDependent ? b2Qi : Math.min(b1Qi, b2Qi);
if ( b1Q > MAX_QUAL_SCORE ) throw new RuntimeException("Unexpectedly large base quality " + b1Q);
if ( b2Q > MAX_QUAL_SCORE ) throw new RuntimeException("Unexpectedly large base quality " + b2Q);
mismatchesByQ[b1Q][b2Q].inc(mismatchP);
}
public void inc(DuplicateComp dc, boolean orderDependent) {
inc(dc.getQLarger(), dc.getQSmaller(), dc.isMismatchP(), orderDependent);
}
public int probMismatchQ1Q2(int q1, int q2) {
double e1 = 1 - QualityUtils.qualToProb(q1);
double e2 = 1 - QualityUtils.qualToProb(q2);
// mismatch iff exactly one read errs, or both err to different bases ((2/3) of joint errors
// under a uniform error model); note the original "1/3" used integer division and evaluated to 0
double eMM = e1 * (1 - e2) + (1 - e1) * e2 + (2.0 / 3.0) * e1 * e2;
return QualityUtils.probToQual(1 - eMM, 0.0);
}
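// Worked example (illustrative): q1 = q2 = 20 gives e1 = e2 = 0.01, so
// eMM = 0.0099 + 0.0099 + (2/3)(0.0001) = 0.019867 and probToQual(1 - eMM)
// yields an expected mismatch quality of about Q17.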
public void printToStream(PrintStream out, boolean filterUnobserved) {
out.printf("Q1\tQ2\tQmin\t%s%n", mismatchesByQ[0][0].headerString());
for ( int i = 0; i < MAX_QUAL_SCORE; i++ ) {
for ( int j = 0; j < MAX_QUAL_SCORE; j++ ) {
MismatchCounter mc = mismatchesByQ[i][j];
//System.out.printf("MC = %s%n", mc);
if ( filterUnobserved && mc.nObs == 0 )
continue;
out.printf("%d\t%d\t%d\t%s\t%n", i, j, probMismatchQ1Q2(i,j), mc.toString());
}
}
}
}
public class DuplicateQualsWalker extends DuplicateWalker<List<DuplicateComp>, QualityTracker> {
@Argument(fullName="filterUnobservedQuals", required=false, doc="Show only quality bins with at least one observation in the data")
public boolean FILTER_UNOBSERVED_QUALS = false;
@Argument(fullName="maxPairwiseCompsPerDupSet", required=false, doc="Maximumize number of pairwise comparisons to perform among duplicate read sets")
public int MAX_PAIRSIZE_COMPS_PER_DUPLICATE_SET = 100;
@Argument(fullName="combinedQuals", required=false, doc="Combine and assess pairwise base qualities")
public boolean COMBINE_QUALS = false;
@Argument(fullName="combineAllDups", required=false, doc="Combine and assess pairwise base qualities")
public boolean COMBINE_ALL_DUPS = false;
@Argument(fullName="orderDependent", required=false, doc="")
public boolean orderDependent = false;
@Argument(fullName="compareToUniqueReads", required=false, doc="If true, then we will compare only to unique (i.e., non-duplicated molecules) at the same duplicate site")
public boolean compareToUniqueReads = false;
@Argument(fullName="comparePairToSingleton", required=false, doc="If true, then we will compare a combined dup to a random other read in the duplicate set, not a combined pair itself")
public boolean comparePairToSingleton = false;
final boolean DEBUG = false;
final private boolean ACTUALLY_DO_WORK = true;
public void onTraversalDone(QualityTracker result) {
result.printToStream(out, FILTER_UNOBSERVED_QUALS);
}
public QualityTracker reduceInit() {
return new QualityTracker();
}
public QualityTracker reduce(List<DuplicateComp> dupComps, QualityTracker tracker) {
for ( DuplicateComp dc : dupComps ) {
tracker.inc(dc, orderDependent);
}
return tracker;
}
// Print out data for regression
public List<DuplicateComp> map(GenomeLoc loc, AlignmentContext context, Set<List<SAMRecord>> readSets ) {
//logger.info(String.format("%s has %d duplicates and %d non-duplicates", loc, duplicateReads.size(), uniqueReads.size()));
List<DuplicateComp> pairwiseComps = new ArrayList<DuplicateComp>();
// todo -- fixme -- the logic here is all wrong given new interface
// if ( ! ACTUALLY_DO_WORK )
// return pairwiseComps;
//
// if ( COMBINE_QUALS ) {
// Pair<SAMRecord, SAMRecord> combinedReads = DupUtils.combinedReadPair( duplicateReads );
// if ( combinedReads != null ) {
// SAMRecord combined1 = combinedReads.first;
// SAMRecord combined2 = combinedReads.second;
//
// if ( comparePairToSingleton )
// pairwiseComps = addPairwiseMatches( pairwiseComps, combined1, duplicateReads.get(2), uniqueReads );
// else
// pairwiseComps = addPairwiseMatches( pairwiseComps, combined1, combined2, uniqueReads );
// }
// } else {
// int nComparisons = 0;
// for ( SAMRecord read1 : duplicateReads ) {
// for ( SAMRecord read2 : duplicateReads ) {
// if ( read1.hashCode() < read2.hashCode() && DupUtils.usableDuplicate(read1, read2) ) {
// // the hashcode insures we don't do A vs. B and B vs. A
// //System.out.printf("Comparing %s against %s%n", read1, read2);
// nComparisons++;
// pairwiseComps = addPairwiseMatches( pairwiseComps, read1, read2, uniqueReads );
// if ( nComparisons > MAX_PAIRSIZE_COMPS_PER_DUPLICATE_SET )
// break;
// }
// }
// }
// }
return pairwiseComps;
}
private List<DuplicateComp> addPairwiseMatches(List<DuplicateComp> comps,
SAMRecord read1, SAMRecord read2,
List<SAMRecord> uniqueReads ) {
if ( compareToUniqueReads ) {
// we want to compare to a read in the unique read set
if ( uniqueReads.size() > 0 ) { // there's actually something to compare to
SAMRecord uniqueRead = uniqueReads.get(0); // might as well get the first one
return pairwiseMatches(comps, read1, uniqueRead);
} else {
return comps;
}
} else {
// default, just do read1 vs. read2
return pairwiseMatches(comps, read1, read2);
}
}
/**
* Calculates the pairwise mismatches between reads read1 and read2 and adds the results to the comps list.
* Contains no logic deciding what to compare; it simply walks read1 against read2 base by base.
*
* @param comps list of duplicate comparisons to append to
* @param read1 first read of the pair
* @param read2 second read of the pair
* @return the comps list with one DuplicateComp added per compared base
*/
private List<DuplicateComp> pairwiseMatches(List<DuplicateComp> comps, SAMRecord read1, SAMRecord read2 ) {
byte[] read1Bases = read1.getReadBases();
byte[] read1Quals = read1.getBaseQualities();
byte[] read2Bases = read2.getReadBases();
byte[] read2Quals = read2.getBaseQualities();
for ( int i = 0; i < read1Bases.length; i++) {
byte qual1 = read1Quals[i];
byte qual2 = read2Quals[i];
boolean mismatchP = ! BaseUtils.basesAreEqual(read1Bases[i], read2Bases[i]);
DuplicateComp dc = new DuplicateComp(qual1, qual2, mismatchP);
comps.add(dc);
}
return comps;
}
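// Worked example (illustrative): read1 = ACGT with quals {30,30,20,10} vs read2 = ACCT
// with quals {30,30,25,10} produces four DuplicateComp entries, of which only the one at
// index 2 (quals 20 vs 25) is flagged as a mismatch.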
}

View File

@ -1,193 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.playground.gatk.walkers.poolseq.PowerBelowFrequencyWalker;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.genotype.Genotype;
import org.broadinstitute.sting.utils.genotype.VariantBackedByGenotype;
import org.broadinstitute.sting.utils.genotype.Variation;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.io.*;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
/**
* Writes per-site allelic information for HapMap pools: pooled allele counts from chip
* genotypes, sequencing depth, detection power, read support, and whether a call was made.
*
* User: chartl
* Date: Nov 12, 2009
*/
public class HapmapPoolAllelicInfoWalker extends LocusWalker<String, PrintWriter> {
@Argument(fullName="outputFile", shortName="of", doc="File to write to", required=true)
public String outputFileString = null;
@Argument(fullName="numIndividualsInPool", shortName="ps",doc="Pool size",required = true)
public int poolSize = -1;
@Argument(fullName="sampleNames", shortName="samples", doc="Sample name bindings", required=true)
public String sampleNameFile = null;
@Argument(fullName="minCallQuality", shortName="q", doc="Ignore calls with below this quality, defaults to -1")
public double minCallQ = -1;
private PrintWriter output;
private static final double EPSILON = 1e-4;
private String[] sampleNames = null;
private PowerBelowFrequencyWalker powerWalker = null;
private ConcordanceTruthTable ctt = null;
public void initialize() {
sampleNames = generateNameTableFromFile(sampleNameFile);
powerWalker = new PowerBelowFrequencyWalker();
powerWalker.initialize();
powerWalker.setPoolSize(poolSize);
ctt = new ConcordanceTruthTable(poolSize);
}
public PrintWriter reduceInit() {
try {
output = new PrintWriter(outputFileString);
} catch (FileNotFoundException e) {
throw new StingException("File "+outputFileString+" could not be opened.", e);
}
output.printf("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%n","Chrom","Pos","Ref","Var","Num_Alleles","Num_Chips","Depth","Power","Support","Called");
//System.out.printf("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%n","Chrom","Pos","Ref","Var","Num_Alleles","Depth","Power","Support","Called");
return output;
}
public String map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
GenomeLoc loc = context.getLocation();
String chrom = loc.getContig();
long pos = loc.getStart();
char refBase = Character.toUpperCase(ref.getBase());
List<Pair<Genotype, Genotype>> chips = getChips(sampleNames, tracker);
Pair<Integer,Pair<Integer,Integer>> alleleFreqInfo = ctt.getPooledAlleleFrequency(chips,refBase);
char alternate;
if ( alleleFreqInfo.first == ConcordanceTruthTable.VARIANT ) {
//System.out.println(refBase + " " + alleleFreqInfo.getFirst().getBases());
alternate = getAlternateBase(chips,refBase);
} else {
return null; // early return
}
int numVariantAllele = alleleFreqInfo.getSecond().getFirst();
int numChipsObserved = alleleFreqInfo.getSecond().getSecond();
int depth = context.size();
double power = powerWalker.calculatePowerAtFrequency(context,numVariantAllele);
int called;
Variation call = tracker.lookup("calls",Variation.class);
if ( call == null ) {
called = 0;
} else if ( call.isReference() || call.getNegLog10PError() < minCallQ-EPSILON ) {
called = 0;
} else {
called = 1;
}
ReadBackedPileup p = context.getPileup();
int support = p.getBaseCounts()[BaseUtils.simpleBaseToBaseIndex(alternate)];
// sanity check: the chip data should never be classified VARIANT when the alternate base equals the reference
if ( refBase == alternate && alleleFreqInfo.first == ConcordanceTruthTable.VARIANT ) {
logger.warn("Called as a variant, but alternate == ref (" + refBase + ")");
}
return String.format("%s\t%d\t%c\t%c\t%d\t%d\t%d\t%f\t%d\t%d",chrom,pos,refBase,alternate,numVariantAllele,numChipsObserved,depth,power,support,called);
}
public char getAlternateBase(List<Pair<Genotype, Genotype>> chips, char ref) {
for ( Pair<Genotype, Genotype> chip : chips ) {
Genotype g = chip.first;
char[] bases = g.getBases().toCharArray();
if ( Character.toUpperCase(bases[0]) != ref )
return bases[0];
if ( Character.toUpperCase(bases[1]) != ref )
return bases[1];
}
return ref;
}
public PrintWriter reduce(String s, PrintWriter p) {
if ( s == null ) {
// do nothing
return p;
} else {
//System.out.printf("%s%n",s);
output.printf("%s%n",s);
return p;
}
}
public void onTraversalDone(PrintWriter p) {
output.close();
}
private List<Pair<Genotype,Genotype>> getChips(String[] rodNames, RefMetaDataTracker tracker) {
List<Pair<Genotype, Genotype>> chips = new ArrayList <Pair<Genotype,Genotype>>(rodNames.length);
for ( String name : rodNames ) {
List<Object> rods = tracker.getReferenceMetaData(name);
Variation chip = (rods.size() == 0 ? null : (Variation)rods.get(0));
if ( chip != null ) {
// chips must be Genotypes
if ( !(chip instanceof VariantBackedByGenotype) )
throw new StingException("Failure: trying to analyze genotypes using non-genotype truth data");
chips.add(new Pair<Genotype,Genotype>(((VariantBackedByGenotype)chip).getCalledGenotype(),null));
}
}
return chips;
}
// private methods for reading in names from a file
private String[] generateNameTableFromFile(String file) {
BufferedReader reader;
try {
reader = new BufferedReader(new FileReader(file));
} catch( FileNotFoundException e) {
String errMsg = "Hapmap pool file at "+file+" was not found. Please check filepath.";
throw new StingException(errMsg, e);
}
LinkedList<String> nameList = new LinkedList<String>();
while(continueReading(reader)) {
String line = readLine(reader);
nameList.add(line);
}
return nameList.toArray(new String[nameList.size()]);
}
private boolean continueReading(BufferedReader reader) {
boolean continueReading;
try {
continueReading = reader.ready();
} catch(IOException e) {
continueReading = false;
}
return continueReading;
}
private String readLine(BufferedReader reader) {
String line;
try {
line = reader.readLine();
} catch( IOException e) {
String errMsg = "BufferedReader pointing to "+reader.toString()+" was declared ready but no line could be read from it.";
throw new StingException(errMsg,e);
}
return line;
}
}

View File

@ -1,37 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broad.tribble.vcf.VCFHeaderLineType;
import org.broad.tribble.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*;
import java.util.Map;
import java.util.HashMap;
import java.util.List;
import java.util.Arrays;
public class QualityAdjustedSecondBaseLod implements InfoFieldAnnotation, ExperimentalAnnotation {
private final String KEY_NAME = "Qual_Adjusted_2blod";
private final double CHI_LOD_MAX = -1000.0;
private final SecondBaseSkew skewCalc = new SecondBaseSkew();
private final double log10e = Math.log10(Math.E);
private final double log10half = Math.log10(1.0/2);
public List<String> getKeyNames() { return Arrays.asList(KEY_NAME); }
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(KEY_NAME, 1, VCFHeaderLineType.Float, "Adjusted residual quality based on second-base skew")); }
public Map<String, Object> annotate( RefMetaDataTracker tracker, ReferenceContext ref, Map<String, StratifiedAlignmentContext> contexts, VariantContext vc) {
String chi = skewCalc.getAnnotation(ref, contexts, vc);
if ( chi == null )
return null;
double chi_square = Double.valueOf(chi);
double chi_loglik = chi_square <= 0.0 ? 0.0 : Math.max(-(chi_square/2.0)*log10e + log10half,CHI_LOD_MAX); // cap it...
Map<String, Object> map = new HashMap<String, Object>();
map.put(getKeyNames().get(0), String.format("%f", 10*(vc.getNegLog10PError() + chi_loglik)));
return map;
}
}

View File

@ -1,115 +0,0 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broad.tribble.vcf.VCFHeaderLineType;
import org.broad.tribble.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*;
import java.util.Map;
import java.util.HashMap;
import java.util.List;
import java.util.Arrays;
public class SecondBaseSkew implements InfoFieldAnnotation, ExperimentalAnnotation {
private final static double epsilon = Math.pow(10.0,-12.0);
private final static String KEY_NAME = "2b_Chi";
private final static double[] UNIFORM_ON_OFF_RATIO = {1.0/3.0, 2.0/3.0};
private double[] proportionExpectations = UNIFORM_ON_OFF_RATIO;
public List<String> getKeyNames() { return Arrays.asList(KEY_NAME); }
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(KEY_NAME, 1, VCFHeaderLineType.Float, "Chi-square Secondary Base Skew")); }
public Map<String, Object> annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, StratifiedAlignmentContext> stratifiedContexts, VariantContext vc) {
if ( stratifiedContexts.size() == 0 )
return null;
String annotation = getAnnotation(ref, stratifiedContexts, vc);
if ( annotation == null )
return null;
Map<String, Object> map = new HashMap<String, Object>();
map.put(getKeyNames().get(0), annotation);
return map;
}
public String getAnnotation(ReferenceContext ref, Map<String, StratifiedAlignmentContext> stratifiedContexts, VariantContext vc) {
if ( !vc.isBiallelic() || !vc.isSNP() )
return null;
char alternate = vc.getAlternateAllele(0).toString().charAt(0);
Pair<Integer, Integer> depth = new Pair<Integer, Integer>(0, 0);
for ( String sample : stratifiedContexts.keySet() ) {
//Pair<Integer,Integer> sampleDepth = getSecondaryPileupNonrefCount(ref.getBase(),stratifiedContexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getPileup(), alternate);
Pair<Integer, Integer> sampleDepth = getSecondaryPileupNonrefCount(ref.getBaseAsChar(), stratifiedContexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup(), alternate);
depth.first += sampleDepth.first;
depth.second += sampleDepth.second;
}
if ( depth.first == 0 )
return null;
double biasedProportion = (1.0 + depth.second) / (1.0 + depth.first);
double p_transformed = transform(biasedProportion, depth.first+1);
double expected_transformed = transform(proportionExpectations[0], depth.first+1);
double chi_square = Math.signum(biasedProportion - proportionExpectations[0])*Math.min(Math.pow(p_transformed - expected_transformed, 2), Double.MAX_VALUE);
return String.format("%f", chi_square);
}
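// Worked example (illustrative): a depth pair of (20 variant reads, 12 with a reference
// second base) gives biasedProportion = 13/21 = 0.619; transformed at depth 21 against
// the uniform expectation of 1/3, the signed squared difference is roughly +6.8.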
private double transform( double proportion, int depth ) {
proportion = proportion - epsilon;
return proportion / ( Math.sqrt ( proportion*(1-proportion)/depth ) );
}
private Pair<Integer, Integer> getSecondaryPileupNonrefCount(char ref, ReadBackedPileup p, char snp ) {
int variantDepth = 0;
int variantsWithRefSecondBase = 0;
for (PileupElement pile : p ) {
byte pbase = pile.getBase();
byte sbase = pile.getSecondBase();
if ( BaseUtils.isRegularBase((char)sbase) && BaseUtils.basesAreEqual(pbase, (byte) snp) ) {
variantDepth++;
if ( BaseUtils.basesAreEqual(sbase, (byte)ref) ) {
variantsWithRefSecondBase++;
}
}
}
return new Pair<Integer, Integer>(variantDepth, variantsWithRefSecondBase);
}
}

View File

@ -1,217 +0,0 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.concordance;
import org.broad.tribble.vcf.VCFGenotypeRecord;
import org.broad.tribble.vcf.VCFHeader;
import org.broad.tribble.vcf.VCFHeaderLine;
import org.broad.tribble.vcf.VCFRecord;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.classloader.PackageUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.utils.genotype.vcf.*;
import java.io.File;
import java.util.*;
/**
* Determines the concordance between multiple VCF call sets at each position.
* Users can specify which concordance tests should be run.
*/
@Requires(value={DataSource.REFERENCE})
@Reference(window=@Window(start=-20,stop=20))
public class CallsetConcordanceWalker extends RodWalker<Integer, Integer> {
@Argument(fullName="concordance_output", shortName="CO", doc="VCF file to which output should be written", required=true)
private File OUTPUT = null;
@Argument(fullName="concordanceType", shortName="CT", doc="Concordance subset types to apply to given callsets. Syntax: 'type[:key1=arg1,key2=arg2,...]'", required=false)
private String[] TYPES = null;
@Argument(fullName="list", shortName="ls", doc="List the available concordance types and exit", required=false)
private Boolean LIST_ONLY = false;
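// Illustrative usage (the type and key names below are hypothetical, not shipped types):
//   -CT SimpleVenn -CT GenotypeConcordance:sample=NA12878,minQ=30
// i.e., each -CT value names a concordance type, optionally followed by ':' and
// comma-separated key=value arguments.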
// the concordance tests to run
private ArrayList<ConcordanceType> requestedTypes;
// VCF writer for the output of the concordance tests
private VCFWriter vcfWriter;
// a map of rod name to uniquified sample name
private HashMap<Pair<String, String>, String> rodNamesToSampleNames = new HashMap<Pair<String, String>, String>();
/**
* Prepare the output file and the list of available features.
*/
public void initialize() {
// get the possible concordance types
List<Class<? extends ConcordanceType>> classes = PackageUtils.getClassesImplementingInterface(ConcordanceType.class);
// print and exit if that's what was requested
if ( LIST_ONLY ) {
out.println("\nAvailable concordance types:");
for (int i = 0; i < classes.size(); i++)
out.println("\t" + classes.get(i).getSimpleName());
out.println();
System.exit(0);
}
// get the list of all sample names from the various input rods (they need to be uniquified in case there's overlap)
HashSet<String> samples = new HashSet<String>();
SampleUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, rodNamesToSampleNames);
for ( java.util.Map.Entry<Pair<String, String>, String> entry : rodNamesToSampleNames.entrySet() ) {
logger.debug("Uniquified sample mapping: " + entry.getKey().first + "/" + entry.getKey().second + " -> " + entry.getValue());
}
// initialize requested concordance types
requestedTypes = new ArrayList<ConcordanceType>();
if (TYPES != null) {
for ( String requestedTypeString : TYPES ) {
String[] requestedPieces = requestedTypeString.split(":");
String requestedType = requestedPieces[0];
boolean foundClass = false;
for ( Class type : classes ) {
if (requestedType.equalsIgnoreCase(type.getSimpleName())) {
foundClass = true;
try {
ConcordanceType concordance = (ConcordanceType)type.newInstance();
HashMap<String,String> requestedArgs = new HashMap<String,String>();
if ( requestedPieces.length == 2 ) {
String[] argStrings = requestedPieces[1].split(",");
for (int i = 0; i < argStrings.length; i++ ) {
String[] arg = argStrings[i].split("=");
if ( arg.length == 2 )
requestedArgs.put(arg[0], arg[1]);
}
}
concordance.initialize(requestedArgs, samples);
requestedTypes.add(concordance);
break;
} catch (InstantiationException e) {
throw new StingException(String.format("Cannot instantiate concordance class '%s': must be concrete class", type.getSimpleName()));
} catch (IllegalAccessException e) {
throw new StingException(String.format("Cannot instantiate concordance class '%s': must have no-arg constructor", type.getSimpleName()));
}
}
}
if ( !foundClass )
throw new StingException("The requested concordance type (" + requestedType + ") isn't a valid concordance option");
}
}
// set up the header fields
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
hInfo.add(new VCFHeaderLine("source", "CallsetConcordance"));
hInfo.add(new VCFHeaderLine("note", "\"This file represents a concordance test of various call sets - NOT the output from a multi-sample caller\""));
hInfo.addAll(getVCFAnnotationDescriptions(requestedTypes));
vcfWriter = new VCFWriter(OUTPUT);
vcfWriter.writeHeader(new VCFHeader(hInfo, samples));
}
public static Set<VCFHeaderLine> getVCFAnnotationDescriptions(Collection<ConcordanceType> types) {
TreeSet<VCFHeaderLine> descriptions = new TreeSet<VCFHeaderLine>();
for ( ConcordanceType type : types )
descriptions.add(type.getInfoDescription());
return descriptions;
}
public Integer map(RefMetaDataTracker rodData, ReferenceContext ref, AlignmentContext context) {
if ( rodData == null ) // RodWalkers can make funky map calls
return 0;
// get all of the vcf rods at this locus
Map<VCFRecord,String> vcfRods = new LinkedHashMap<VCFRecord,String>();
Iterator<GATKFeature> rods = rodData.getAllRods().iterator();
while (rods.hasNext()) {
GATKFeature rod = rods.next();
if ( rod.getUnderlyingObject() instanceof VCFRecord ) {
VCFRecord vcf = (VCFRecord)rod.getUnderlyingObject();
// the previous check used containsKey(rod), which could never match a VCFRecord-keyed map
if (vcfRods.containsKey(vcf)) throw new StingException("Duplicate VCFs found");
vcfRods.put(vcf, rod.getName());
}
}
if ( vcfRods.size() == 0 )
return 0;
// pull out all of the individual calls from the rods and insert into a map based on the
// mapping from rod/sample to uniquified name
HashMap<String, VCFGenotypeRecord> samplesToRecords = new HashMap<String, VCFGenotypeRecord>();
for ( VCFRecord rod : vcfRods.keySet() ) {
List<VCFGenotypeRecord> records = rod.getVCFGenotypeRecords();
for ( VCFGenotypeRecord vcfRec : records ) {
String uniquifiedSample = rodNamesToSampleNames.get(new Pair<String, String>(vcfRods.get(rod), vcfRec.getSampleName()));
if ( uniquifiedSample == null )
throw new StingException("Unexpected sample encountered: " + vcfRec.getSampleName() + " in rod " + vcfRods.get(rod));
samplesToRecords.put(uniquifiedSample, vcfRec);
}
}
// create a merged record from all input VCFs
VCFRecord record = VCFUtils.mergeRecords(vcfRods, rodNamesToSampleNames);
// add in the info fields to the new record based on the results of each of the relevant concordance tests
for ( ConcordanceType type : requestedTypes ) {
String result = type.computeConcordance(samplesToRecords, ref);
if ( result != null ) {
record.addInfoField(type.getInfoName(), result);
}
}
// emit the new record
vcfWriter.addRecord(record);
return 1;
}
public Integer reduceInit() { return 0; }
public Integer reduce(Integer value, Integer sum) {
return sum + value;
}
public void onTraversalDone(Integer result) {
vcfWriter.close();
out.printf("Processed %d loci.\n", result);
}
}

View File

@ -1,16 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.concordance;
import org.broad.tribble.vcf.VCFGenotypeRecord;
import org.broad.tribble.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import java.util.Map;
import java.util.Set;
public interface ConcordanceType {
public void initialize(Map<String,String> args, Set<String> samples);
public String computeConcordance(Map<String, VCFGenotypeRecord> samplesToRecords, ReferenceContext ref);
public String getInfoName();
public VCFInfoHeaderLine getInfoDescription();
}

Some files were not shown because too many files have changed in this diff.